diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -51,6 +51,7 @@ X86InstrFoldTables.cpp X86InstrInfo.cpp X86EvexToVex.cpp + X86MovapsToMovups.cpp X86LegalizerInfo.cpp X86LoadValueInjectionLoadHardening.cpp X86LoadValueInjectionRetHardening.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -114,6 +114,8 @@ /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); +FunctionPass *createX86MovapsToMovupsInsts(); + /// This pass creates the thunks for the retpoline feature. FunctionPass *createX86IndirectThunksPass(); @@ -143,6 +145,7 @@ FunctionPass *createX86SpeculativeExecutionSideEffectSuppression(); void initializeEvexToVexInstPassPass(PassRegistry &); +void initializeMovapsToMovupsInstPassPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); void initializeFPSPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86MovapsToMovups.cpp b/llvm/lib/Target/X86/X86MovapsToMovups.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86MovapsToMovups.cpp @@ -0,0 +1,180 @@ +//===- X86MovapsToMovups.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file defines the pass that replaces movaps with movups. movups +/// achieves the same performance as movaps when the address is aligned, +/// and it does not raise an exception when the address is not aligned. +/// So movups is preferred. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86InstComments.h" +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Pass.h" +#include <cassert> +#include <cstdint> + +using namespace llvm; + +#define MOVAPS2MOVUPS_DESC "Replace movaps instruction to movups" +#define DEBUG_TYPE "x86-movaps-to-movups" + +namespace { + +class MovapsToMovupsInstPass : public MachineFunctionPass { + + bool MovapsToMovupsImpl(MachineInstr &MI) const; + +public: + static char ID; + + MovapsToMovupsInstPass() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return MOVAPS2MOVUPS_DESC; } + + bool runOnMachineFunction(MachineFunction &MF) override; + + // This pass runs after regalloc and doesn't support VReg operands. + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + /// Machine instruction info used throughout the class.
+ const X86InstrInfo *TII = nullptr; +}; + +} // end anonymous namespace + +char MovapsToMovupsInstPass::ID = 0; + +bool MovapsToMovupsInstPass::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + TII = ST.getInstrInfo(); + if (!ST.hasAVX()) + return false; + + bool Changed = false; + + /// Go over all basic blocks in the function and replace + /// movaps with movups when possible. + for (MachineBasicBlock &MBB : MF) { + + // Traverse the basic block. + for (MachineInstr &MI : MBB) + Changed |= MovapsToMovupsImpl(MI); + } + + return Changed; +} + +bool MovapsToMovupsInstPass::MovapsToMovupsImpl(MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); + unsigned NewOpc; + + switch (Opc) { + default: + return false; + case X86::MOVAPSmr: + NewOpc = X86::MOVUPSmr; + break; + case X86::MOVAPSrm: + NewOpc = X86::MOVUPSrm; + break; + case X86::VMOVAPSYmr: + NewOpc = X86::VMOVUPSYmr; + break; + case X86::VMOVAPSYrm: + NewOpc = X86::VMOVUPSYrm; + break; + case X86::VMOVAPSZ128mr: + NewOpc = X86::VMOVUPSZ128mr; + break; + case X86::VMOVAPSZ128mr_NOVLX: + NewOpc = X86::VMOVUPSZ128mr_NOVLX; + break; + case X86::VMOVAPSZ128mrk: + NewOpc = X86::VMOVUPSZ128mrk; + break; + case X86::VMOVAPSZ128rm: + NewOpc = X86::VMOVUPSZ128rm; + break; + case X86::VMOVAPSZ128rm_NOVLX: + NewOpc = X86::VMOVUPSZ128rm_NOVLX; + break; + case X86::VMOVAPSZ128rmk: + NewOpc = X86::VMOVUPSZ128rmk; + break; + case X86::VMOVAPSZ128rmkz: + NewOpc = X86::VMOVUPSZ128rmkz; + break; + case X86::VMOVAPSZ256mr: + NewOpc = X86::VMOVUPSZ256mr; + break; + case X86::VMOVAPSZ256mr_NOVLX: + NewOpc = X86::VMOVUPSZ256mr_NOVLX; + break; + case X86::VMOVAPSZ256mrk: + NewOpc = X86::VMOVUPSZ256mrk; + break; + case X86::VMOVAPSZ256rm: + NewOpc = X86::VMOVUPSZ256rm; + break; + case X86::VMOVAPSZ256rm_NOVLX: + NewOpc = X86::VMOVUPSZ256rm_NOVLX; + break; + case X86::VMOVAPSZ256rmk: + NewOpc = X86::VMOVUPSZ256rmk; + break; + case X86::VMOVAPSZ256rmkz: + NewOpc = X86::VMOVUPSZ256rmkz; + break; + case X86::VMOVAPSZmr: + NewOpc = X86::VMOVUPSZmr; + break; + case X86::VMOVAPSZmrk: + NewOpc = X86::VMOVUPSZmrk; + break; + case X86::VMOVAPSZrm: + NewOpc = X86::VMOVUPSZrm; + break; + case X86::VMOVAPSZrmk: + NewOpc = X86::VMOVUPSZrmk; + break; + case X86::VMOVAPSZrmkz: + NewOpc = X86::VMOVUPSZrmkz; + break; + case X86::VMOVAPSmr: + NewOpc = X86::VMOVUPSmr; + break; + case X86::VMOVAPSrm: + NewOpc = X86::VMOVUPSrm; + break; + } + + MI.setDesc(TII->get(NewOpc)); + return true; +} + +INITIALIZE_PASS(MovapsToMovupsInstPass, DEBUG_TYPE, MOVAPS2MOVUPS_DESC, false, + false) + +FunctionPass *llvm::createX86MovapsToMovupsInsts() { + return new MovapsToMovupsInstPass(); +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -66,6 +66,7 @@ initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); + initializeMovapsToMovupsInstPassPass(PR); initializeFixupLEAPassPass(PR); initializeFPSPass(PR); initializeX86FixupSetCCPassPass(PR); @@ -524,6 +525,7 @@ addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); } + addPass(createX86MovapsToMovupsInsts()); addPass(createX86EvexToVexInsts()); addPass(createX86DiscriminateMemOpsPass()); addPass(createX86InsertPrefetchPass()); diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll --- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll +++ 
b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll @@ -4,15 +4,15 @@ define void @endless_loop() { ; CHECK-LABEL: endless_loop: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps (%eax), %xmm0 +; CHECK-NEXT: vmovups (%eax), %xmm0 ; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; CHECK-NEXT: vmovaps %ymm0, (%eax) -; CHECK-NEXT: vmovaps %ymm1, (%eax) +; CHECK-NEXT: vmovups %ymm0, (%eax) +; CHECK-NEXT: vmovups %ymm1, (%eax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll --- a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll +++ b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: vhaddps %ymm4, %ymm0, %ymm0 ; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vhaddps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; CHECK-NEXT: vmovups %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %tmp = load <4 x float>, <4 x float>* null, align 1 diff --git a/llvm/test/CodeGen/X86/2012-1-10-buildvector.ll b/llvm/test/CodeGen/X86/2012-1-10-buildvector.ll --- a/llvm/test/CodeGen/X86/2012-1-10-buildvector.ll +++ b/llvm/test/CodeGen/X86/2012-1-10-buildvector.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: bad_cast: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%eax) +; CHECK-NEXT: vmovups %xmm0, (%eax) ; CHECK-NEXT: movl $0, (%eax) ; CHECK-NEXT: retl %vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32> @@ -18,7 +18,7 @@ ; CHECK-LABEL: bad_insert: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovaps %ymm0, (%eax) +; CHECK-NEXT: vmovups %ymm0, (%eax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0 diff --git a/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll b/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll --- a/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -14,7 +14,7 @@ define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { ; SKX-LABEL: test_load_v4i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps (%rdi), %xmm0 +; SKX-NEXT: vmovups (%rdi), %xmm0 ; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 16 ret <4 x i32> %r @@ -32,7 +32,7 @@ define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) { ; SKX-LABEL: test_load_v8i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps (%rdi), %ymm0 +; SKX-NEXT: vmovups (%rdi), %ymm0 ; SKX-NEXT: retq %r = load <8 x i32>, <8 x i32>* %p1, align 32 ret <8 x i32> %r @@ -68,7 +68,7 @@ define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { ; SKX-LABEL: test_store_v4i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 16 ret void @@ -87,7 +87,7 @@ define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { ; SKX-LABEL: test_store_v8i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovups %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <8 x i32> %val, <8 x i32>* %p1, align 32 @@ -107,7 +107,7 @@ define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { ; SKX-LABEL: test_store_v16i32_align: ; SKX: # %bb.0: 
-; SKX-NEXT: vmovaps %zmm0, (%rdi) +; SKX-NEXT: vmovups %zmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <16 x i32> %val, <16 x i32>* %p1, align 64 diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -772,8 +772,8 @@ ; CHECK-LABEL: merge_vec_stores_of_constants: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 48(%rdi) -; CHECK-NEXT: vmovaps %xmm0, 64(%rdi) +; CHECK-NEXT: vmovups %xmm0, 48(%rdi) +; CHECK-NEXT: vmovups %xmm0, 64(%rdi) ; CHECK-NEXT: retq %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3 %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4 diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -58,6 +58,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: X86 Indirect Branch Tracking ; CHECK-NEXT: X86 vzeroupper inserter +; CHECK-NEXT: Replace movaps instruction to movups ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possibl ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll --- a/llvm/test/CodeGen/X86/SwizzleShuff.ll +++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll @@ -19,7 +19,7 @@ define <4 x i32> @multi_use_swizzle(<4 x i32>* %pA, <4 x i32>* %pB) { ; CHECK-LABEL: multi_use_swizzle: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2] ; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,2] ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,2] @@ -54,7 +54,7 @@ define <4 x i32> @reverse_1(<4 x i32>* %pA, <4 x i32>* %pB) { ; CHECK-LABEL: reverse_1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: retq %A = load <4 x i32>, <4 x i32>* %pA %B = load <4 x i32>, <4 x i32>* %pB diff --git a/llvm/test/CodeGen/X86/anyregcc.ll b/llvm/test/CodeGen/X86/anyregcc.ll --- a/llvm/test/CodeGen/X86/anyregcc.ll +++ b/llvm/test/CodeGen/X86/anyregcc.ll @@ -513,22 +513,22 @@ ;AVX: pushq %rdx ;AVX: pushq %rcx ;AVX: pushq %rbx -;AVX: vmovaps %ymm15 -;AVX-NEXT: vmovaps %ymm14 -;AVX-NEXT: vmovaps %ymm13 -;AVX-NEXT: vmovaps %ymm12 -;AVX-NEXT: vmovaps %ymm11 -;AVX-NEXT: vmovaps %ymm10 -;AVX-NEXT: vmovaps %ymm9 -;AVX-NEXT: vmovaps %ymm8 -;AVX-NEXT: vmovaps %ymm7 -;AVX-NEXT: vmovaps %ymm6 -;AVX-NEXT: vmovaps %ymm5 -;AVX-NEXT: vmovaps %ymm4 -;AVX-NEXT: vmovaps %ymm3 -;AVX-NEXT: vmovaps %ymm2 -;AVX-NEXT: vmovaps %ymm1 -;AVX-NEXT: vmovaps %ymm0 +;AVX: vmovups %ymm15 +;AVX-NEXT: vmovups %ymm14 +;AVX-NEXT: vmovups %ymm13 +;AVX-NEXT: vmovups %ymm12 +;AVX-NEXT: vmovups %ymm11 +;AVX-NEXT: vmovups %ymm10 +;AVX-NEXT: vmovups %ymm9 +;AVX-NEXT: vmovups %ymm8 +;AVX-NEXT: vmovups %ymm7 +;AVX-NEXT: vmovups %ymm6 +;AVX-NEXT: vmovups %ymm5 +;AVX-NEXT: vmovups %ymm4 +;AVX-NEXT: vmovups %ymm3 +;AVX-NEXT: vmovups %ymm2 +;AVX-NEXT: vmovups %ymm1 +;AVX-NEXT: vmovups %ymm0 call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"() ret void } diff 
--git a/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll b/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll --- a/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll @@ -9,31 +9,31 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: subq $368, %rsp # imm = 0x170 ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: andq $-128, %rsp ; CHECK-NEXT: movq 288(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm0 +; CHECK-NEXT: vmovups (%rax), %ymm0 ; CHECK-NEXT: movq 296(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm1 +; CHECK-NEXT: vmovups (%rax), %ymm1 ; CHECK-NEXT: movq 304(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm2 +; CHECK-NEXT: vmovups (%rax), %ymm2 ; CHECK-NEXT: movq 312(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm3 -; CHECK-NEXT: vmovaps (%rcx), %ymm4 -; CHECK-NEXT: vmovaps (%rdx), %ymm5 -; CHECK-NEXT: vmovaps (%r8), %ymm6 -; CHECK-NEXT: vmovaps (%r9), %ymm7 -; CHECK-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm0, (%rsp) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; CHECK-NEXT: vmovups (%rax), %ymm3 +; CHECK-NEXT: vmovups (%rcx), %ymm4 +; CHECK-NEXT: vmovups (%rdx), %ymm5 +; CHECK-NEXT: vmovups (%r8), %ymm6 +; CHECK-NEXT: vmovups (%r9), %ymm7 +; CHECK-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: leaq 240(%rbp), %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -131,7 +131,7 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: subl $44, %esp ; X86-AVX-NEXT: .cfi_def_cfa_offset 48 -; X86-AVX-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) @@ -181,7 +181,7 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: subq $24, %rsp ; X64-AVX-NEXT: .cfi_def_cfa_offset 32 -; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) +; X64-AVX-NEXT: vmovups %xmm0, (%rsp) ; X64-AVX-NEXT: movq (%rsp), %rsi ; X64-AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-AVX-NEXT: callq __sync_lock_test_and_set_16 @@ -412,7 +412,7 @@ ; X86-AVX-NEXT: calll __sync_val_compare_and_swap_16 ; X86-AVX-NEXT: subl $4, %esp ; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX-NEXT: vmovaps %xmm0, (%esi) +; X86-AVX-NEXT: 
vmovups %xmm0, (%esi) ; X86-AVX-NEXT: movl %esi, %eax ; X86-AVX-NEXT: addl $56, %esp ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 @@ -502,7 +502,7 @@ ; X64-AVX-NEXT: callq __sync_val_compare_and_swap_16 ; X64-AVX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rax, (%rsp) -; X64-AVX-NEXT: vmovaps (%rsp), %xmm0 +; X64-AVX-NEXT: vmovups (%rsp), %xmm0 ; X64-AVX-NEXT: addq $24, %rsp ; X64-AVX-NEXT: .cfi_def_cfa_offset 8 ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll --- a/llvm/test/CodeGen/X86/avx-basic.ll +++ b/llvm/test/CodeGen/X86/avx-basic.ll @@ -10,7 +10,7 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax -; CHECK-NEXT: vmovaps %xmm0, (%rax) +; CHECK-NEXT: vmovups %xmm0, (%rax) ; CHECK-NEXT: retq store <4 x float> zeroinitializer, <4 x float>* @z, align 16 ret void @@ -21,9 +21,9 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: movq _x@{{.*}}(%rip), %rax -; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vmovups %ymm0, (%rax) ; CHECK-NEXT: movq _y@{{.*}}(%rip), %rax -; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vmovups %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* @x, align 32 @@ -36,7 +36,7 @@ ; CHECK: ## %bb.0: ## %allocas ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; CHECK-NEXT: vmovups %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq allocas: @@ -53,7 +53,7 @@ ; CHECK: ## %bb.0: ## %allocas ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; CHECK-NEXT: vmovups %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq allocas: @@ -92,7 +92,7 @@ define <16 x float> @fneg(<16 x float> %a) nounwind { ; CHECK-LABEL: fneg: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-intel-ocl.ll b/llvm/test/CodeGen/X86/avx-intel-ocl.ll --- a/llvm/test/CodeGen/X86/avx-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/avx-intel-ocl.ll @@ -67,27 +67,27 @@ ; test calling conventions - prolog and epilog ; WIN64-LABEL: test_prolog_epilog -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill -; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte 
Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill +; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}} # 32-byte Spill ; WIN64: call -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload -; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload ; X64-LABEL: test_prolog_epilog ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -807,12 +807,12 @@ ; X86-LABEL: test_mm256_load_pd: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps (%eax), %ymm0 +; X86-NEXT: vmovups (%eax), %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_load_pd: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %ymm0 +; X64-NEXT: vmovups (%rdi), %ymm0 ; X64-NEXT: retq %arg0 = bitcast double* %a0 to <4 x double>* %res = load <4 x double>, <4 x double>* %arg0, align 32 @@ -823,12 +823,12 @@ ; X86-LABEL: test_mm256_load_ps: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps (%eax), %ymm0 +; X86-NEXT: vmovups (%eax), %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_load_ps: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %ymm0 +; X64-NEXT: vmovups (%rdi), %ymm0 ; X64-NEXT: retq %arg0 = bitcast float* %a0 to <8 x float>* %res = load <8 x float>, <8 x float>* %arg0, align 32 @@ -839,12 +839,12 @@ ; X86-LABEL: test_mm256_load_si256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps (%eax), %ymm0 +; X86-NEXT: vmovups (%eax), %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_load_si256: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %ymm0 +; X64-NEXT: vmovups (%rdi), %ymm0 ; X64-NEXT: retq %res = load <4 x i64>, <4 x i64>* %a0, align 32 ret <4 x i64> %res @@ 
-2522,13 +2522,13 @@ ; X86-LABEL: test_mm256_store_pd: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps %ymm0, (%eax) +; X86-NEXT: vmovups %ymm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_store_pd: ; X64: # %bb.0: -; X64-NEXT: vmovaps %ymm0, (%rdi) +; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %arg0 = bitcast double* %a0 to <4 x double>* @@ -2540,13 +2540,13 @@ ; X86-LABEL: test_mm256_store_ps: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps %ymm0, (%eax) +; X86-NEXT: vmovups %ymm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_store_ps: ; X64: # %bb.0: -; X64-NEXT: vmovaps %ymm0, (%rdi) +; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %arg0 = bitcast float* %a0 to <8 x float>* @@ -2558,13 +2558,13 @@ ; X86-LABEL: test_mm256_store_si256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps %ymm0, (%eax) +; X86-NEXT: vmovups %ymm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_store_si256: ; X64: # %bb.0: -; X64-NEXT: vmovaps %ymm0, (%rdi) +; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq store <4 x i64> %a1, <4 x i64>* %a0, align 32 diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll --- a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -12,19 +12,19 @@ ; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movq %rsi, %r15 ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovups (%rsi), %ymm1 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps (%rdx), %ymm2 +; CHECK-NEXT: vmovups (%rdx), %ymm2 ; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; CHECK-NEXT: callq dummy ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %ymm0, (%rbx) +; CHECK-NEXT: vmovups %ymm0, (%rbx) ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %ymm0, (%r15) +; CHECK-NEXT: vmovups %ymm0, (%r15) ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %ymm0, (%r14) +; CHECK-NEXT: vmovups %ymm0, (%r14) ; CHECK-NEXT: addq $96, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 @@ -36,7 +36,7 @@ ; CHECK_O0: # %bb.0: # %entry ; CHECK_O0-NEXT: subq $152, %rsp ; CHECK_O0-NEXT: vmovapd (%rdi), %ymm0 -; CHECK_O0-NEXT: vmovaps (%rsi), %ymm1 +; CHECK_O0-NEXT: vmovups (%rsi), %ymm1 ; CHECK_O0-NEXT: vmovdqa (%rdx), %ymm2 ; CHECK_O0-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK_O0-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -50,7 +50,7 @@ ; CHECK_O0-NEXT: vmovapd %ymm0, (%rax) ; CHECK_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK_O0-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; CHECK_O0-NEXT: vmovaps %ymm1, (%rcx) +; CHECK_O0-NEXT: vmovups %ymm1, (%rcx) ; CHECK_O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; CHECK_O0-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; CHECK_O0-NEXT: vmovdqa %ymm2, (%rdx) @@ -111,7 +111,7 @@ define void @storev16i16(<16 x i16> %a) nounwind { ; CHECK-LABEL: storev16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vmovups 
%ymm0, (%rax) ; ; CHECK_O0-LABEL: storev16i16: ; CHECK_O0: # %bb.0: @@ -138,7 +138,7 @@ define void @storev32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: storev32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vmovups %ymm0, (%rax) ; ; CHECK_O0-LABEL: storev32i8: ; CHECK_O0: # %bb.0: @@ -167,8 +167,8 @@ define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp { ; CHECK-LABEL: double_save: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) -; CHECK-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm1, 16(%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: double_save: @@ -220,7 +220,7 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB9_4 ; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [4294967295,0,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) ; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check ; @@ -289,14 +289,14 @@ define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { ; CHECK-LABEL: add4i64a64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rsi), %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; CHECK-NEXT: vmovups (%rsi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add4i64a64: ; CHECK_O0: # %bb.0: -; CHECK_O0-NEXT: vmovaps (%rsi), %ymm0 +; CHECK_O0-NEXT: vmovups (%rsi), %ymm0 ; CHECK_O0-NEXT: vmovdqa %ymm0, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq @@ -309,10 +309,10 @@ define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { ; CHECK-LABEL: add4i64a16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rsi), %xmm0 -; CHECK-NEXT: vmovaps 16(%rsi), %xmm1 -; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) -; CHECK-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-NEXT: vmovups (%rsi), %xmm0 +; CHECK-NEXT: vmovups 16(%rsi), %xmm1 +; CHECK-NEXT: vmovups %xmm1, 16(%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add4i64a16: diff --git a/llvm/test/CodeGen/X86/avx-unpack.ll b/llvm/test/CodeGen/X86/avx-unpack.ll --- a/llvm/test/CodeGen/X86/avx-unpack.ll +++ b/llvm/test/CodeGen/X86/avx-unpack.ll @@ -97,7 +97,7 @@ define <8 x i32> @unpackhips2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp { ; CHECK-LABEL: unpackhips2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; CHECK-NEXT: retq %a = load <8 x i32>, <8 x i32>* %src1 @@ -118,7 +118,7 @@ define <4 x i64> @unpackhipd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp { ; CHECK-LABEL: unpackhipd2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; CHECK-NEXT: retq %a = load <4 x i64>, <4 x i64>* %src1 @@ -139,7 +139,7 @@ define <8 x i32> @unpacklops2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp { ; CHECK-LABEL: unpacklops2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; CHECK-NEXT: retq %a = load <8 x i32>, <8 x i32>* %src1 @@ -160,7 +160,7 @@ define <4 x i64> @unpacklopd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp { ; CHECK-LABEL: unpacklopd2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), 
%ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; CHECK-NEXT: retq %a = load <4 x i64>, <4 x i64>* %src1 diff --git a/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll b/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll --- a/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll +++ b/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll @@ -6,7 +6,7 @@ declare i32 @f(i32, ...) ; CHECK-LABEL: test1: -; CHECK: vmovaps %ymm0, (%rsp) +; CHECK: vmovups %ymm0, (%rsp) define void @test1() nounwind uwtable ssp { entry: %0 = load <8 x float>, <8 x float>* @x, align 32 diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -300,12 +300,12 @@ define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp { ; X32-LABEL: _e2: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X32-NEXT: retl ; ; X64-LABEL: _e2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X64-NEXT: retq entry: %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 diff --git a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll @@ -128,16 +128,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 +; X32-NEXT: vmovups (%ecx), %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovups %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2f64_4f64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 +; X64-NEXT: vmovups (%rdi), %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vmovups %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> @@ -150,16 +150,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 +; X32-NEXT: vmovups (%ecx), %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovups %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2i64_4i64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 +; X64-NEXT: vmovups (%rdi), %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vmovups %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> @@ -172,16 +172,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 +; X32-NEXT: vmovups (%ecx), %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovups %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4f32_8f32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 +; X64-NEXT: vmovups (%rdi), %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; 
X64-NEXT: vmovups %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> @@ -194,16 +194,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 +; X32-NEXT: vmovups (%ecx), %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovups %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4i32_8i32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 +; X64-NEXT: vmovups (%rdi), %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vmovups %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> @@ -216,16 +216,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 +; X32-NEXT: vmovups (%ecx), %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovups %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_8i16_16i16_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 +; X64-NEXT: vmovups (%rdi), %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vmovups %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> @@ -238,16 +238,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 +; X32-NEXT: vmovups (%ecx), %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovups %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_16i8_32i8_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 +; X64-NEXT: vmovups (%rdi), %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vmovups %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> @@ -262,14 +262,14 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-NEXT: vmovaps %ymm1, (%eax) +; X32-NEXT: vmovups %ymm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: vmovaps %ymm1, (%rsi) +; X64-NEXT: vmovups %ymm1, (%rsi) ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx-vextractf128.ll b/llvm/test/CodeGen/X86/avx-vextractf128.ll --- a/llvm/test/CodeGen/X86/avx-vextractf128.ll +++ b/llvm/test/CodeGen/X86/avx-vextractf128.ll @@ -64,7 +64,7 @@ define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { ; CHECK-LABEL: t5: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -77,7 +77,7 @@ define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { ; CHECK-LABEL: t6: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -90,7 +90,7 @@ define void @t7(<2 x i64>* 
nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { ; CHECK-LABEL: t7: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx-vzeroupper.ll b/llvm/test/CodeGen/X86/avx-vzeroupper.ll --- a/llvm/test/CodeGen/X86/avx-vzeroupper.ll +++ b/llvm/test/CodeGen/X86/avx-vzeroupper.ll @@ -34,12 +34,12 @@ ; VZ: # %bb.0: ; VZ-NEXT: subq $56, %rsp ; VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; VZ-NEXT: vmovups {{.*}}(%rip), %xmm0 ; VZ-NEXT: vzeroupper ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: vmovups %xmm0, {{.*}}(%rip) ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: vmovups %xmm0, {{.*}}(%rip) ; VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; VZ-NEXT: addq $56, %rsp ; VZ-NEXT: retq @@ -48,11 +48,11 @@ ; DISABLE-VZ: # %bb.0: ; DISABLE-VZ-NEXT: subq $56, %rsp ; DISABLE-VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; DISABLE-VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; DISABLE-VZ-NEXT: vmovups {{.*}}(%rip), %xmm0 ; DISABLE-VZ-NEXT: callq do_sse -; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; DISABLE-VZ-NEXT: vmovups %xmm0, {{.*}}(%rip) ; DISABLE-VZ-NEXT: callq do_sse -; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; DISABLE-VZ-NEXT: vmovups %xmm0, {{.*}}(%rip) ; DISABLE-VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; DISABLE-VZ-NEXT: addq $56, %rsp ; DISABLE-VZ-NEXT: retq @@ -60,13 +60,13 @@ ; BDVER2-LABEL: test01: ; BDVER2: # %bb.0: ; BDVER2-NEXT: subq $56, %rsp -; BDVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; BDVER2-NEXT: vmovups {{.*}}(%rip), %xmm0 ; BDVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; BDVER2-NEXT: vzeroupper ; BDVER2-NEXT: callq do_sse -; BDVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BDVER2-NEXT: vmovups %xmm0, {{.*}}(%rip) ; BDVER2-NEXT: callq do_sse -; BDVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BDVER2-NEXT: vmovups %xmm0, {{.*}}(%rip) ; BDVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; BDVER2-NEXT: addq $56, %rsp ; BDVER2-NEXT: retq @@ -74,12 +74,12 @@ ; BTVER2-LABEL: test01: ; BTVER2: # %bb.0: ; BTVER2-NEXT: subq $56, %rsp -; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; BTVER2-NEXT: vmovups {{.*}}(%rip), %xmm0 ; BTVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: vmovups %xmm0, {{.*}}(%rip) ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: vmovups %xmm0, {{.*}}(%rip) ; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; BTVER2-NEXT: addq $56, %rsp ; BTVER2-NEXT: retq @@ -130,7 +130,7 @@ ; VZ-NEXT: pushq %rbx ; VZ-NEXT: subq $16, %rsp ; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; VZ-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; VZ-NEXT: .p2align 4, 0x90 ; VZ-NEXT: .LBB3_1: # %while.cond ; VZ-NEXT: # =>This Inner Loop Header: Depth=1 @@ -139,13 +139,13 @@ ; VZ-NEXT: jne .LBB3_1 ; VZ-NEXT: # %bb.2: # %for.body.preheader ; VZ-NEXT: movl $4, %ebx -; VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; VZ-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; VZ-NEXT: .p2align 4, 0x90 ; VZ-NEXT: .LBB3_3: # %for.body ; VZ-NEXT: # =>This Inner Loop Header: Depth=1 ; VZ-NEXT: callq do_sse ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; VZ-NEXT: vmovups g+{{.*}}(%rip), %xmm0 ; VZ-NEXT: callq do_sse ; VZ-NEXT: decl %ebx ; VZ-NEXT: jne .LBB3_3 @@ -159,7 +159,7 @@ ; 
DISABLE-VZ-NEXT: pushq %rbx ; DISABLE-VZ-NEXT: subq $16, %rsp ; DISABLE-VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; DISABLE-VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; DISABLE-VZ-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; DISABLE-VZ-NEXT: .p2align 4, 0x90 ; DISABLE-VZ-NEXT: .LBB3_1: # %while.cond ; DISABLE-VZ-NEXT: # =>This Inner Loop Header: Depth=1 @@ -168,13 +168,13 @@ ; DISABLE-VZ-NEXT: jne .LBB3_1 ; DISABLE-VZ-NEXT: # %bb.2: # %for.body.preheader ; DISABLE-VZ-NEXT: movl $4, %ebx -; DISABLE-VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; DISABLE-VZ-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; DISABLE-VZ-NEXT: .p2align 4, 0x90 ; DISABLE-VZ-NEXT: .LBB3_3: # %for.body ; DISABLE-VZ-NEXT: # =>This Inner Loop Header: Depth=1 ; DISABLE-VZ-NEXT: callq do_sse ; DISABLE-VZ-NEXT: callq do_sse -; DISABLE-VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; DISABLE-VZ-NEXT: vmovups g+{{.*}}(%rip), %xmm0 ; DISABLE-VZ-NEXT: callq do_sse ; DISABLE-VZ-NEXT: decl %ebx ; DISABLE-VZ-NEXT: jne .LBB3_3 @@ -188,7 +188,7 @@ ; BDVER2-NEXT: pushq %rbx ; BDVER2-NEXT: subq $16, %rsp ; BDVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BDVER2-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; BDVER2-NEXT: .p2align 4, 0x90 ; BDVER2-NEXT: .LBB3_1: # %while.cond ; BDVER2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -196,14 +196,14 @@ ; BDVER2-NEXT: testl %eax, %eax ; BDVER2-NEXT: jne .LBB3_1 ; BDVER2-NEXT: # %bb.2: # %for.body.preheader -; BDVER2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; BDVER2-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; BDVER2-NEXT: movl $4, %ebx ; BDVER2-NEXT: .p2align 4, 0x90 ; BDVER2-NEXT: .LBB3_3: # %for.body ; BDVER2-NEXT: # =>This Inner Loop Header: Depth=1 ; BDVER2-NEXT: callq do_sse ; BDVER2-NEXT: callq do_sse -; BDVER2-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; BDVER2-NEXT: vmovups g+{{.*}}(%rip), %xmm0 ; BDVER2-NEXT: callq do_sse ; BDVER2-NEXT: decl %ebx ; BDVER2-NEXT: jne .LBB3_3 @@ -217,7 +217,7 @@ ; BTVER2-NEXT: pushq %rbx ; BTVER2-NEXT: subq $16, %rsp ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BTVER2-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; BTVER2-NEXT: .p2align 4, 0x90 ; BTVER2-NEXT: .LBB3_1: # %while.cond ; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -225,14 +225,14 @@ ; BTVER2-NEXT: testl %eax, %eax ; BTVER2-NEXT: jne .LBB3_1 ; BTVER2-NEXT: # %bb.2: # %for.body.preheader -; BTVER2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; BTVER2-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; BTVER2-NEXT: movl $4, %ebx ; BTVER2-NEXT: .p2align 4, 0x90 ; BTVER2-NEXT: .LBB3_3: # %for.body ; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; BTVER2-NEXT: vmovups g+{{.*}}(%rip), %xmm0 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: decl %ebx ; BTVER2-NEXT: jne .LBB3_3 diff --git a/llvm/test/CodeGen/X86/avx-win64.ll b/llvm/test/CodeGen/X86/avx-win64.ll --- a/llvm/test/CodeGen/X86/avx-win64.ll +++ b/llvm/test/CodeGen/X86/avx-win64.ll @@ -10,12 +10,12 @@ ; CHECK: f___vyf ; CHECK: pushq %rbp ; CHECK: vmovmsk -; CHECK: vmovaps %ymm{{.*}}(%r -; CHECK: vmovaps %ymm{{.*}}(%r +; CHECK: vmovups %ymm{{.*}}(%r +; CHECK: vmovups %ymm{{.*}}(%r ; CHECK: call ; Two reloads. It's OK if these get folded. 
-; CHECK: vmovaps {{.*\(%r.*}}, %ymm -; CHECK: vmovaps {{.*\(%r.*}}, %ymm +; CHECK: vmovups {{.*\(%r.*}}, %ymm +; CHECK: vmovups {{.*\(%r.*}}, %ymm ; CHECK: blend define <8 x float> @f___vyf(<8 x float> %x, <8 x i32> %__mask) nounwind readnone { allocas: diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll --- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll @@ -35,14 +35,14 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 +; X86-NEXT: vmovups (%ecx), %xmm0 ; X86-NEXT: vorps LCPI1_0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test2: ; X64: ## %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: vmovups (%rdi), %xmm0 ; X64-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq @@ -62,14 +62,14 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 +; X86-NEXT: vmovups (%ecx), %xmm0 ; X86-NEXT: vxorps LCPI2_0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test3: ; X64: ## %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: vmovups (%rdi), %xmm0 ; X64-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq @@ -88,14 +88,14 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 +; X86-NEXT: vmovups (%ecx), %xmm0 ; X86-NEXT: vandnps LCPI3_0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test4: ; X64: ## %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: vmovups (%rdi), %xmm0 ; X64-NEXT: vandnps {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -14,7 +14,7 @@ ; ; X32-FAST-LABEL: trunc4: ; X32-FAST: # %bb.0: -; X32-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X32-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; X32-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X32-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X32-FAST-NEXT: vzeroupper @@ -29,7 +29,7 @@ ; ; X64-FAST-LABEL: trunc4: ; X64-FAST: # %bb.0: -; X64-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X64-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; X64-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X64-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X64-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -23,29 +23,29 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() { ; X86-AVX-LABEL: test_x86_avx2_packssdw_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; 
; X86-AVX512VL-LABEL: test_x86_avx2_packssdw_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovups {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packssdw_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packssdw_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovups {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> ) @@ -72,29 +72,29 @@ define <32 x i8> @test_x86_avx2_packsswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovups {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovups {{.*#+}} ymm0 = 
[0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovups {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> zeroinitializer) @@ -121,29 +121,29 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovups {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovups {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> 
@llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> zeroinitializer) @@ -815,29 +815,29 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() { ; X86-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovups {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovups {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x10,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll --- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll +++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll @@ -120,7 +120,7 @@ ; ; X64-LABEL: masked_gather_v2float: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %xmm2 +; X64-NEXT: vmovups (%rdi), %xmm2 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1 @@ -170,7 +170,7 @@ ; ; X64-LABEL: masked_gather_v2float_concat: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %xmm2 +; X64-NEXT: vmovups (%rdi), %xmm2 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: vpslld $31, %xmm0, %xmm0 ; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1 @@ -452,7 +452,7 @@ ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X86-NEXT: vpslld $31, %ymm0, %ymm0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps (%eax), %ymm2 +; X86-NEXT: vmovups (%eax), %ymm2 ; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1 ; X86-NEXT: vmovaps %ymm1, %ymm0 ; X86-NEXT: retl @@ 
-461,8 +461,8 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: vpslld $31, %ymm0, %ymm0 -; X64-NEXT: vmovaps (%rdi), %ymm2 -; X64-NEXT: vmovaps 32(%rdi), %ymm3 +; X64-NEXT: vmovups (%rdi), %ymm2 +; X64-NEXT: vmovups 32(%rdi), %ymm3 ; X64-NEXT: vextractf128 $1, %ymm1, %xmm4 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm5 ; X64-NEXT: vgatherqps %xmm5, (,%ymm3), %xmm4 diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -657,12 +657,12 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp { ; X32-LABEL: _e4: ; X32: ## %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X32-NEXT: vmovups {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: _e4: ; X64: ## %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X64-NEXT: vmovups {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> ; X64-NEXT: retq %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1 @@ -1094,9 +1094,9 @@ ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vmovups %xmm0, (%esp) ; X32-NEXT: vpbroadcastb (%eax), %xmm1 -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl @@ -1104,9 +1104,9 @@ ; X64-LABEL: isel_crash_16b: ; X64: ## %bb.0: ## %eintry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vpbroadcastb (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq eintry: @@ -1136,9 +1136,9 @@ ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) +; X32-NEXT: vmovups %ymm0, (%esp) ; X32-NEXT: vpbroadcastb (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -1155,9 +1155,9 @@ ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vmovups %ymm0, (%rsp) ; X64-NEXT: vpbroadcastb (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp @@ -1185,9 +1185,9 @@ ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vmovups %xmm0, (%esp) ; X32-NEXT: vpbroadcastw (%eax), %xmm1 -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl @@ -1195,9 +1195,9 @@ ; X64-LABEL: isel_crash_8w: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) 
+; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vpbroadcastw (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq entry: @@ -1227,9 +1227,9 @@ ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) +; X32-NEXT: vmovups %ymm0, (%esp) ; X32-NEXT: vpbroadcastw (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -1246,9 +1246,9 @@ ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vmovups %ymm0, (%rsp) ; X64-NEXT: vpbroadcastw (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp @@ -1276,20 +1276,20 @@ ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vmovups %xmm0, (%esp) ; X32-NEXT: vbroadcastss (%eax), %xmm1 -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; ; X64-LABEL: isel_crash_4d: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vbroadcastss (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 @@ -1318,10 +1318,10 @@ ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) +; X32-NEXT: vmovups %ymm0, (%esp) ; X32-NEXT: vbroadcastss (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: vzeroupper @@ -1337,10 +1337,10 @@ ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vmovups %ymm0, (%rsp) ; X64-NEXT: vbroadcastss (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp ; X64-NEXT: vzeroupper @@ -1367,20 +1367,20 @@ ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vmovups %xmm0, (%esp) ; X32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; ; X64-LABEL: isel_crash_2q: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 
-; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 @@ -1408,10 +1408,10 @@ ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) +; X32-NEXT: vmovups %ymm0, (%esp) ; X32-NEXT: vbroadcastsd (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: vzeroupper @@ -1427,10 +1427,10 @@ ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vmovups %ymm0, (%rsp) ; X64-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp ; X64-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll --- a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll @@ -171,18 +171,18 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm1 +; X32-NEXT: vmovups (%ecx), %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 ; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm0, %ymm0 -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovups %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4f32_8f32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm1 +; X64-NEXT: vmovups (%rdi), %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0 ; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovaps %xmm1, (%rsi) +; X64-NEXT: vmovups %xmm1, (%rsi) ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> @@ -273,14 +273,14 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-NEXT: vmovaps %ymm1, (%eax) +; X32-NEXT: vmovups %ymm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: vmovaps %ymm1, (%rsi) +; X64-NEXT: vmovups %ymm1, (%rsi) ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx2-vperm.ll b/llvm/test/CodeGen/X86/avx2-vperm.ll --- a/llvm/test/CodeGen/X86/avx2-vperm.ll +++ b/llvm/test/CodeGen/X86/avx2-vperm.ll @@ -5,13 +5,13 @@ define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone { ; X32-LABEL: perm_cl_int_8x32: ; X32: # %bb.0: # %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] ; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: perm_cl_int_8x32: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] ; 
X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq entry: @@ -23,13 +23,13 @@ define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone { ; X32-LABEL: perm_cl_fp_8x32: ; X32: # %bb.0: # %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = +; X32-NEXT: vmovups {{.*#+}} ymm1 = ; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: perm_cl_fp_8x32: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = +; X64-NEXT: vmovups {{.*#+}} ymm1 = ; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -584,7 +584,7 @@ ; CHECK-LABEL: bcast_unfold_or_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [3,3] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB17_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -786,7 +786,7 @@ ; CHECK-LABEL: bcast_unfold_fneg_v2f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB23_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1003,7 +1003,7 @@ ; CHECK-LABEL: bcast_unfold_fabs_v2f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [NaN,NaN] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB29_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll --- a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll @@ -12,12 +12,12 @@ ; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vmovups %zmm1, (%rdi) ; CHECK-NEXT: callq _Print__512 ; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload ; CHECK-NEXT: callq _Print__512 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; CHECK-NEXT: vmovaps %zmm0, (%rbx) +; CHECK-NEXT: vmovups %zmm0, (%rbx) ; CHECK-NEXT: addq $112, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -15,7 +15,7 @@ ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -557,9 +557,9 @@ ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: .cfi_offset %rbx, -16 ; KNL-NEXT: movq %rdi, %rbx -; KNL-NEXT: vmovaps (%rdi), 
%zmm0 +; KNL-NEXT: vmovups (%rdi), %zmm0 ; KNL-NEXT: callq _test14_callee -; KNL-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEXT: vmovups %zmm0, (%rbx) ; KNL-NEXT: popq %rbx ; KNL-NEXT: retq ; @@ -569,9 +569,9 @@ ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: .cfi_offset %rbx, -16 ; SKX-NEXT: movq %rdi, %rbx -; SKX-NEXT: vmovaps (%rdi), %zmm0 +; SKX-NEXT: vmovups (%rdi), %zmm0 ; SKX-NEXT: callq _test14_callee -; SKX-NEXT: vmovaps %zmm0, (%rbx) +; SKX-NEXT: vmovups %zmm0, (%rbx) ; SKX-NEXT: popq %rbx ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -584,9 +584,9 @@ ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: .cfi_offset %esi, -8 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; KNL_X32-NEXT: vmovaps (%esi), %zmm0 +; KNL_X32-NEXT: vmovups (%esi), %zmm0 ; KNL_X32-NEXT: calll _test14_callee -; KNL_X32-NEXT: vmovaps %zmm0, (%esi) +; KNL_X32-NEXT: vmovups %zmm0, (%esi) ; KNL_X32-NEXT: addl $8, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: retl @@ -597,9 +597,9 @@ ; FASTISEL-NEXT: .cfi_def_cfa_offset 16 ; FASTISEL-NEXT: .cfi_offset %rbx, -16 ; FASTISEL-NEXT: movq %rdi, %rbx -; FASTISEL-NEXT: vmovaps (%rdi), %zmm0 +; FASTISEL-NEXT: vmovups (%rdi), %zmm0 ; FASTISEL-NEXT: callq _test14_callee -; FASTISEL-NEXT: vmovaps %zmm0, (%rbx) +; FASTISEL-NEXT: vmovups %zmm0, (%rbx) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: vzeroupper ; FASTISEL-NEXT: retq @@ -617,9 +617,9 @@ ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: .cfi_offset %rbx, -16 ; KNL-NEXT: movq %rdi, %rbx -; KNL-NEXT: vmovaps (%rdi), %zmm0 +; KNL-NEXT: vmovups (%rdi), %zmm0 ; KNL-NEXT: callq _test15_callee -; KNL-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEXT: vmovups %zmm0, (%rbx) ; KNL-NEXT: popq %rbx ; KNL-NEXT: retq ; @@ -629,9 +629,9 @@ ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: .cfi_offset %rbx, -16 ; SKX-NEXT: movq %rdi, %rbx -; SKX-NEXT: vmovaps (%rdi), %zmm0 +; SKX-NEXT: vmovups (%rdi), %zmm0 ; SKX-NEXT: callq _test15_callee -; SKX-NEXT: vmovaps %zmm0, (%rbx) +; SKX-NEXT: vmovups %zmm0, (%rbx) ; SKX-NEXT: popq %rbx ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -644,9 +644,9 @@ ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: .cfi_offset %esi, -8 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; KNL_X32-NEXT: vmovaps (%esi), %zmm0 +; KNL_X32-NEXT: vmovups (%esi), %zmm0 ; KNL_X32-NEXT: calll _test15_callee -; KNL_X32-NEXT: vmovaps %zmm0, (%esi) +; KNL_X32-NEXT: vmovups %zmm0, (%esi) ; KNL_X32-NEXT: addl $8, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: retl @@ -657,9 +657,9 @@ ; FASTISEL-NEXT: .cfi_def_cfa_offset 16 ; FASTISEL-NEXT: .cfi_offset %rbx, -16 ; FASTISEL-NEXT: movq %rdi, %rbx -; FASTISEL-NEXT: vmovaps (%rdi), %zmm0 +; FASTISEL-NEXT: vmovups (%rdi), %zmm0 ; FASTISEL-NEXT: callq _test15_callee -; FASTISEL-NEXT: vmovaps %zmm0, (%rbx) +; FASTISEL-NEXT: vmovups %zmm0, (%rbx) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: vzeroupper ; FASTISEL-NEXT: retq @@ -3558,8 +3558,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovups %xmm8, (%rsp) ; KNL-NEXT: callq _v2i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3568,8 +3568,8 @@ ; SKX: ## %bb.0: ; SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovups %xmm8, (%rsp) ; SKX-NEXT: callq _v2i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3584,16 
+3584,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovups 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovups 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovups 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovups 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovups %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovups %zmm4, (%esp) ; KNL_X32-NEXT: calll _v2i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3621,8 +3621,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovups %xmm8, (%rsp) ; KNL-NEXT: callq _v4i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3631,8 +3631,8 @@ ; SKX: ## %bb.0: ; SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovups %xmm8, (%rsp) ; SKX-NEXT: callq _v4i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3647,16 +3647,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovups 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovups 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovups 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovups 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovups %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovups %zmm4, (%esp) ; KNL_X32-NEXT: calll _v4i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3684,8 +3684,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovups %xmm8, (%rsp) ; KNL-NEXT: callq _v8i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3694,8 +3694,8 @@ ; SKX: ## %bb.0: ; SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovups %xmm8, (%rsp) ; SKX-NEXT: callq 
_v8i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3710,16 +3710,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovups 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovups 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovups 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovups 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovups %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovups %zmm4, (%esp) ; KNL_X32-NEXT: calll _v8i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3747,8 +3747,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovups %xmm8, (%rsp) ; KNL-NEXT: callq _v16i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3757,8 +3757,8 @@ ; SKX: ## %bb.0: ; SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovups %xmm8, (%rsp) ; SKX-NEXT: callq _v16i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3773,16 +3773,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovups 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovups 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovups 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovups 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovups %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovups %zmm4, (%esp) ; KNL_X32-NEXT: calll _v16i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3815,8 +3815,8 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: vmovaps 16(%rbp), %ymm8 -; KNL-NEXT: vmovaps %ymm8, (%rsp) +; KNL-NEXT: vmovups 16(%rbp), %ymm8 +; KNL-NEXT: vmovups %ymm8, (%rsp) ; KNL-NEXT: callq _v32i1_mem_callee ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp @@ -3831,8 +3831,8 @@ ; SKX-NEXT: .cfi_def_cfa_register %rbp ; SKX-NEXT: andq $-32, %rsp ; SKX-NEXT: subq $64, %rsp -; SKX-NEXT: vmovaps 16(%rbp), %ymm8 -; SKX-NEXT: vmovaps %ymm8, (%rsp) +; 
SKX-NEXT: vmovups 16(%rbp), %ymm8 +; SKX-NEXT: vmovups %ymm8, (%rsp) ; SKX-NEXT: callq _v32i1_mem_callee ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -3848,16 +3848,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %ymm4 -; KNL_X32-NEXT: vmovaps %ymm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovups 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovups 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovups 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovups 264(%ebp), %ymm4 +; KNL_X32-NEXT: vmovups %ymm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovups %zmm4, (%esp) ; KNL_X32-NEXT: calll _v32i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -4020,8 +4020,8 @@ ; SKX-NEXT: .cfi_def_cfa_register %rbp ; SKX-NEXT: andq $-64, %rsp ; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: vmovaps 16(%rbp), %zmm8 -; SKX-NEXT: vmovaps %zmm8, (%rsp) +; SKX-NEXT: vmovups 16(%rbp), %zmm8 +; SKX-NEXT: vmovups %zmm8, (%rsp) ; SKX-NEXT: callq _v64i1_mem_callee ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -4037,10 +4037,10 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $576, %esp ## imm = 0x240 -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovups 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovups 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovups 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovups 200(%ebp), %zmm7 ; KNL_X32-NEXT: movl 516(%ebp), %eax ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl 512(%ebp), %eax @@ -4169,10 +4169,10 @@ ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl 264(%ebp), %eax ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovups %zmm4, (%esp) ; KNL_X32-NEXT: calll _v64i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/avx512-cmp-mask.ll b/llvm/test/CodeGen/X86/avx512-cmp-mask.ll --- a/llvm/test/CodeGen/X86/avx512-cmp-mask.ll +++ b/llvm/test/CodeGen/X86/avx512-cmp-mask.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: vcmpltps %ymm3, %ymm2, %k1 ; CHECK-NEXT: kunpckbw %k1, %k0, %k1 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -62,7 +62,7 @@ ; CHECK-NEXT: vcmpltps %ymm3, %ymm2, %k1 ; CHECK-NEXT: kunpckbw %k1, %k0, %k1 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1} +; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1} ; CHECK-NEXT: 
vzeroupper ; CHECK-NEXT: retq entry: @@ -88,7 +88,7 @@ ; AVX512VL-NEXT: kmovw %eax, %k1 ; AVX512VL-NEXT: korw %k1, %k0, %k1 ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovaps %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vmovups %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -99,7 +99,7 @@ ; AVX512DQVL-NEXT: kshiftlb $4, %k0, %k0 ; AVX512DQVL-NEXT: korb %k0, %k1, %k1 ; AVX512DQVL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovaps %ymm0, (%rdi) {%k1} +; AVX512DQVL-NEXT: vmovups %ymm0, (%rdi) {%k1} ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq entry: @@ -119,7 +119,7 @@ ; CHECK-NEXT: vcmpltps {sae}, %zmm1, %zmm0, %k0 ; CHECK-NEXT: vcmpltps %zmm3, %zmm2, %k1 ; CHECK-NEXT: kxnorw %k1, %k0, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ; CHECK-NEXT: retq entry: %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i16 -1, i32 8) diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -98,7 +98,7 @@ ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512-NEXT: vpmovd2m %xmm2, %k1 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovaps %xmm1, (%rsi) +; AVX512-NEXT: vmovups %xmm1, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1: @@ -110,7 +110,7 @@ ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %xmm1, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <16 x i1>, <16 x i1>* %a0 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> @@ -156,7 +156,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vpmovd2m %xmm2, %k1 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovaps %xmm1, (%rsi) +; AVX512-NEXT: vmovups %xmm1, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1: @@ -168,7 +168,7 @@ ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %xmm1, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <16 x i1>, <16 x i1>* %a0 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32> @@ -214,7 +214,7 @@ ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512-NEXT: vpmovd2m %xmm2, %k1 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovaps %xmm1, (%rsi) +; AVX512-NEXT: vmovups %xmm1, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1: @@ -226,7 +226,7 @@ ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %xmm1, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> @@ -242,7 +242,7 @@ ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-NEXT: vmovups %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -254,7 +254,7 @@ ; AVX512NOTDQ-NEXT: 
vpbroadcastd %xmm2, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 @@ -301,7 +301,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vpmovd2m %xmm2, %k1 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovaps %xmm1, (%rsi) +; AVX512-NEXT: vmovups %xmm1, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1: @@ -313,7 +313,7 @@ ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %xmm1, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32> @@ -330,7 +330,7 @@ ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-NEXT: vmovups %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -344,7 +344,7 @@ ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <32 x i1>, <32 x i1>* %a0 @@ -391,7 +391,7 @@ ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512-NEXT: vpmovd2m %xmm2, %k1 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovaps %xmm1, (%rsi) +; AVX512-NEXT: vmovups %xmm1, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1: @@ -403,7 +403,7 @@ ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %xmm1, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> @@ -419,7 +419,7 @@ ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-NEXT: vmovups %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -431,7 +431,7 @@ ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 @@ -448,7 +448,7 @@ ; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512-NEXT: vpmovd2m %zmm2, %k1 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-NEXT: vmovups %zmm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -459,7 +459,7 @@ ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %zmm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 @@ -506,7 +506,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vpmovd2m %xmm2, %k1 ; 
AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovaps %xmm1, (%rsi) +; AVX512-NEXT: vmovups %xmm1, (%rsi) ; AVX512-NEXT: retq ; ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1: @@ -518,7 +518,7 @@ ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %xmm1, (%rsi) ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32> @@ -535,7 +535,7 @@ ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovaps %ymm1, (%rsi) +; AVX512-NEXT: vmovups %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -549,7 +549,7 @@ ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 @@ -567,7 +567,7 @@ ; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2 ; AVX512-NEXT: vpmovd2m %zmm2, %k1 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovaps %zmm1, (%rsi) +; AVX512-NEXT: vmovups %zmm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -579,7 +579,7 @@ ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi) +; AVX512NOTDQ-NEXT: vmovups %zmm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper ; AVX512NOTDQ-NEXT: retq %d0 = load <64 x i1>, <64 x i1>* %a0 diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector.ll @@ -155,7 +155,7 @@ define void @extract_subvector256_v4f64_store_lo_align_16(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4f64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -181,7 +181,7 @@ define void @extract_subvector256_v4f32_store_lo_align_16(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4f32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -207,7 +207,7 @@ define void @extract_subvector256_v2i64_store_lo_align_16(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v2i64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -233,7 +233,7 @@ define void @extract_subvector256_v4i32_store_lo_align_16(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4i32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -259,7 +259,7 @@ define void @extract_subvector256_v8i16_store_lo_align_16(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: 
extract_subvector256_v8i16_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -285,7 +285,7 @@ define void @extract_subvector256_v16i8_store_lo_align_16(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v16i8_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -311,7 +311,7 @@ define void @extract_subvector512_v2f64_store_lo_align_16(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v2f64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -337,7 +337,7 @@ define void @extract_subvector512_v4f32_store_lo_align_16(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4f32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -363,7 +363,7 @@ define void @extract_subvector512_v2i64_store_lo_align_16(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v2i64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -389,7 +389,7 @@ define void @extract_subvector512_v4i32_store_lo_align_16(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4i32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -428,7 +428,7 @@ define void @extract_subvector512_v16i8_store_lo_align_16(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v16i8_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovups %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -467,7 +467,7 @@ define void @extract_subvector512_v4f64_store_lo_align_32(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovups %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -506,7 +506,7 @@ define void @extract_subvector512_v8f32_store_lo_align_32(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovups %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -545,7 +545,7 @@ define void @extract_subvector512_v4i64_store_lo_align_32(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovups %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -584,7 +584,7 @@ define void @extract_subvector512_v8i32_store_lo_align_32(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovups 
%ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -623,7 +623,7 @@ define void @extract_subvector512_v16i16_store_lo_align_32(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovups %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -662,7 +662,7 @@ define void @extract_subvector512_v32i8_store_lo_align_32(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovups %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll @@ -229,7 +229,7 @@ ; CHECK-LABEL: scatter_mask_dps_execdomain: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -242,7 +242,7 @@ ; CHECK-LABEL: scatter_mask_qps_execdomain: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -224,7 +224,7 @@ ; CHECK-LABEL: scatter_mask_dps_execdomain: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -238,7 +238,7 @@ ; CHECK-LABEL: scatter_mask_qps_execdomain: ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -85,7 +85,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp @@ -104,7 +104,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movq %rbp, %rsp @@ -123,7 +123,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp @@ -142,7 +142,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1073,7 +1073,7 @@ ; CHECK-LABEL: test_extractelement_variable_v2i64: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: movq -24(%rsp,%rdi,8), %rax ; CHECK-NEXT: retq @@ -1092,7 +1092,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: andl $3, %edi ; CHECK-NEXT: movq (%rsp,%rdi,8), %rax ; CHECK-NEXT: movq %rbp, %rsp @@ -1114,7 +1114,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: movq (%rsp,%rdi,8), %rax ; CHECK-NEXT: movq %rbp, %rsp @@ -1129,7 +1129,7 @@ ; CHECK-LABEL: test_extractelement_variable_v2f64: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq @@ -1148,7 +1148,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: andl $3, %edi ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movq %rbp, %rsp @@ -1170,7 +1170,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movq %rbp, %rsp @@ -1185,7 +1185,7 @@ ; CHECK-LABEL: test_extractelement_variable_v4i32: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $3, %edi ; CHECK-NEXT: movl -24(%rsp,%rdi,4), %eax ; CHECK-NEXT: retq @@ -1204,7 +1204,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1226,7 +1226,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1241,7 +1241,7 @@ ; CHECK-LABEL: test_extractelement_variable_v4f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $3, %edi ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq 
@@ -1260,7 +1260,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp @@ -1282,7 +1282,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp @@ -1297,7 +1297,7 @@ ; CHECK-LABEL: test_extractelement_variable_v8i16: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; CHECK-NEXT: retq @@ -1316,7 +1316,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1338,7 +1338,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $31, %edi ; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1353,7 +1353,7 @@ ; CHECK-LABEL: test_extractelement_variable_v16i8: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movb -24(%rsp,%rdi), %al ; CHECK-NEXT: retq @@ -1372,7 +1372,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: andl $31, %edi ; CHECK-NEXT: movb (%rsp,%rdi), %al ; CHECK-NEXT: movq %rbp, %rsp @@ -1395,7 +1395,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: andl $63, %edi ; CHECK-NEXT: movb (%rsp,%rdi), %al ; CHECK-NEXT: movq %rbp, %rsp @@ -1418,7 +1418,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: addb %dil, %dil -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovups %zmm0, (%rsp) ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: andl $63, %eax ; CHECK-NEXT: movb (%rsp,%rax), %al diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll --- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll @@ -51,7 +51,7 @@ ; WIN64-NEXT: subq $176, %rsp ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp ; WIN64-NEXT: andq $-64, %rsp -; WIN64-NEXT: vmovaps (%rcx), %zmm0 +; WIN64-NEXT: vmovups (%rcx), %zmm0 ; WIN64-NEXT: vaddps (%rdx), %zmm0, %zmm0 ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq func_float16_ptr @@ -94,7 +94,7 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-64, %esp ; X32-NEXT: subl $256, %esp ## imm = 0x100 -; X32-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; X32-NEXT: vmovups %zmm1, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill ; X32-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %eax, (%esp) @@ -111,7 +111,7 @@ ; WIN32-NEXT: movl %esp, %ebp ; WIN32-NEXT: andl $-64, %esp ; WIN32-NEXT: subl $192, %esp -; WIN32-NEXT: vmovaps %zmm1, (%esp) # 64-byte Spill +; WIN32-NEXT: vmovups %zmm1, (%esp) # 64-byte Spill ; WIN32-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: pushl %eax @@ -129,7 +129,7 @@ ; WIN64-NEXT: subq $176, %rsp ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp ; WIN64-NEXT: andq $-64, %rsp -; WIN64-NEXT: vmovaps (%rdx), %zmm16 +; WIN64-NEXT: vmovups (%rdx), %zmm16 ; WIN64-NEXT: vaddps (%rcx), %zmm16, %zmm0 ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq func_float16_ptr @@ -190,44 +190,44 @@ ; WIN64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; WIN64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; WIN64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-KNL-NEXT: andq $-64, %rsp -; WIN64-KNL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) -; WIN64-KNL-NEXT: 
vmovaps %zmm0, {{[0-9]+}}(%rsp) +; WIN64-KNL-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp) +; WIN64-KNL-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-KNL-NEXT: callq func_float16 -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -245,44 +245,44 @@ ; WIN64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; WIN64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; WIN64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
WIN64-SKX-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-SKX-NEXT: andq $-64, %rsp -; WIN64-SKX-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) -; WIN64-SKX-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; WIN64-SKX-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp) +; WIN64-SKX-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-SKX-NEXT: callq func_float16 -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte 
Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload ; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload @@ -417,7 +417,7 @@ ; WIN64-KNL-NEXT: .seh_stackalloc 40 ; WIN64-KNL-NEXT: .seh_endprologue ; WIN64-KNL-NEXT: # kill: def $dx killed $dx def $edx -; WIN64-KNL-NEXT: vmovaps (%rcx), %zmm0 +; WIN64-KNL-NEXT: vmovups (%rcx), %zmm0 ; WIN64-KNL-NEXT: kmovw %edx, %k1 ; WIN64-KNL-NEXT: callq func_float16_mask ; WIN64-KNL-NEXT: nop @@ -431,7 +431,7 @@ ; WIN64-SKX-NEXT: .seh_stackalloc 40 ; WIN64-SKX-NEXT: .seh_endprologue ; WIN64-SKX-NEXT: # kill: def $dx killed $dx def $edx -; WIN64-SKX-NEXT: vmovaps (%rcx), %zmm0 +; WIN64-SKX-NEXT: vmovups (%rcx), %zmm0 ; WIN64-SKX-NEXT: kmovd %edx, %k1 ; WIN64-SKX-NEXT: callq func_float16_mask ; WIN64-SKX-NEXT: nop diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -590,16 +590,16 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c] -; X86-NEXT: vmovaps %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x29,0x01] -; X86-NEXT: vmovaps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x00] +; X86-NEXT: vmovups %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x11,0x01] +; X86-NEXT: vmovups %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_store_aligned_ps: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] -; X64-NEXT: vmovaps %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x29,0x07] -; X64-NEXT: vmovaps %zmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x06] +; X64-NEXT: vmovups %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x11,0x07] +; X64-NEXT: vmovups %zmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06] ; X64-NEXT: 
vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask) @@ -741,19 +741,19 @@ ; X86-LABEL: test_mask_load_aligned_ps: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovaps (%eax), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x00] +; X86-NEXT: vmovups (%eax), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x00] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovaps (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x00] -; X86-NEXT: vmovaps (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x08] +; X86-NEXT: vmovups (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x00] +; X86-NEXT: vmovups (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x08] ; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_load_aligned_ps: ; X64: ## %bb.0: -; X64-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07] -; X64-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x0f] +; X64-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07] +; X64-NEXT: vmovups (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x0f] ; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1) @@ -10878,7 +10878,7 @@ ; X86-NEXT: vcmpltps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x01] ; X86-NEXT: vcmpltps %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xcb,0x01] ; X86-NEXT: kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9] -; X86-NEXT: vmovaps (%eax), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x00] +; X86-NEXT: vmovups (%eax), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_cmp_512: @@ -10886,7 +10886,7 @@ ; X64-NEXT: vcmpltps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x01] ; X64-NEXT: vcmpltps %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xcb,0x01] ; X64-NEXT: kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9] -; X64-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x07] ; X64-NEXT: retq ## encoding: [0xc3] entry: %0 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i32 8) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -5908,7 +5908,7 @@ define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, <4 x float>* nocapture readonly %1, float %2, float %3) { ; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rsi), %xmm2 +; X64-NEXT: vmovups (%rsi), %xmm2 ; X64-NEXT: kmovw %edi, %k1 ; 
X64-NEXT: vfmadd213ss {{.*#+}} xmm2 {%k1} {z} = (xmm0 * xmm2) + xmm1 ; X64-NEXT: vmovaps %xmm2, %xmm0 @@ -5919,7 +5919,7 @@ ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 +; X86-NEXT: vmovups (%ecx), %xmm0 ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vfmadd132ss {{.*#+}} xmm0 {%k1} {z} = (xmm0 * mem) + xmm1 ; X86-NEXT: retl @@ -7510,11 +7510,11 @@ ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp -; X86-NEXT: vmovaps 72(%ebp), %zmm3 +; X86-NEXT: vmovups 72(%ebp), %zmm3 ; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0 ; X86-NEXT: vcmplt_oqpd 8(%ebp), %zmm2, %k1 ; X86-NEXT: kunpckbw %k0, %k1, %k1 -; X86-NEXT: vmovaps 136(%ebp), %zmm3 {%k1} +; X86-NEXT: vmovups 136(%ebp), %zmm3 {%k1} ; X86-NEXT: vmovaps %zmm3, %zmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp @@ -7552,9 +7552,9 @@ ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp -; X86-NEXT: vmovaps 72(%ebp), %zmm2 +; X86-NEXT: vmovups 72(%ebp), %zmm2 ; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k1 -; X86-NEXT: vmovaps 136(%ebp), %zmm2 {%k1} +; X86-NEXT: vmovups 136(%ebp), %zmm2 {%k1} ; X86-NEXT: vmovaps %zmm2, %zmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -979,12 +979,12 @@ define <4 x i1> @test14() { ; CHECK-LABEL: test14: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [1,1,0,1] ; CHECK-NEXT: retq ; ; X86-LABEL: test14: ; X86: ## %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] +; X86-NEXT: vmovups {{.*#+}} xmm0 = [1,1,0,1] ; X86-NEXT: retl %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 2 @@ -2120,13 +2120,13 @@ ; KNL-NEXT: orl %eax, %ecx ; KNL-NEXT: je LBB45_2 ; KNL-NEXT: ## %bb.1: ## %L1 -; KNL-NEXT: vmovaps %zmm0, (%rdi) -; KNL-NEXT: vmovaps %zmm1, 64(%rdi) +; KNL-NEXT: vmovups %zmm0, (%rdi) +; KNL-NEXT: vmovups %zmm1, 64(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; KNL-NEXT: LBB45_2: ## %L2 -; KNL-NEXT: vmovaps %zmm0, 4(%rdi) -; KNL-NEXT: vmovaps %zmm1, 68(%rdi) +; KNL-NEXT: vmovups %zmm0, 4(%rdi) +; KNL-NEXT: vmovups %zmm1, 68(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -2143,13 +2143,13 @@ ; SKX-NEXT: kortestd %k1, %k0 ; SKX-NEXT: je LBB45_2 ; SKX-NEXT: ## %bb.1: ## %L1 -; SKX-NEXT: vmovaps %zmm0, (%rdi) -; SKX-NEXT: vmovaps %zmm1, 64(%rdi) +; SKX-NEXT: vmovups %zmm0, (%rdi) +; SKX-NEXT: vmovups %zmm1, 64(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; SKX-NEXT: LBB45_2: ## %L2 -; SKX-NEXT: vmovaps %zmm0, 4(%rdi) -; SKX-NEXT: vmovaps %zmm1, 68(%rdi) +; SKX-NEXT: vmovups %zmm0, 4(%rdi) +; SKX-NEXT: vmovups %zmm1, 68(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; @@ -2166,13 +2166,13 @@ ; AVX512BW-NEXT: kortestd %k1, %k0 ; AVX512BW-NEXT: je LBB45_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 -; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) +; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovups %zmm1, 64(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; AVX512BW-NEXT: LBB45_2: ## %L2 -; AVX512BW-NEXT: vmovaps %zmm0, 4(%rdi) -; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi) +; AVX512BW-NEXT: vmovups %zmm0, 4(%rdi) +; AVX512BW-NEXT: vmovups %zmm1, 68(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ 
-2192,13 +2192,13 @@ ; AVX512DQ-NEXT: orl %eax, %ecx ; AVX512DQ-NEXT: je LBB45_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 -; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi) -; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi) +; AVX512DQ-NEXT: vmovups %zmm0, (%rdi) +; AVX512DQ-NEXT: vmovups %zmm1, 64(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; AVX512DQ-NEXT: LBB45_2: ## %L2 -; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi) -; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi) +; AVX512DQ-NEXT: vmovups %zmm0, 4(%rdi) +; AVX512DQ-NEXT: vmovups %zmm1, 68(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2216,13 +2216,13 @@ ; X86-NEXT: kortestd %k1, %k0 ; X86-NEXT: je LBB45_2 ; X86-NEXT: ## %bb.1: ## %L1 -; X86-NEXT: vmovaps %zmm0, (%eax) -; X86-NEXT: vmovaps %zmm1, 64(%eax) +; X86-NEXT: vmovups %zmm0, (%eax) +; X86-NEXT: vmovups %zmm1, 64(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; X86-NEXT: LBB45_2: ## %L2 -; X86-NEXT: vmovaps %zmm0, 4(%eax) -; X86-NEXT: vmovaps %zmm1, 68(%eax) +; X86-NEXT: vmovups %zmm0, 4(%eax) +; X86-NEXT: vmovups %zmm1, 68(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl %addr1 = getelementptr float, float * %base, i64 0 @@ -4314,8 +4314,8 @@ ; X86-LABEL: store_v128i1_constant: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [4294963197,3758096251,4294959101,3221225403] -; X86-NEXT: vmovaps %xmm0, (%eax) +; X86-NEXT: vmovups {{.*#+}} xmm0 = [4294963197,3758096251,4294959101,3221225403] +; X86-NEXT: vmovups %xmm0, (%eax) ; X86-NEXT: retl entry: store <128 x i1> , <128 x i1>* %R diff --git a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll --- a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll @@ -26,7 +26,7 @@ ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; CHECK-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK-NEXT: callq _calc_expected_mask_val ; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax @@ -37,7 +37,7 @@ ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload ; CHECK-NEXT: callq _check_mask16 -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ; CHECK-NEXT: kmovq %k0, %k1 ; CHECK-NEXT: kmovd %k0, %ecx diff --git a/llvm/test/CodeGen/X86/avx512-mov.ll b/llvm/test/CodeGen/X86/avx512-mov.ll --- a/llvm/test/CodeGen/X86/avx512-mov.ll +++ b/llvm/test/CodeGen/X86/avx512-mov.ll @@ -161,7 +161,7 @@ define <16 x i32> @test17(i8 * %addr) { ; CHECK-LABEL: test17: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 64 @@ -171,7 +171,7 @@ define void @test18(i8 * %addr, <8 x i64> %data) { ; CHECK-LABEL: test18: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] ; CHECK-NEXT: 
retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 64 @@ -191,7 +191,7 @@ define void @test20(i8 * %addr, <16 x i32> %data) { ; CHECK-LABEL: test20: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 64 @@ -201,7 +201,7 @@ define <8 x i64> @test21(i8 * %addr) { ; CHECK-LABEL: test21: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 64 @@ -231,7 +231,7 @@ define void @test24(i8 * %addr, <8 x double> %data) { ; CHECK-LABEL: test24: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 64 @@ -241,7 +241,7 @@ define <8 x double> @test25(i8 * %addr) { ; CHECK-LABEL: test25: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 64 @@ -251,7 +251,7 @@ define void @test26(i8 * %addr, <16 x float> %data) { ; CHECK-LABEL: test26: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 64 @@ -261,7 +261,7 @@ define <16 x float> @test27(i8 * %addr) { ; CHECK-LABEL: test27: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 64 @@ -417,7 +417,7 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] ; CHECK-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x0c] -; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* @@ -445,7 +445,7 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; CHECK-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x0c] -; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <16 
x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -98,9 +98,9 @@ ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %edi ; X32-NEXT: subl $88, %esp -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [2,1,2,1] ; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] +; X32-NEXT: vmovups {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] ; X32-NEXT: vmovups %zmm0, (%esp) ; X32-NEXT: movl $1, {{[0-9]+}}(%esp) ; X32-NEXT: movl $2, {{[0-9]+}}(%esp) @@ -129,9 +129,9 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $48, %rsp ; WIN64-NEXT: .seh_stackalloc 48 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 32 -; WIN64-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 16 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movabsq $4294967298, %rax # imm = 0x100000002 @@ -148,8 +148,8 @@ ; WIN64-NEXT: movq %rax, %r15 ; WIN64-NEXT: movq %rax, %rsi ; WIN64-NEXT: callq test_argv64i1 -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $48, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -247,16 +247,16 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv64i1 ; WIN64-NEXT: kmovq %rax, %k0 ; WIN64-NEXT: vpmovm2b %k0, %zmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -352,14 +352,14 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $128, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144 @@ -380,14 +380,14 @@ ; LINUXOSX64-NEXT: # kill: def $ymm1 killed $ymm1 killed $zmm1 ; LINUXOSX64-NEXT: # kill: def $ymm2 killed $ymm2 killed $zmm2 ; LINUXOSX64-NEXT: callq test_argv32i1helper -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -417,17 +417,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movl $1, %eax ; WIN64-NEXT: movl $1, %ecx ; WIN64-NEXT: movl $1, %edx ; WIN64-NEXT: callq test_argv32i1 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -482,15 +482,15 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv32i1 ; WIN64-NEXT: incl 
%eax -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -581,14 +581,14 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $128, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144 @@ -610,14 +610,14 @@ ; LINUXOSX64-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 ; LINUXOSX64-NEXT: vzeroupper ; LINUXOSX64-NEXT: callq test_argv16i1helper -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -645,17 +645,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; 
WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movl $1, %eax ; WIN64-NEXT: movl $1, %ecx ; WIN64-NEXT: movl $1, %edx ; WIN64-NEXT: callq test_argv16i1 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -712,17 +712,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv16i1 ; WIN64-NEXT: # kill: def $ax killed $ax def $eax ; WIN64-NEXT: incl %eax ; WIN64-NEXT: # kill: def $ax killed $ax killed $eax -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -815,14 +815,14 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $128, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144 @@ -844,14 +844,14 @@ ; LINUXOSX64-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 ; LINUXOSX64-NEXT: vzeroupper ; LINUXOSX64-NEXT: callq test_argv8i1helper -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; 
LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -879,17 +879,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movl $1, %eax ; WIN64-NEXT: movl $1, %ecx ; WIN64-NEXT: movl $1, %edx ; WIN64-NEXT: callq test_argv8i1 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -948,9 +948,9 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv8i1 @@ -958,8 +958,8 @@ ; WIN64-NEXT: kmovd %eax, %k0 ; WIN64-NEXT: vpmovm2w %k0, %zmm0 ; WIN64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -371,14 +371,14 @@ ; WIN64-NEXT: .seh_pushreg %rsp ; WIN64-NEXT: subq $16, %rsp ; WIN64-NEXT: .seh_stackalloc 16 -; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero ; WIN64-NEXT: vaddss %xmm0, %xmm8, %xmm0 ; WIN64-NEXT: callq test_argRetFloat ; WIN64-NEXT: vaddss %xmm0, %xmm8, %xmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte 
Reload +; WIN64-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload ; WIN64-NEXT: addq $16, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: retq @@ -389,7 +389,7 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $16, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 @@ -397,7 +397,7 @@ ; LINUXOSX64-NEXT: vaddss %xmm0, %xmm8, %xmm0 ; LINUXOSX64-NEXT: callq test_argRetFloat ; LINUXOSX64-NEXT: vaddss %xmm0, %xmm8, %xmm0 -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $16, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -451,14 +451,14 @@ ; WIN64-NEXT: .seh_pushreg %rsp ; WIN64-NEXT: subq $16, %rsp ; WIN64-NEXT: .seh_stackalloc 16 -; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero ; WIN64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 ; WIN64-NEXT: callq test_argRetDouble ; WIN64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; WIN64-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload ; WIN64-NEXT: addq $16, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: retq @@ -469,7 +469,7 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $16, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 @@ -477,7 +477,7 @@ ; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 ; LINUXOSX64-NEXT: callq test_argRetDouble ; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $16, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -738,7 +738,7 @@ ; WIN64-NEXT: .seh_pushreg %rsp ; WIN64-NEXT: subq $32, %rsp ; WIN64-NEXT: .seh_stackalloc 32 -; WIN64-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 16 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovdqa %xmm1, %xmm8 @@ -750,7 +750,7 @@ ; WIN64-NEXT: callq test_argRet128Vector ; WIN64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; WIN64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1} -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; WIN64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; WIN64-NEXT: addq $32, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: retq @@ -761,7 +761,7 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $32, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 48 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 @@ -774,7 +774,7 @@ ; LINUXOSX64-NEXT: callq test_argRet128Vector ; LINUXOSX64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; 
LINUXOSX64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1} -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $32, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp diff --git a/llvm/test/CodeGen/X86/avx512-rotate.ll b/llvm/test/CodeGen/X86/avx512-rotate.ll --- a/llvm/test/CodeGen/X86/avx512-rotate.ll +++ b/llvm/test/CodeGen/X86/avx512-rotate.ll @@ -245,7 +245,7 @@ define <8 x i64> @test_fold_rol_v8i64() { ; CHECK-LABEL: test_fold_rol_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -114,7 +114,7 @@ ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp -; X86-NEXT: vmovaps 8(%ebp), %zmm1 +; X86-NEXT: vmovups 8(%ebp), %zmm1 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll @@ -488,7 +488,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,2,4,6,7,6] ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp @@ -498,7 +498,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovups (%rdi), %ymm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[3,0,0,2,4,6,7,6] @@ -513,7 +513,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,0,2,4,6,7,6] @@ -556,7 +556,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovups (%rdi), %ymm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3,4,4,7,4] @@ -571,7 +571,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), 
%ymm1 +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3,4,4,7,4] @@ -623,7 +623,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovups (%rdi), %ymm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[0,1,0,1,4,6,5,4] @@ -638,7 +638,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask4(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,0,1,4,6,5,4] @@ -681,7 +681,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask6(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,7,4,6,7] ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp @@ -691,7 +691,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask6(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovups (%rdi), %ymm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,3,7,4,6,7] @@ -706,7 +706,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask6(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,3,7,4,6,7] @@ -981,7 +981,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12] ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -991,7 +991,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12] @@ -1006,7 +1006,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12] @@ -1049,7 +1049,7 @@ define <16 x float> 
@test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[2,0,0,3,5,5,6,5,9,8,8,8,14,12,13,13] @@ -1064,7 +1064,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[2,0,0,3,5,5,6,5,9,8,8,8,14,12,13,13] @@ -1116,7 +1116,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[3,3,1,1,6,5,5,6,11,11,10,9,15,14,12,12] @@ -1131,7 +1131,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask4(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[3,3,1,1,6,5,5,6,11,11,10,9,15,14,12,12] @@ -1174,7 +1174,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask6(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15] ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -1184,7 +1184,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask6(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15] @@ -1199,7 +1199,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask6(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15] diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -911,7 +911,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [4,0,3,2] ; CHECK-NEXT: 
vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1009,7 +1009,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [5,3,2,5] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -1049,7 +1049,7 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm0 +; CHECK-NEXT: vmovups 16(%rdi), %xmm0 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -1059,7 +1059,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vmovups 16(%rdi), %xmm2 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} @@ -1074,7 +1074,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 +; CHECK-NEXT: vmovups 16(%rdi), %xmm1 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} @@ -1330,7 +1330,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1430,7 +1430,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [3,0,0,13] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1470,7 +1470,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1773,7 +1773,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [12,9,4,10] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -1849,7 +1849,7 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) { ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: 
vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp @@ -2520,7 +2520,7 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %xmm0 +; CHECK-NEXT: vmovups 32(%rdi), %xmm0 ; CHECK-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2634,7 +2634,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [1,3,5,0] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -2650,7 +2650,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0] +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [1,3,5,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2665,7 +2665,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [3,2,7,0] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -2681,7 +2681,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0] +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [3,2,7,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2696,7 +2696,7 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -2707,7 +2707,7 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [3,3,5,2] ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -2723,7 +2723,7 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2] +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [3,3,5,2] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -2738,8 +2738,8 @@ define <4 x float> 
@test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1] +; CHECK-NEXT: vmovups 16(%rdi), %xmm1 +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [2,6,0,1] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp @@ -2749,8 +2749,8 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1] +; CHECK-NEXT: vmovups 16(%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [2,6,0,1] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -2766,8 +2766,8 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1] +; CHECK-NEXT: vmovups 16(%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [2,6,0,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} @@ -2783,8 +2783,8 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2] +; CHECK-NEXT: vmovups 16(%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [2,7,7,2] ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -2800,8 +2800,8 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2] +; CHECK-NEXT: vmovups 16(%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [2,7,7,2] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} @@ -2817,8 +2817,8 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7] +; CHECK-NEXT: vmovups (%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [3,1,3,7] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -2834,8 +2834,8 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7] +; CHECK-NEXT: vmovups (%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [3,1,3,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, 
%xmm1 {%k1} {z} @@ -2851,8 +2851,8 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm1 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3] +; CHECK-NEXT: vmovups (%rdi), %xmm1 +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [1,3,5,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp @@ -2862,8 +2862,8 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3] +; CHECK-NEXT: vmovups (%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [1,3,5,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -2879,8 +2879,8 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3] +; CHECK-NEXT: vmovups (%rdi), %xmm2 +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [1,3,5,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} @@ -2923,7 +2923,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2953,7 +2953,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2967,7 +2967,7 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 @@ -2982,7 +2982,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3023,7 +3023,7 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3037,7 +3037,7 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [4,8,9,10] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -3048,7 +3048,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [4,8,9,10] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -3064,7 +3064,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10] +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [4,8,9,10] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3079,7 +3079,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [8,6,10,6] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -3095,7 +3095,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6] +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [8,6,10,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3142,7 +3142,7 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [10,2,11,6] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper @@ -3153,7 +3153,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [10,2,11,6] ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 @@ -3169,7 +3169,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6] +; CHECK-NEXT: vmovups 
{{.*#+}} xmm2 = [10,2,11,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -3184,8 +3184,8 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vmovups (%rdi), %ymm1 +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3195,8 +3195,8 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vmovups (%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3212,8 +3212,8 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vmovups (%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3229,8 +3229,8 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vmovups (%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3246,8 +3246,8 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vmovups (%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3263,8 +3263,8 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3280,8 +3280,8 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: 
test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3297,8 +3297,8 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] +; CHECK-NEXT: vmovups 32(%rdi), %ymm1 +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3308,8 +3308,8 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 @@ -3325,8 +3325,8 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3343,7 +3343,7 @@ ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3355,7 +3355,7 @@ ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [0,6,7,3] ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3373,7 +3373,7 @@ ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [0,6,7,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} @@ -3390,8 +3390,8 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u> +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = 
<0,10,6,15,u,u,u,u> ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3408,8 +3408,8 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u> +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u> ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3426,7 +3426,7 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [60129542148,60129542148,60129542148,60129542148] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 @@ -3444,7 +3444,7 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 +; CHECK-NEXT: vmovups 32(%rdi), %ymm2 ; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [60129542148,60129542148,60129542148,60129542148] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 @@ -3462,8 +3462,8 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9] -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [3,3,15,9] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper @@ -3475,8 +3475,8 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] -; CHECK-NEXT: vmovaps (%rdi), %ymm3 +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [3,3,15,9] +; CHECK-NEXT: vmovups (%rdi), %ymm3 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3493,8 +3493,8 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9] -; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [3,3,15,9] +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3579,7 +3579,7 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) { ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: retq %vec = load <4 x 
double>, <4 x double>* %vp diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll @@ -512,7 +512,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -599,7 +599,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -634,7 +634,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -728,7 +728,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp @@ -766,7 +766,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> @@ -853,7 +853,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> @@ -888,7 +888,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -982,7 +982,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1254,7 +1254,7 @@ define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = 
[0,4,7,6,5,5,1,6] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> @@ -1421,7 +1421,7 @@ define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> @@ -1480,7 +1480,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1661,7 +1661,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -1725,7 +1725,7 @@ define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> @@ -1734,7 +1734,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1749,7 +1749,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1762,7 +1762,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1777,7 +1777,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1790,7 +1790,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; 
CHECK-LABEL: test_masked_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1805,7 +1805,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1818,7 +1818,7 @@ define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> @@ -1827,7 +1827,7 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} @@ -1842,7 +1842,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} @@ -1855,7 +1855,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp @@ -1865,7 +1865,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1880,7 +1880,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1895,7 +1895,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = 
[1,3,7,4,0,6,6,6] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1910,7 +1910,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1925,7 +1925,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1940,7 +1940,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1955,7 +1955,7 @@ define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] ; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp @@ -1965,7 +1965,7 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] +; CHECK-NEXT: vmovups {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} @@ -1980,7 +1980,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} @@ -1995,7 +1995,7 @@ define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> @@ -2004,7 +2004,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] +; CHECK-NEXT: vmovups {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] ; CHECK-NEXT: vxorps 
%xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2019,7 +2019,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2032,7 +2032,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] +; CHECK-NEXT: vmovups {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2047,7 +2047,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2060,7 +2060,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] +; CHECK-NEXT: vmovups {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2075,7 +2075,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2088,7 +2088,7 @@ define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> @@ -2097,7 +2097,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] +; CHECK-NEXT: vmovups {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} @@ -2112,7 +2112,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x 
float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} @@ -2125,7 +2125,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -2135,7 +2135,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2150,7 +2150,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2165,7 +2165,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2180,7 +2180,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2195,7 +2195,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2210,7 +2210,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] +; CHECK-NEXT: vmovups 
{{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2225,7 +2225,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] ; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -2235,7 +2235,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} @@ -2250,7 +2250,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1 ; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} @@ -2515,7 +2515,7 @@ define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -2694,7 +2694,7 @@ define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -2757,7 +2757,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) { ; CHECK-LABEL: test_8xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp @@ -2950,7 +2950,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) { ; CHECK-LABEL: test_8xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] +; CHECK-NEXT: vmovups {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] ; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp diff --git a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll --- a/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll +++ b/llvm/test/CodeGen/X86/avx512-unsafe-fp-math.ll @@ -13,7 +13,7 @@ ; ; CHECK-LABEL: test_max_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm0 ; 
CHECK-NEXT: retq %a = load <16 x float>, <16 x float>* %a_ptr @@ -30,7 +30,7 @@ ; ; CHECK-LABEL: test_min_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %a = load <16 x float>, <16 x float>* %a_ptr diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -401,7 +401,7 @@ ; ALL-NEXT: subq $24, %rsp ; ALL-NEXT: .cfi_def_cfa_offset 32 ; ALL-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; ALL-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; ALL-NEXT: callq func_f32 ; ALL-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload ; ALL-NEXT: addq $24, %rsp diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -196,7 +196,7 @@ ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512-NEXT: vmovaps %ymm1, (%rsi) +; X64-AVX512-NEXT: vmovups %ymm1, (%rsi) ; X64-AVX512-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -1207,15 +1207,15 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c] ; X86-NEXT: kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca] -; X86-NEXT: vmovaps %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x29,0x01] -; X86-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-NEXT: vmovups %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x11,0x01] +; X86-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_store_ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca] -; X64-NEXT: vmovaps %xmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x29,0x07] -; X64-NEXT: vmovaps %xmm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x06] +; X64-NEXT: vmovups %xmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x11,0x07] +; X64-NEXT: vmovups %xmm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x06] ; X64-NEXT: retq # encoding: [0xc3] call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2) call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1) @@ -1231,16 +1231,16 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c] ; X86-NEXT: kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca] -; X86-NEXT: vmovaps %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x01] -; X86-NEXT: vmovaps %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x00] +; X86-NEXT: vmovups %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x11,0x01] +; X86-NEXT: vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: 
[0xc5,0xfc,0x11,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_store_ps_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca] -; X64-NEXT: vmovaps %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07] -; X64-NEXT: vmovaps %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x06] +; X64-NEXT: vmovups %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x11,0x07] +; X64-NEXT: vmovups %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2) @@ -1502,20 +1502,20 @@ ; X86-LABEL: test_mask_load_aligned_ps_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovaps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x00] +; X86-NEXT: vmovups (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x00] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] ; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vmovaps (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x28,0x00] -; X86-NEXT: vmovaps (%eax), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x08] +; X86-NEXT: vmovups (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x10,0x00] +; X86-NEXT: vmovups (%eax), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x08] ; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_load_aligned_ps_256: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovaps (%rdi), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07] -; X64-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x0f] +; X64-NEXT: vmovups (%rdi), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07] +; X64-NEXT: vmovups (%rdi), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x0f] ; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1) @@ -1618,20 +1618,20 @@ ; X86-LABEL: test_mask_load_aligned_ps_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X86-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] ; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vmovaps (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x28,0x00] -; X86-NEXT: vmovaps (%eax), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0x08] +; X86-NEXT: vmovups (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x10,0x00] +; X86-NEXT: vmovups (%eax), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x10,0x08] ; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf0,0x58,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_load_aligned_ps_128: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovaps (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07] -; X64-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x28,0x0f] +; X64-NEXT: vmovups (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07] +; X64-NEXT: vmovups (%rdi), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x10,0x0f] ; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1) @@ -17250,7 +17250,7 @@ ; X86-NEXT: kshiftlw $4, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x04] ; X86-NEXT: korw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x45,0xc9] ; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; X86-NEXT: vmovaps %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x00] +; X86-NEXT: vmovups %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x11,0x00] ; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec] ; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: .cfi_def_cfa %esp, 4 @@ -17264,7 +17264,7 @@ ; X64-NEXT: kshiftlw $4, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x04] ; X64-NEXT: korw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x45,0xc9] ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; X64-NEXT: vmovaps %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07] +; X64-NEXT: vmovups %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x11,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] entry: @@ -17291,7 +17291,7 @@ ; X86-NEXT: vcmpltps 8(%ebp), %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01] ; X86-NEXT: kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8] ; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; X86-NEXT: vmovaps %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x00] +; X86-NEXT: vmovups %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x11,0x00] ; X86-NEXT: movl %ebp, %esp # encoding: [0x89,0xec] ; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: .cfi_def_cfa %esp, 4 @@ -17304,7 +17304,7 @@ ; X64-NEXT: vcmpltps %ymm3, %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0xcb,0x01] ; X64-NEXT: kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8] ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; X64-NEXT: vmovaps %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x07] +; X64-NEXT: vmovups %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x11,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/avx512vl-mov.ll b/llvm/test/CodeGen/X86/avx512vl-mov.ll --- a/llvm/test/CodeGen/X86/avx512vl-mov.ll +++ b/llvm/test/CodeGen/X86/avx512vl-mov.ll @@ -14,7 +14,7 @@ define <8 x i32> @test_256_2(i8 * %addr) { ; CHECK-LABEL: test_256_2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xfc,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i32>* %res = load <8 x i32>, <8 x i32>* %vaddr, align 32 @@ -24,7 +24,7 @@ define void @test_256_3(i8 * %addr, <4 x i64> %data) { ; CHECK-LABEL: test_256_3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i64>* store <4 x i64>%data, <4 x i64>* %vaddr, align 32 @@ -44,7 +44,7 @@ define void @test_256_5(i8 * %addr, <8 x i32> %data) { ; CHECK-LABEL: test_256_5: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i32>* store <8 x i32>%data, <8 x i32>* %vaddr, align 32 @@ -54,7 +54,7 @@ define <4 x i64> @test_256_6(i8 * %addr) { ; CHECK-LABEL: test_256_6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i64>* %res = load <4 x i64>, <4 x i64>* %vaddr, align 32 @@ -84,7 +84,7 @@ define void @test_256_9(i8 * %addr, <4 x double> %data) { ; CHECK-LABEL: test_256_9: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x double>* store <4 x double>%data, <4 x double>* %vaddr, align 32 @@ -94,7 +94,7 @@ define <4 x double> @test_256_10(i8 * %addr) { ; CHECK-LABEL: test_256_10: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x double>* %res = load <4 x double>, <4 x double>* %vaddr, align 32 @@ -104,7 +104,7 @@ define void @test_256_11(i8 * %addr, <8 x float> %data) { ; CHECK-LABEL: test_256_11: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x float>* store <8 x float>%data, <8 x float>* %vaddr, align 32 @@ -114,7 +114,7 @@ define <8 x float> @test_256_12(i8 * %addr) { ; CHECK-LABEL: test_256_12: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x float>* %res = load <8 x float>, <8 x float>* %vaddr, align 32 @@ -270,7 +270,7 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] ; CHECK-NEXT: vcmpneq_oqps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x0c] -; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07] 
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x float>* @@ -298,7 +298,7 @@ ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x0c] -; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = fcmp one <8 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x float>* @@ -386,7 +386,7 @@ define <4 x i32> @test_128_2(i8 * %addr) { ; CHECK-LABEL: test_128_2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i32>* %res = load <4 x i32>, <4 x i32>* %vaddr, align 16 @@ -396,7 +396,7 @@ define void @test_128_3(i8 * %addr, <2 x i64> %data) { ; CHECK-LABEL: test_128_3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x i64>* store <2 x i64>%data, <2 x i64>* %vaddr, align 16 @@ -416,7 +416,7 @@ define void @test_128_5(i8 * %addr, <4 x i32> %data) { ; CHECK-LABEL: test_128_5: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i32>* store <4 x i32>%data, <4 x i32>* %vaddr, align 16 @@ -426,7 +426,7 @@ define <2 x i64> @test_128_6(i8 * %addr) { ; CHECK-LABEL: test_128_6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x i64>* %res = load <2 x i64>, <2 x i64>* %vaddr, align 16 @@ -456,7 +456,7 @@ define void @test_128_9(i8 * %addr, <2 x double> %data) { ; CHECK-LABEL: test_128_9: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x double>* store <2 x double>%data, <2 x double>* %vaddr, align 16 @@ -466,7 +466,7 @@ define <2 x double> @test_128_10(i8 * %addr) { ; CHECK-LABEL: test_128_10: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x double>* %res = load <2 x double>, <2 x double>* %vaddr, align 16 @@ -476,7 +476,7 @@ define void @test_128_11(i8 * %addr, <4 x float> %data) { ; CHECK-LABEL: test_128_11: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps 
%xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x float>* store <4 x float>%data, <4 x float>* %vaddr, align 16 @@ -486,7 +486,7 @@ define <4 x float> @test_128_12(i8 * %addr) { ; CHECK-LABEL: test_128_12: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x float>* %res = load <4 x float>, <4 x float>* %vaddr, align 16 @@ -641,7 +641,7 @@ ; CHECK-LABEL: test_128_25: ; CHECK: ## %bb.0: ; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 ## encoding: [0x62,0xf2,0x75,0x08,0x27,0xc9] -; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x float>* @@ -667,7 +667,7 @@ ; CHECK-LABEL: test_128_27: ; CHECK: ## %bb.0: ; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc8] -; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x07] +; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %mask = icmp ne <4 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <4 x float>* diff --git a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq func_f32 ; CHECK-NEXT: vbroadcastss (%rsp), %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp @@ -27,7 +27,7 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq func_f32 ; CHECK-NEXT: vbroadcastss (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -19398,7 +19398,7 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rdi), %xmm1 +; NoVLX-NEXT: vmovups (%rdi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19490,7 +19490,7 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: vmovups (%rsi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19584,7 +19584,7 @@ ; NoVLX-LABEL: 
test_vcmpoeqps_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rdi), %xmm1 +; NoVLX-NEXT: vmovups (%rdi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19676,7 +19676,7 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: vmovups (%rsi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19767,7 +19767,7 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rdi), %xmm1 +; NoVLX-NEXT: vmovups (%rdi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19853,7 +19853,7 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: vmovups (%rsi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19941,7 +19941,7 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rdi), %xmm1 +; NoVLX-NEXT: vmovups (%rdi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -20027,7 +20027,7 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: vmovups (%rsi), %xmm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -20120,7 +20120,7 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vmovups (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 @@ -20214,7 +20214,7 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rsi), %ymm1 +; NoVLX-NEXT: vmovups (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 @@ -20309,7 +20309,7 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vmovups (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 @@ -20397,7 +20397,7 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rsi), %ymm1 +; NoVLX-NEXT: vmovups (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 @@ -20489,7 +20489,7 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: 
vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vmovups (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 @@ -20577,7 +20577,7 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: vmovaps (%rsi), %ymm1 +; NoVLX-NEXT: vmovups (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll @@ -86,7 +86,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovaps (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x02] +; X86-NEXT: vmovups (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x02] ; X86-NEXT: vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] @@ -98,7 +98,7 @@ ; ; X64-LABEL: test_mm256_2intersect_epi32_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; X64-NEXT: vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0] @@ -131,7 +131,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x06] +; X86-NEXT: vmovups (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x06] ; X86-NEXT: vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -148,7 +148,7 @@ ; ; X64-LABEL: test_mm256_2intersect_epi64_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] ; X64-NEXT: vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06] ; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -370,7 +370,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06] +; X86-NEXT: vmovups (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x06] ; X86-NEXT: 
vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -386,7 +386,7 @@ ; ; X64-LABEL: test_mm_2intersect_epi32_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; X64-NEXT: vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06] ; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -424,7 +424,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06] +; X86-NEXT: vmovups (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x06] ; X86-NEXT: vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02] ; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] @@ -440,7 +440,7 @@ ; ; X64-LABEL: test_mm_2intersect_epi64_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; X64-NEXT: vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06] ; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] diff --git a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll @@ -76,7 +76,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x06] +; X86-NEXT: vmovups (%esi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x10,0x06] ; X86-NEXT: vp2intersectd (%edx), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x02] ; X86-NEXT: kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01] ; X86-NEXT: kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08] @@ -87,7 +87,7 @@ ; ; X64-LABEL: test_mm512_2intersect_epi32_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; X64-NEXT: vp2intersectd (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x06] ; X64-NEXT: kmovw %k0, (%rdx) # encoding: [0xc5,0xf8,0x91,0x02] ; X64-NEXT: kmovw %k1, (%rcx) # encoding: [0xc5,0xf8,0x91,0x09] @@ -114,7 +114,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovaps (%edx), %zmm0 
# encoding: [0x62,0xf1,0x7c,0x48,0x28,0x02] +; X86-NEXT: vmovups (%edx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x10,0x02] ; X86-NEXT: vp2intersectq (%ecx), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] @@ -126,7 +126,7 @@ ; ; X64-LABEL: test_mm512_2intersect_epi64_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; X64-NEXT: vmovups (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; X64-NEXT: vp2intersectq (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0] diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -569,7 +569,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -728,7 +728,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4 diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll --- a/llvm/test/CodeGen/X86/break-false-dep.ll +++ b/llvm/test/CodeGen/X86/break-false-dep.ll @@ -343,16 +343,16 @@ ; AVX-LABEL: loopdep2: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $184, %rsp -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: movq (%rcx), %rax ; AVX-NEXT: movl $1, %r8d ; AVX-NEXT: .p2align 4, 0x90 @@ -372,16 +372,16 @@ ; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %ret -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX-NEXT: addq $184, %rsp ; AVX-NEXT: retq entry: @@ -527,25 +527,25 @@ ; AVX-NEXT: .seh_pushreg %rsi ; AVX-NEXT: subq $160, %rsp ; AVX-NEXT: .seh_stackalloc 160 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm15, 144 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm14, 128 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm13, 112 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm12, 96 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm11, 80 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm10, 64 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm9, 48 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm8, 32 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm7, 16 -; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm6, 0 ; AVX-NEXT: .seh_endprologue ; AVX-NEXT: xorl %r9d, %r9d @@ -582,16 +582,16 @@ ; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0 ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.4: # %for.end16 -; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX-NEXT: addq $160, %rsp ; AVX-NEXT: popq %rsi ; AVX-NEXT: retq @@ -718,25 +718,25 @@ ; AVX: # %bb.0: # %top ; AVX-NEXT: subq $168, %rsp ; AVX-NEXT: .seh_stackalloc 168 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm15, 144 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm14, 128 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm13, 112 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm12, 96 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm11, 80 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm10, 64 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm9, 48 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm8, 32 -; 
AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm7, 16 -; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm6, 0 ; AVX-NEXT: .seh_endprologue ; AVX-NEXT: #APP @@ -757,16 +757,16 @@ ; AVX-NEXT: #NO_APP ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 -; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX-NEXT: addq $168, %rsp ; AVX-NEXT: retq ; AVX-NEXT: .seh_endproc @@ -877,25 +877,25 @@ ; AVX: # %bb.0: # %top ; AVX-NEXT: subq $184, %rsp ; AVX-NEXT: .seh_stackalloc 184 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm15, 160 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm14, 144 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm13, 128 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm12, 112 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm11, 96 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm10, 80 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm9, 64 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm8, 48 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm7, 32 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm6, 16 ; AVX-NEXT: .seh_endprologue ; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -920,16 +920,16 @@ ; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX-NEXT: addq $184, %rsp ; AVX-NEXT: retq ; AVX-NEXT: .seh_endproc @@ -1037,25 +1037,25 @@ ; AVX: # %bb.0: # %top ; AVX-NEXT: subq $168, %rsp ; AVX-NEXT: .seh_stackalloc 168 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm15, 144 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm14, 128 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm13, 112 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm12, 96 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm11, 80 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm10, 64 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm9, 48 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm8, 32 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; AVX-NEXT: vmovups %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm7, 16 -; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm6, (%rsp) # 16-byte Spill ; AVX-NEXT: .seh_savexmm %xmm6, 0 ; AVX-NEXT: .seh_endprologue ; AVX-NEXT: #APP @@ -1078,16 +1078,16 @@ ; AVX-NEXT: #NO_APP ; AVX-NEXT: vxorps %xmm6, %xmm6, %xmm6 ; AVX-NEXT: vcvtsi2sd %rcx, %xmm6, %xmm0 -; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX-NEXT: addq $168, %rsp ; AVX-NEXT: retq ; AVX-NEXT: .seh_endproc @@ -1195,14 +1195,14 @@ ; AVX-LABEL: loopclearence: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $136, %rsp -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm8, (%rsp) # 16-byte Spill ; AVX-NEXT: movq (%rcx), %rax ; AVX-NEXT: movl $1, %r8d ; AVX-NEXT: .p2align 4, 0x90 @@ -1230,14 +1230,14 @@ ; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 ; AVX-NEXT: jne .LBB12_1 ; AVX-NEXT: # %bb.2: # %ret -; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX-NEXT: addq $136, %rsp ; AVX-NEXT: retq entry: @@ -1406,23 +1406,23 @@ ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: subq $152, %rsp ; AVX1-NEXT: .seh_stackalloc 152 -; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm15, 128 -; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm14, 112 -; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm13, 96 -; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm12, 80 -; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm11, 64 -; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm10, 48 -; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm9, 32 -; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm8, 16 -; AVX1-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovups %xmm7, (%rsp) # 16-byte Spill ; AVX1-NEXT: .seh_savexmm %xmm7, 0 ; AVX1-NEXT: .seh_endprologue ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r8 @@ -1470,15 +1470,15 @@ ; AVX1-NEXT: cmpq %r10, %r8 ; AVX1-NEXT: jge .LBB13_1 ; AVX1-NEXT: # %bb.3: # %loopdone -; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: vmovups (%rsp), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; 
AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-NEXT: addq $152, %rsp ; AVX1-NEXT: retq ; AVX1-NEXT: .seh_endproc @@ -1487,23 +1487,23 @@ ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: subq $152, %rsp ; AVX512VL-NEXT: .seh_stackalloc 152 -; AVX512VL-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm15, 128 -; AVX512VL-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm14, 112 -; AVX512VL-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm13, 96 -; AVX512VL-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm12, 80 -; AVX512VL-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm11, 64 -; AVX512VL-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm10, 48 -; AVX512VL-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm9, 32 -; AVX512VL-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm8, 16 -; AVX512VL-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovups %xmm7, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm7, 0 ; AVX512VL-NEXT: .seh_endprologue ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -1551,15 +1551,15 @@ ; AVX512VL-NEXT: cmpq %r10, %r8 ; AVX512VL-NEXT: jge .LBB13_1 ; AVX512VL-NEXT: # %bb.3: # %loopdone -; AVX512VL-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: vmovups (%rsp), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX512VL-NEXT: addq $152, %rsp ; AVX512VL-NEXT: retq ; AVX512VL-NEXT: .seh_endproc diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -296,7 +296,7 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vmovups {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -328,7 +328,7 @@ ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vmovups {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -577,7 +577,7 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-NEXT: vmovups {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -611,7 +611,7 @@ ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-64-NEXT: vmovups {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1055,7 +1055,7 @@ ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vmovups {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1089,7 +1089,7 @@ ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm4, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vmovups {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1508,7 +1508,7 @@ ; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddq %xmm4, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = 
[0,0,1,0,2,0,3,0] +; AVX-NEXT: vmovups {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1542,7 +1542,7 @@ ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm4, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3] +; AVX-64-NEXT: vmovups {{.*#+}} ymm2 = [0,1,2,3] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1800,7 +1800,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) { ; AVX-LABEL: f16xf32_f256: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; AVX-NEXT: vmovups {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1809,7 +1809,7 @@ ; ; AVX2-LABEL: f16xf32_f256: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; AVX2-NEXT: vmovups {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1826,7 +1826,7 @@ ; ; AVX-64-LABEL: f16xf32_f256: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; AVX-64-NEXT: vmovups {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1835,7 +1835,7 @@ ; ; AVX2-64-LABEL: f16xf32_f256: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; AVX2-64-NEXT: vmovups {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/bswap-vector.ll b/llvm/test/CodeGen/X86/bswap-vector.ll --- a/llvm/test/CodeGen/X86/bswap-vector.ll +++ b/llvm/test/CodeGen/X86/bswap-vector.ll @@ -344,7 +344,7 @@ ; ; CHECK-AVX-LABEL: fold_v8i16: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] +; CHECK-AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] ; CHECK-AVX-NEXT: retq entry: %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> ) @@ -359,7 +359,7 @@ ; ; CHECK-AVX-LABEL: fold_v4i32: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] +; CHECK-AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] ; CHECK-AVX-NEXT: retq entry: %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> ) @@ -379,7 +379,7 @@ ; ; CHECK-AVX-LABEL: fold_v2i64: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] +; CHECK-AVX-NEXT: vmovups {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] ; CHECK-AVX-NEXT: retq entry: %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> ) @@ -395,7 +395,7 @@ ; ; CHECK-AVX-LABEL: fold_v16i16: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] +; CHECK-AVX-NEXT: vmovups {{.*#+}} ymm0 = 
[0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] ; CHECK-AVX-NEXT: retq entry: %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> ) @@ -411,7 +411,7 @@ ; ; CHECK-AVX-LABEL: fold_v8i32: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] +; CHECK-AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] ; CHECK-AVX-NEXT: retq entry: %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> ) @@ -433,7 +433,7 @@ ; ; CHECK-AVX-LABEL: fold_v4i64: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] +; CHECK-AVX-NEXT: vmovups {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] ; CHECK-AVX-NEXT: retq entry: %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> ) diff --git a/llvm/test/CodeGen/X86/bug37521.ll b/llvm/test/CodeGen/X86/bug37521.ll --- a/llvm/test/CodeGen/X86/bug37521.ll +++ b/llvm/test/CodeGen/X86/bug37521.ll @@ -15,7 +15,7 @@ ; CHECK-NEXT: movq a+{{.*}}(%rip), %rdx ; CHECK-NEXT: movq a+{{.*}}(%rip), %rsi ; CHECK-NEXT: movq {{.*}}(%rip), %rdi -; CHECK-NEXT: vmovaps a+{{.*}}(%rip), %xmm0 +; CHECK-NEXT: vmovups a+{{.*}}(%rip), %xmm0 ; CHECK-NEXT: vmovups %xmm0, (%rsp) ; CHECK-NEXT: callq goo ; CHECK-NEXT: addq $24, %rsp diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -745,8 +745,8 @@ ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovups %ymm0, 32(%rsi) +; AVX1-NEXT: vmovups %ymm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -339,7 +339,7 @@ ; AVX1-LABEL: example25: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vmovups {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB5_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -493,8 +493,8 @@ ; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-NEXT: vblendvps %xmm4, %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vmovaps %xmm4, dj+4112(%rax) -; AVX1-NEXT: vmovaps %xmm5, dj+4096(%rax) +; AVX1-NEXT: vmovups %xmm4, dj+4112(%rax) +; AVX1-NEXT: vmovups %xmm5, dj+4096(%rax) ; AVX1-NEXT: addq $32, %rax ; AVX1-NEXT: jne .LBB6_1 ; AVX1-NEXT: # %bb.2: # %for.end diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll --- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -665,7 +665,7 @@ ; ; AVX-LABEL: _clearupper16xi16b: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX-NEXT: vmovups {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vandps %xmm1, 
%xmm0, %xmm0 @@ -803,7 +803,7 @@ ; AVX-LABEL: _clearupper16xi8b: ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %r9 ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movq %r9, %r8 @@ -850,7 +850,7 @@ ; AVX-NEXT: shlq $56, %r8 ; AVX-NEXT: orq %r10, %r8 ; AVX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq %x4 = bitcast <16 x i8> %0 to <32 x i4> @@ -984,7 +984,7 @@ ; ; AVX1-LABEL: _clearupper32xi8b: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: movq %rax, %r8 ; AVX1-NEXT: movq %rax, %rdx @@ -1076,7 +1076,7 @@ ; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX1-NEXT: shrq $56, %rcx ; AVX1-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1 +; AVX1-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll --- a/llvm/test/CodeGen/X86/combine-abs.ll +++ b/llvm/test/CodeGen/X86/combine-abs.ll @@ -14,7 +14,7 @@ ; ; AVX-LABEL: combine_v4i32_abs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,3,2147483648] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,1,3,2147483648] ; AVX-NEXT: retq %1 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> , i1 false) ret <4 x i32> %1 @@ -29,7 +29,7 @@ ; ; AVX-LABEL: combine_v16i16_abs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,1,3,3,7,7,255,255,4096,4096,32767,32767,32768,32768,0] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,1,1,3,3,7,7,255,255,4096,4096,32767,32767,32768,32768,0] ; AVX-NEXT: retq %1 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> , i1 false) ret <16 x i16> %1 diff --git a/llvm/test/CodeGen/X86/combine-add-ssat.ll b/llvm/test/CodeGen/X86/combine-add-ssat.ll --- a/llvm/test/CodeGen/X86/combine-add-ssat.ll +++ b/llvm/test/CodeGen/X86/combine-add-ssat.ll @@ -53,7 +53,7 @@ ; ; AVX-LABEL: combine_constfold_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,256,65534,0,65280,32768,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,0,256,65534,0,65280,32768,0] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res @@ -67,7 +67,7 @@ ; ; AVX-LABEL: combine_constfold_undef_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/combine-add-usat.ll b/llvm/test/CodeGen/X86/combine-add-usat.ll --- a/llvm/test/CodeGen/X86/combine-add-usat.ll +++ b/llvm/test/CodeGen/X86/combine-add-usat.ll @@ -53,7 +53,7 @@ ; ; AVX-LABEL: combine_constfold_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,65535,256,65535,65535,65535,2,65535] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,65535,256,65535,65535,65535,2,65535] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res @@ -67,7 +67,7 @@ ; ; AVX-LABEL: combine_constfold_undef_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,2,65535] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = 
[65535,65535,65535,65535,65535,65535,2,65535] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -64,7 +64,7 @@ ; ; AVX-LABEL: bitselect_v2i64_rm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovups (%rdi), %xmm1 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 @@ -72,7 +72,7 @@ ; ; AVX512F-LABEL: bitselect_v2i64_rm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm1 +; AVX512F-NEXT: vmovups (%rdi), %xmm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 @@ -107,7 +107,7 @@ ; ; AVX-LABEL: bitselect_v2i64_mr: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovups (%rdi), %xmm1 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 @@ -115,7 +115,7 @@ ; ; AVX512F-LABEL: bitselect_v2i64_mr: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm1 +; AVX512F-NEXT: vmovups (%rdi), %xmm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 @@ -152,8 +152,8 @@ ; ; AVX-LABEL: bitselect_v2i64_mm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps (%rsi), %xmm1 +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rsi), %xmm1 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 @@ -161,8 +161,8 @@ ; ; AVX512F-LABEL: bitselect_v2i64_mm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: vmovaps (%rsi), %xmm1 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rsi), %xmm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 @@ -369,7 +369,7 @@ ; ; AVX-LABEL: bitselect_v4i64_rm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm1 +; AVX-NEXT: vmovups (%rdi), %ymm1 ; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -377,7 +377,7 @@ ; ; AVX512F-LABEL: bitselect_v4i64_rm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm1 +; AVX512F-NEXT: vmovups (%rdi), %ymm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -420,7 +420,7 @@ ; ; AVX-LABEL: bitselect_v4i64_mr: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm1 +; AVX-NEXT: vmovups (%rdi), %ymm1 ; AVX-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -428,7 +428,7 @@ ; ; AVX512F-LABEL: bitselect_v4i64_mr: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm1 +; AVX512F-NEXT: vmovups (%rdi), %ymm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -470,8 +470,8 @@ ; ; AVX-LABEL: bitselect_v4i64_mm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps (%rsi), %ymm1 +; AVX-NEXT: vmovups (%rdi), %ymm0 +; AVX-NEXT: vmovups (%rsi), %ymm1 ; 
AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -479,8 +479,8 @@ ; ; AVX512F-LABEL: bitselect_v4i64_mm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vmovaps (%rsi), %ymm1 +; AVX512F-NEXT: vmovups (%rdi), %ymm0 +; AVX512F-NEXT: vmovups (%rsi), %ymm1 ; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -662,7 +662,7 @@ ; ; AVX-LABEL: bitselect_v8i64_rr: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] +; AVX-NEXT: vmovups {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] ; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: PR32957: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: vmovups %ymm0, (%rsi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %ld = load <2 x float>, <2 x float>* %in, align 8 diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll --- a/llvm/test/CodeGen/X86/combine-fabs.ll +++ b/llvm/test/CodeGen/X86/combine-fabs.ll @@ -30,7 +30,7 @@ ; ; AVX-LABEL: combine_vec_fabs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0] ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> ) ret <4 x float> %1 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -149,7 +149,7 @@ ; ; AVX1-LABEL: combine_vec_sdiv_dupe: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; AVX1-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] ; AVX1-NEXT: retq ; ; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe: @@ -159,7 +159,7 @@ ; ; XOP-LABEL: combine_vec_sdiv_dupe: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; XOP-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] ; XOP-NEXT: retq %1 = sdiv <4 x i32> %x, %x ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/X86/combine-sub-ssat.ll b/llvm/test/CodeGen/X86/combine-sub-ssat.ll --- a/llvm/test/CodeGen/X86/combine-sub-ssat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-ssat.ll @@ -53,7 +53,7 @@ ; ; AVX-LABEL: combine_constfold_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,2,254,0,65534,65282,32786,2] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65535,2,254,0,65534,65282,32786,2] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res @@ -67,7 +67,7 @@ ; ; AVX-LABEL: combine_constfold_undef_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ 
b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -53,7 +53,7 @@ ; ; AVX-LABEL: combine_constfold_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res @@ -67,7 +67,7 @@ ; ; AVX-LABEL: combine_constfold_undef_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] ; AVX-NEXT: retq %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -137,7 +137,7 @@ ; ; AVX1-LABEL: combine_vec_udiv_dupe: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; AVX1-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_udiv_dupe: @@ -147,7 +147,7 @@ ; ; XOP-LABEL: combine_vec_udiv_dupe: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; XOP-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] ; XOP-NEXT: retq %1 = udiv <4 x i32> %x, %x ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/X86/commute-fcmp.ll b/llvm/test/CodeGen/X86/commute-fcmp.ll --- a/llvm/test/CodeGen/X86/commute-fcmp.ll +++ b/llvm/test/CodeGen/X86/commute-fcmp.ll @@ -152,13 +152,13 @@ ; ; AVX-LABEL: commute_cmpps_lt: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovups (%rdi), %xmm1 ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: commute_cmpps_lt: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %xmm1 +; AVX512-NEXT: vmovups (%rdi), %xmm1 ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = load <4 x float>, <4 x float>* %a0 @@ -177,13 +177,13 @@ ; ; AVX-LABEL: commute_cmpps_le: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovups (%rdi), %xmm1 ; AVX-NEXT: vcmpleps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: commute_cmpps_le: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %xmm1 +; AVX512-NEXT: vmovups (%rdi), %xmm1 ; AVX512-NEXT: vcmpleps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = load <4 x float>, <4 x float>* %a0 @@ -353,13 +353,13 @@ ; ; AVX-LABEL: commute_cmpps_lt_ymm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm1 +; AVX-NEXT: vmovups (%rdi), %ymm1 ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: commute_cmpps_lt_ymm: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %ymm1 +; AVX512-NEXT: vmovups (%rdi), %ymm1 ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = load <8 x float>, <8 x float>* %a0 @@ -381,13 +381,13 @@ ; ; AVX-LABEL: commute_cmpps_le_ymm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm1 +; AVX-NEXT: vmovups (%rdi), %ymm1 ; AVX-NEXT: vcmpleps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: commute_cmpps_le_ymm: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %ymm1 +; AVX512-NEXT: vmovups (%rdi), %ymm1 ; AVX512-NEXT: vcmpleps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = load <8 x float>, <8 x float>* %a0 @@ -996,8 +996,8 @@ ; ; AVX-LABEL: commute_cmpps_lt_zmm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm2 -; AVX-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX-NEXT: vmovups (%rdi), %ymm2 +; AVX-NEXT: vmovups 32(%rdi), %ymm3 ; AVX-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; AVX-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 ; AVX-NEXT: retq @@ -1032,8 +1032,8 @@ ; ; 
AVX-LABEL: commute_cmpps_le_zmm: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm2 -; AVX-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX-NEXT: vmovups (%rdi), %ymm2 +; AVX-NEXT: vmovups 32(%rdi), %ymm3 ; AVX-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 ; AVX-NEXT: vcmpleps %ymm1, %ymm3, %ymm1 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -160,11 +160,11 @@ ; ; AVX1-LABEL: fptoui_v4f32_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vmovups {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX1-NEXT: vcmpltps %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vsubps %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vxorps %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX1-NEXT: vblendvps %xmm3, %xmm0, %xmm4, %xmm0 diff --git a/llvm/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/llvm/test/CodeGen/X86/dynamic-allocas-VLAs.ll --- a/llvm/test/CodeGen/X86/dynamic-allocas-VLAs.ll +++ b/llvm/test/CodeGen/X86/dynamic-allocas-VLAs.ll @@ -114,7 +114,7 @@ ; CHECK: _t5 ; CHECK: subq ${{[0-9]+}}, %rsp ; -; CHECK: vmovaps (%rdi), [[AVXREG:%ymm[0-9]+]] +; CHECK: vmovups (%rdi), [[AVXREG:%ymm[0-9]+]] ; CHECK: vmovups [[AVXREG]], (%rsp) ; CHECK: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK: callq _t5_helper1 diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -90,9 +90,9 @@ ; AVX512F-LABEL: catcat: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm1 ; AVX512F-NEXT: vmovaps %zmm2, %zmm0 ; AVX512F-NEXT: retq @@ -165,7 +165,7 @@ ; ; AVX-LABEL: cat_ext_straddle: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX-NEXT: vmovups 16(%rdi), %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX-NEXT: retq %x = load <6 x i32>, <6 x i32>* %px diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -353,7 +353,7 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovaps 8(%ebp), %xmm3 +; X86-NEXT: vmovups 8(%ebp), %xmm3 ; X86-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 ; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll --- a/llvm/test/CodeGen/X86/extractelement-index.ll +++ b/llvm/test/CodeGen/X86/extractelement-index.ll @@ -433,7 +433,7 @@ ; AVX-LABEL: extractelement_v16i8_var: ; AVX: # %bb.0: ; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movb -24(%rsp,%rdi), %al ; AVX-NEXT: retq %b = extractelement <16 
x i8> %a, i256 %i @@ -456,7 +456,7 @@ ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp ; AVX-NEXT: andl $31, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) +; AVX-NEXT: vmovups %ymm0, (%rsp) ; AVX-NEXT: movb (%rsp,%rdi), %al ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -477,7 +477,7 @@ ; AVX-LABEL: extractelement_v8i16_var: ; AVX: # %bb.0: ; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX-NEXT: retq %b = extractelement <8 x i16> %a, i256 %i @@ -500,7 +500,7 @@ ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp ; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) +; AVX-NEXT: vmovups %ymm0, (%rsp) ; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -521,7 +521,7 @@ ; AVX-LABEL: extractelement_v4i32_var: ; AVX: # %bb.0: ; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax ; AVX-NEXT: retq %b = extractelement <4 x i32> %a, i256 %i @@ -544,7 +544,7 @@ ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp ; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) +; AVX-NEXT: vmovups %ymm0, (%rsp) ; AVX-NEXT: movl (%rsp,%rdi,4), %eax ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -565,7 +565,7 @@ ; AVX-LABEL: extractelement_v2i64_var: ; AVX: # %bb.0: ; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax ; AVX-NEXT: retq %b = extractelement <2 x i64> %a, i256 %i @@ -588,7 +588,7 @@ ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp ; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) +; AVX-NEXT: vmovups %ymm0, (%rsp) ; AVX-NEXT: movq (%rsp,%rdi,8), %rax ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -119,7 +119,7 @@ ; ; X64-AVX-LABEL: t5: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 ; X64-AVX-NEXT: vmovhps %xmm0, (%rsi) ; X64-AVX-NEXT: retq %vecload = load volatile <2 x double>, <2 x double>* %a0, align 16 diff --git a/llvm/test/CodeGen/X86/fast-isel-store.ll b/llvm/test/CodeGen/X86/fast-isel-store.ll --- a/llvm/test/CodeGen/X86/fast-isel-store.ll +++ b/llvm/test/CodeGen/X86/fast-isel-store.ll @@ -147,13 +147,13 @@ ; ; AVX32-LABEL: test_store_4xf32_aligned: ; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %xmm0, (%rdi) +; AVX32-NEXT: vmovups %xmm0, (%rdi) ; AVX32-NEXT: retq ; ; AVX64-LABEL: test_store_4xf32_aligned: ; AVX64: # %bb.0: ; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %xmm0, (%eax) +; AVX64-NEXT: vmovups %xmm0, (%eax) ; AVX64-NEXT: retl store <4 x float> %value, <4 x float>* %addr, align 16 ret <4 x float> %value @@ -265,13 +265,13 @@ ; ; AVX32-LABEL: test_store_8xi32_aligned: ; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %ymm0, (%rdi) +; AVX32-NEXT: vmovups %ymm0, (%rdi) ; AVX32-NEXT: retq ; ; AVX64-LABEL: test_store_8xi32_aligned: ; AVX64: # %bb.0: ; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %ymm0, (%eax) +; AVX64-NEXT: vmovups %ymm0, (%eax) ; AVX64-NEXT: retl store <8 x i32> %value, <8 x i32>* %addr, align 32 ret <8 x i32> %value @@ -321,13 +321,13 @@ ; ; AVX32-LABEL: test_store_8xf32_aligned: ; 
AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %ymm0, (%rdi) +; AVX32-NEXT: vmovups %ymm0, (%rdi) ; AVX32-NEXT: retq ; ; AVX64-LABEL: test_store_8xf32_aligned: ; AVX64: # %bb.0: ; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %ymm0, (%eax) +; AVX64-NEXT: vmovups %ymm0, (%eax) ; AVX64-NEXT: retl store <8 x float> %value, <8 x float>* %addr, align 32 ret <8 x float> %value @@ -488,26 +488,26 @@ ; ; AVXONLY32-LABEL: test_store_16xi32_aligned: ; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi) -; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi) +; AVXONLY32-NEXT: vmovups %ymm0, (%rdi) +; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi) ; AVXONLY32-NEXT: retq ; ; AVXONLY64-LABEL: test_store_16xi32_aligned: ; AVXONLY64: # %bb.0: ; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVXONLY64-NEXT: vmovaps %ymm0, (%eax) -; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax) +; AVXONLY64-NEXT: vmovups %ymm0, (%eax) +; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax) ; AVXONLY64-NEXT: retl ; ; AVX51232-LABEL: test_store_16xi32_aligned: ; AVX51232: # %bb.0: -; AVX51232-NEXT: vmovaps %zmm0, (%rdi) +; AVX51232-NEXT: vmovups %zmm0, (%rdi) ; AVX51232-NEXT: retq ; ; AVX51264-LABEL: test_store_16xi32_aligned: ; AVX51264: # %bb.0: ; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vmovaps %zmm0, (%eax) +; AVX51264-NEXT: vmovups %zmm0, (%eax) ; AVX51264-NEXT: retl store <16 x i32> %value, <16 x i32>* %addr, align 64 ret <16 x i32> %value @@ -588,26 +588,26 @@ ; ; AVXONLY32-LABEL: test_store_16xf32_aligned: ; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi) -; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi) +; AVXONLY32-NEXT: vmovups %ymm0, (%rdi) +; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi) ; AVXONLY32-NEXT: retq ; ; AVXONLY64-LABEL: test_store_16xf32_aligned: ; AVXONLY64: # %bb.0: ; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVXONLY64-NEXT: vmovaps %ymm0, (%eax) -; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax) +; AVXONLY64-NEXT: vmovups %ymm0, (%eax) +; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax) ; AVXONLY64-NEXT: retl ; ; AVX51232-LABEL: test_store_16xf32_aligned: ; AVX51232: # %bb.0: -; AVX51232-NEXT: vmovaps %zmm0, (%rdi) +; AVX51232-NEXT: vmovups %zmm0, (%rdi) ; AVX51232-NEXT: retq ; ; AVX51264-LABEL: test_store_16xf32_aligned: ; AVX51264: # %bb.0: ; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vmovaps %zmm0, (%eax) +; AVX51264-NEXT: vmovups %zmm0, (%eax) ; AVX51264-NEXT: retl store <16 x float> %value, <16 x float>* %addr, align 64 ret <16 x float> %value diff --git a/llvm/test/CodeGen/X86/fast-isel-vecload.ll b/llvm/test/CodeGen/X86/fast-isel-vecload.ll --- a/llvm/test/CodeGen/X86/fast-isel-vecload.ll +++ b/llvm/test/CodeGen/X86/fast-isel-vecload.ll @@ -135,7 +135,7 @@ ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: retq entry: %0 = load <4 x float>, <4 x float>* %V, align 16 @@ -255,7 +255,7 @@ ; ; AVX-LABEL: test_v4f32_abi_alignment: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: retq entry: %0 = load <4 x float>, <4 x float>* %V @@ -414,7 +414,7 @@ ; ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vmovups (%rdi), %ymm0 ; AVX-NEXT: retq entry: %0 = load <8 x float>, <8 x float>* %V, align 32 @@ -480,8 +480,8 @@ ; ; AVXONLY-LABEL: test_v64i8: ; AVXONLY: # %bb.0: # %entry -; AVXONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVXONLY-NEXT: vmovups (%rdi), %ymm0 +; AVXONLY-NEXT: 
vmovups 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; ; AVX512-LABEL: test_v64i8: @@ -504,8 +504,8 @@ ; ; AVXONLY-LABEL: test_v32i16: ; AVXONLY: # %bb.0: # %entry -; AVXONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVXONLY-NEXT: vmovups (%rdi), %ymm0 +; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; ; AVX512-LABEL: test_v32i16: @@ -528,8 +528,8 @@ ; ; AVXONLY-LABEL: test_v16i32: ; AVXONLY: # %bb.0: # %entry -; AVXONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVXONLY-NEXT: vmovups (%rdi), %ymm0 +; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; ; AVX512-LABEL: test_v16i32: @@ -552,8 +552,8 @@ ; ; AVXONLY-LABEL: test_v8i64: ; AVXONLY: # %bb.0: # %entry -; AVXONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVXONLY-NEXT: vmovups (%rdi), %ymm0 +; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; ; AVX512-LABEL: test_v8i64: @@ -670,7 +670,7 @@ ; ; AVX-LABEL: test_v16f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vmovups (%rdi), %ymm0 ; AVX-NEXT: retq entry: %0 = load <8 x float>, <8 x float>* %V, align 64 diff --git a/llvm/test/CodeGen/X86/fma-commute-x86.ll b/llvm/test/CodeGen/X86/fma-commute-x86.ll --- a/llvm/test/CodeGen/X86/fma-commute-x86.ll +++ b/llvm/test/CodeGen/X86/fma-commute-x86.ll @@ -9,7 +9,7 @@ define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_baa_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; FMA-NEXT: retq @@ -20,7 +20,7 @@ define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_aba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -30,7 +30,7 @@ define <4 x float> @test_x86_fmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_bba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -41,7 +41,7 @@ define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_baa_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -51,7 +51,7 @@ define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_aba_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -61,7 +61,7 @@ define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_bba_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: 
vfmadd213ps {{.*#+}} xmm0 = (xmm0 * xmm0) + mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -72,7 +72,7 @@ define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_baa_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -82,7 +82,7 @@ define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_aba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -92,7 +92,7 @@ define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fmadd_bba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %ymm0 +; FMA-NEXT: vmovups (%rdx), %ymm0 ; FMA-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm0 * ymm0) + mem ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind @@ -198,7 +198,7 @@ define <4 x float> @test_x86_fnmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_baa_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; FMA-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; FMA-NEXT: retq @@ -209,7 +209,7 @@ define <4 x float> @test_x86_fnmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_aba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -219,7 +219,7 @@ define <4 x float> @test_x86_fnmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_bba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -230,7 +230,7 @@ define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_baa_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -240,7 +240,7 @@ define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_aba_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfnmadd231ps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -250,7 +250,7 @@ define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_bba_ps: ; FMA: # %bb.0: -; FMA-NEXT: 
vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -261,7 +261,7 @@ define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_baa_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -271,7 +271,7 @@ define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_aba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfnmadd231ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -281,7 +281,7 @@ define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmadd_bba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %ymm0 +; FMA-NEXT: vmovups (%rdx), %ymm0 ; FMA-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind @@ -386,7 +386,7 @@ define <4 x float> @test_x86_fmsub_baa_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_baa_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; FMA-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; FMA-NEXT: retq @@ -397,7 +397,7 @@ define <4 x float> @test_x86_fmsub_aba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_aba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfmsub132ss {{.*#+}} xmm0 = (xmm0 * mem) - xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -407,7 +407,7 @@ define <4 x float> @test_x86_fmsub_bba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_bba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm0 * xmm0) - mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -418,7 +418,7 @@ define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_baa_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfmsub132ps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -428,7 +428,7 @@ define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_aba_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -438,7 +438,7 @@ define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 
{ ; FMA-LABEL: test_x86_fmsub_bba_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm0 * xmm0) - mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -449,7 +449,7 @@ define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_baa_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfmsub132ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -459,7 +459,7 @@ define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_aba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfmsub231ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -469,7 +469,7 @@ define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fmsub_bba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %ymm0 +; FMA-NEXT: vmovups (%rdx), %ymm0 ; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm0 * ymm0) - mem ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind @@ -575,7 +575,7 @@ define <4 x float> @test_x86_fnmsub_baa_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_baa_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; FMA-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; FMA-NEXT: retq @@ -586,7 +586,7 @@ define <4 x float> @test_x86_fnmsub_aba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_aba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfnmsub132ss {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -596,7 +596,7 @@ define <4 x float> @test_x86_fnmsub_bba_ss(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_bba_ss: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -607,7 +607,7 @@ define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_baa_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -617,7 +617,7 @@ define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_aba_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %xmm0 +; FMA-NEXT: vmovups (%rcx), %xmm0 ; FMA-NEXT: vfnmsub231ps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0 ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -627,7 +627,7 @@ 
define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_bba_ps: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %xmm0 +; FMA-NEXT: vmovups (%rdx), %xmm0 ; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem ; FMA-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -638,7 +638,7 @@ define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_baa_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -648,7 +648,7 @@ define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_aba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rcx), %ymm0 +; FMA-NEXT: vmovups (%rcx), %ymm0 ; FMA-NEXT: vfnmsub231ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0 ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -658,7 +658,7 @@ define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA-LABEL: test_x86_fnmsub_bba_ps_y: ; FMA: # %bb.0: -; FMA-NEXT: vmovaps (%rdx), %ymm0 +; FMA-NEXT: vmovups (%rdx), %ymm0 ; FMA-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem ; FMA-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll b/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll --- a/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll @@ -20,8 +20,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -70,7 +70,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_ss: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02] @@ -130,8 +130,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ 
-182,7 +182,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_ss: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02] @@ -244,8 +244,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -296,7 +296,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_ss: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02] @@ -358,8 +358,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -412,7 +412,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_ss: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02] @@ -476,8 +476,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fmaddsub_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -532,8 +532,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm_fmsubadd_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; 
CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -588,8 +588,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm256_fmadd_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -638,8 +638,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm256_fmsub_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -690,8 +690,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm256_fnmadd_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -742,8 +742,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm256_fnmsub_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -796,8 +796,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm256_fmaddsub_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -852,8 +852,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_mm256_fmsubadd_ps: ; CHECK-FMA-WIN: # %bb.0: # %entry -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # 
encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll @@ -160,7 +160,7 @@ define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: test_mm_fnmsub_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %xmm3, %xmm0, %xmm4 ; CHECK-NEXT: vxorps %xmm3, %xmm2, %xmm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0 @@ -342,7 +342,7 @@ define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) { ; CHECK-LABEL: test_mm256_fnmsub_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0 diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll @@ -19,7 +19,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02] @@ -46,7 +46,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x01] @@ -125,8 +125,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -175,8 +175,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256: ; 
CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -226,7 +226,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02] @@ -253,7 +253,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01] @@ -332,8 +332,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -382,8 +382,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -433,7 +433,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02] @@ -460,7 +460,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = 
mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01] @@ -539,8 +539,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -589,8 +589,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -640,7 +640,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02] @@ -667,7 +667,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01] @@ -746,8 +746,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -796,8 +796,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -847,8 +847,8 @@ ; ; 
CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -897,8 +897,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -948,8 +948,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -998,8 +998,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll @@ -19,7 +19,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02] @@ -50,7 +50,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0 # 
encoding: [0xc4,0xe2,0x71,0x99,0x01] @@ -81,7 +81,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss_231: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%r8), %xmm0 # encoding: [0xc4,0xc1,0x78,0x28,0x00] +; CHECK-FMA-WIN-NEXT: vmovups (%r8), %xmm0 # encoding: [0xc4,0xc1,0x78,0x10,0x00] ; CHECK-FMA-WIN-NEXT: vmovss (%rcx), %xmm1 # encoding: [0xc5,0xfa,0x10,0x09] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmadd231ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xb9,0x02] @@ -170,8 +170,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -218,8 +218,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -267,7 +267,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02] @@ -299,7 +299,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01] @@ -391,8 +391,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -441,8 +441,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] 
+; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -492,7 +492,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02] @@ -524,7 +524,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01] @@ -616,8 +616,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -666,8 +666,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) + mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -717,7 +717,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x01] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02] @@ -750,7 +750,7 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08] ; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01] @@ -845,8 +845,8 @@ ; ; CHECK-FMA-WIN-LABEL: 
test_x86_fma_vfnmsub_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm1 * xmm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -897,8 +897,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = -(ymm1 * ymm0) - mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -950,8 +950,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -1004,8 +1004,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 * ymm0) +/- mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -1059,8 +1059,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * xmm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] @@ -1113,8 +1113,8 @@ ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256: ; CHECK-FMA-WIN: # %bb.0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vmovups (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x10,0x09] +; CHECK-FMA-WIN-NEXT: vmovups (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x10,0x02] ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00] ; CHECK-FMA-WIN-NEXT: # ymm0 = (ymm1 
* ymm0) -/+ mem ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -394,12 +394,12 @@ ; FMACALL32_BDVER2-LABEL: test_v4f32: ; FMACALL32_BDVER2: ## %bb.0: ## %entry ; FMACALL32_BDVER2-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x54,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x4c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] @@ -407,12 +407,12 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x54,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] @@ -420,23 +420,23 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x54] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x54,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] @@ -653,26 +653,26 @@ ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x5c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x54,0x24,0x50] ; 
FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x4c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x4c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] @@ -680,14 +680,14 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xf8,0x10,0x44,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 @@ -747,12 +747,12 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x4c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] @@ -1124,22 +1124,22 @@ ; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] ; FMACALL32_BDVER2-NEXT: subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## imm = 0x1C0 -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm4 ## encoding: [0xc5,0xf8,0x28,0x65,0x38] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups 56(%ebp), %xmm4 ## encoding: [0xc5,0xf8,0x10,0x65,0x38] +; FMACALL32_BDVER2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xc0,0x00,0x00,0x00] +; 
FMACALL32_BDVER2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x94,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x8c,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm4, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x64,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1147,106 +1147,106 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x38] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x8c,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] +; FMACALL32_BDVER2-NEXT: vmovups 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x38] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x84,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovups 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovups 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: 
vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovups 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovups 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovups 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) 
## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x94,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] @@ -1254,85 +1254,85 @@ ; 
FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x94,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovups 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovups 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovups 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovups 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovups 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xd0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x10,0x45,0x38] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x8c,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] @@ -1487,23 +1487,23 @@ ; FMACALL32_BDVER2-LABEL: test_v2f64: ; FMACALL32_BDVER2: ## %bb.0: ; FMACALL32_BDVER2-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc1] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm1[0] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x54,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x4c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] @@ -1673,15 +1673,15 @@ ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70] +; FMACALL32_BDVER2-NEXT: vmovups %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x5c,0x24,0x70] ; FMACALL32_BDVER2-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xf8,0x29,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x54,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] @@ -1714,11 +1714,11 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x70] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x70] ; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] @@ -1926,23 +1926,23 @@ ; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] ; FMACALL32_BDVER2-NEXT: subl $352, %esp ## encoding: [0x81,0xec,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## imm = 0x160 -; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd 56(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x38] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x9c,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x94,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x84,0x24,0x40,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] @@ -1954,8 +1954,8 @@ ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],mem[1] @@ -1968,8 +1968,8 @@ ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] @@ -1982,16 +1982,16 @@ ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x11,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],xmm1[1] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] @@ -2003,8 +2003,8 @@ ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] @@ -2016,8 +2016,8 @@ ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],mem[1] @@ -2030,8 +2030,8 @@ ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xf9,0x14,0x84,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] @@ -2042,8 +2042,8 @@ ; FMACALL32_BDVER2-NEXT: vmovsd 64(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x40] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x10,0x84,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/fma4-commute-x86.ll b/llvm/test/CodeGen/X86/fma4-commute-x86.ll --- a/llvm/test/CodeGen/X86/fma4-commute-x86.ll +++ b/llvm/test/CodeGen/X86/fma4-commute-x86.ll @@ -38,7 +38,7 @@ define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmadd_baa_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -48,7 +48,7 @@ define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmadd_aba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -58,7 +58,7 @@ define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmadd_bba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %xmm0 +; FMA4-NEXT: vmovups (%rdx), %xmm0 ; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm0) + mem ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -69,7 +69,7 @@ define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmadd_baa_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -79,7 +79,7 @@ define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmadd_aba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -89,7 +89,7 @@ define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmadd_bba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %ymm0 +; FMA4-NEXT: vmovups (%rdx), %ymm0 ; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm0) + mem ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x 
float> %a) nounwind @@ -193,7 +193,7 @@ define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmadd_baa_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -203,7 +203,7 @@ define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmadd_aba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -213,7 +213,7 @@ define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmadd_bba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %xmm0 +; FMA4-NEXT: vmovups (%rdx), %xmm0 ; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -224,7 +224,7 @@ define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmadd_baa_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -234,7 +234,7 @@ define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmadd_aba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -244,7 +244,7 @@ define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmadd_bba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %ymm0 +; FMA4-NEXT: vmovups (%rdx), %ymm0 ; FMA4-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm0) + mem ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind @@ -317,7 +317,7 @@ define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmsub_baa_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -327,7 +327,7 @@ define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmsub_aba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * mem) - xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -337,7 +337,7 @@ define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmsub_bba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %xmm0 +; FMA4-NEXT: vmovups (%rdx), %xmm0 ; 
FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm0) - mem ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -348,7 +348,7 @@ define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmsub_baa_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -358,7 +358,7 @@ define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmsub_aba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * mem) - ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -368,7 +368,7 @@ define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fmsub_bba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %ymm0 +; FMA4-NEXT: vmovups (%rdx), %ymm0 ; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm0) - mem ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind @@ -441,7 +441,7 @@ define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmsub_baa_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind @@ -451,7 +451,7 @@ define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmsub_aba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %xmm0 +; FMA4-NEXT: vmovups (%rcx), %xmm0 ; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0 ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind @@ -461,7 +461,7 @@ define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmsub_bba_ps: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %xmm0 +; FMA4-NEXT: vmovups (%rdx), %xmm0 ; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem ; FMA4-NEXT: retq %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind @@ -472,7 +472,7 @@ define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmsub_baa_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfnmsubps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind @@ -482,7 +482,7 @@ define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmsub_aba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rcx), %ymm0 +; FMA4-NEXT: vmovups (%rcx), %ymm0 ; FMA4-NEXT: vfnmsubps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm0 ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind @@ -492,7 +492,7 @@ define <8 x float> 
@test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 { ; FMA4-LABEL: test_x86_fnmsub_bba_ps_y: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps (%rdx), %ymm0 +; FMA4-NEXT: vmovups (%rdx), %ymm0 ; FMA4-NEXT: vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm0) - mem ; FMA4-NEXT: retq %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind diff --git a/llvm/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll b/llvm/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll --- a/llvm/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll +++ b/llvm/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll @@ -77,7 +77,7 @@ define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) { ; CHECK-LABEL: test_x86_fma_vfmadd_ps_load3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0f] +; CHECK-NEXT: vmovups (%rdi), %xmm1 # encoding: [0xc5,0xf8,0x10,0x0f] ; CHECK-NEXT: vfmaddps %xmm0, (%rsi), %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x68,0x06,0x00] ; CHECK-NEXT: # xmm0 = (xmm1 * mem) + xmm0 ; CHECK-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -791,14 +791,14 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA4-INFS-NEXT: retq @@ -832,14 +832,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -873,14 +873,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> +; FMA-INFS-NEXT: vmovups {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> +; FMA4-INFS-NEXT: vmovups {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: 
vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -914,14 +914,14 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA4-INFS-NEXT: retq @@ -955,14 +955,14 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -996,14 +996,14 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> +; FMA-INFS-NEXT: vmovups {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> +; FMA4-INFS-NEXT: vmovups {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -1318,7 +1318,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) { ; FMA-INFS-LABEL: test_v4f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 ; FMA-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1326,7 +1326,7 @@ ; ; FMA4-INFS-LABEL: test_v4f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 ; FMA4-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; FMA4-INFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 @@ -1367,7 +1367,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) { ; FMA-INFS-LABEL: test_v8f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm3 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 @@ -1375,7 +1375,7 @@ ; ; FMA4-INFS-LABEL: test_v8f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -259,7 +259,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -268,7 +268,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -351,7 +351,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -360,7 +360,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -443,7 +443,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -452,7 +452,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm4 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -537,7 +537,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -546,7 +546,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -631,7 +631,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -640,7 +640,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -723,7 +723,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -732,7 +732,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -819,7 +819,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) { ; FMA-INFS-LABEL: test_v16f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovups {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA-INFS-NEXT: 
vmulps %ymm6, %ymm3, %ymm3 @@ -830,7 +830,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovups {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA4-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll --- a/llvm/test/CodeGen/X86/fold-load-unops.ll +++ b/llvm/test/CodeGen/X86/fold-load-unops.ll @@ -235,7 +235,7 @@ ; ; AVX-LABEL: sqrtss_full_size: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <4 x float>, <4 x float>* %a @@ -252,7 +252,7 @@ ; ; AVX-LABEL: sqrtss_full_size_volatile: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load volatile <4 x float>, <4 x float>* %a @@ -286,7 +286,7 @@ ; ; AVX-LABEL: sqrtss_full_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load <4 x float>, <4 x float>* %a @@ -303,7 +303,7 @@ ; ; AVX-LABEL: sqrtss_full_pgso_volatile: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load volatile <4 x float>, <4 x float>* %a diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll --- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll +++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll @@ -11,12 +11,12 @@ define <4 x i16> @test_sext_4i8_4i16() { ; X32-LABEL: test_sext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> +; X32-NEXT: vmovups {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> +; X64-NEXT: vmovups {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -29,12 +29,12 @@ define <4 x i16> @test_sext_4i8_4i16_undef() { ; X32-LABEL: test_sext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = +; X32-NEXT: vmovups {{.*#+}} xmm0 = ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = +; X64-NEXT: vmovups {{.*#+}} xmm0 = ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -47,12 +47,12 @@ define <4 x i32> @test_sext_4i8_4i32() { ; X32-LABEL: test_sext_4i8_4i32: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [0,4294967295,2,4294967293] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i32: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [0,4294967295,2,4294967293] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -65,12 +65,12 @@ define <4 x i32> @test_sext_4i8_4i32_undef() { ; X32-LABEL: test_sext_4i8_4i32_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = +; X32-NEXT: 
vmovups {{.*#+}} xmm0 = ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = +; X64-NEXT: vmovups {{.*#+}} xmm0 = ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -83,12 +83,12 @@ define <4 x i64> @test_sext_4i8_4i64() { ; X32-LABEL: test_sext_4i8_4i64: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,4294967295,4294967295,2,0,4294967293,4294967295] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [0,0,4294967295,4294967295,2,0,4294967293,4294967295] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i64: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,18446744073709551615,2,18446744073709551613] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [0,18446744073709551615,2,18446744073709551613] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -101,12 +101,12 @@ define <4 x i64> @test_sext_4i8_4i64_undef() { ; X32-LABEL: test_sext_4i8_4i64_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = +; X32-NEXT: vmovups {{.*#+}} ymm0 = ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i64_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = +; X64-NEXT: vmovups {{.*#+}} ymm0 = ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -119,12 +119,12 @@ define <8 x i16> @test_sext_8i8_8i16() { ; X32-LABEL: test_sext_8i8_8i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,65535,2,65533,4,65531,6,65529] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [0,65535,2,65533,4,65531,6,65529] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_8i8_8i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,65535,2,65533,4,65531,6,65529] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [0,65535,2,65533,4,65531,6,65529] ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -141,12 +141,12 @@ define <8 x i32> @test_sext_8i8_8i32() { ; X32-LABEL: test_sext_8i8_8i32: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,2,4294967293,4,4294967291,6,4294967289] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [0,4294967295,2,4294967293,4,4294967291,6,4294967289] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_8i8_8i32: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,2,4294967293,4,4294967291,6,4294967289] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [0,4294967295,2,4294967293,4,4294967291,6,4294967289] ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -163,12 +163,12 @@ define <8 x i16> @test_sext_8i8_8i16_undef() { ; X32-LABEL: test_sext_8i8_8i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = +; X32-NEXT: vmovups {{.*#+}} xmm0 = ; X32-NEXT: retl ; ; X64-LABEL: test_sext_8i8_8i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = +; X64-NEXT: vmovups {{.*#+}} xmm0 = ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 undef, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -185,12 +185,12 @@ define <8 x i32> @test_sext_8i8_8i32_undef() { ; X32-LABEL: test_sext_8i8_8i32_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,u,4,u,6,u> +; X32-NEXT: vmovups {{.*#+}} ymm0 = <0,u,2,u,4,u,6,u> ; X32-NEXT: retl ; ; X64-LABEL: test_sext_8i8_8i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,u,4,u,6,u> +; X64-NEXT: vmovups {{.*#+}} ymm0 = <0,u,2,u,4,u,6,u> ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = 
insertelement <8 x i8> %1, i8 undef, i32 1 @@ -207,12 +207,12 @@ define <4 x i16> @test_zext_4i8_4i16() { ; X32-LABEL: test_zext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> +; X32-NEXT: vmovups {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> +; X64-NEXT: vmovups {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -225,12 +225,12 @@ define <4 x i32> @test_zext_4i8_4i32() { ; X32-LABEL: test_zext_4i8_4i32: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [0,255,2,253] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i32: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [0,255,2,253] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -243,12 +243,12 @@ define <4 x i64> @test_zext_4i8_4i64() { ; X32-LABEL: test_zext_4i8_4i64: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,255,0,2,0,253,0] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [0,0,255,0,2,0,253,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i64: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [0,255,2,253] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -261,12 +261,12 @@ define <4 x i16> @test_zext_4i8_4i16_undef() { ; X32-LABEL: test_zext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> +; X32-NEXT: vmovups {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> +; X64-NEXT: vmovups {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -279,12 +279,12 @@ define <4 x i32> @test_zext_4i8_4i32_undef() { ; X32-LABEL: test_zext_4i8_4i32_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,2,0] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [0,0,2,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,2,0] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [0,0,2,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 undef, i32 1 @@ -297,12 +297,12 @@ define <4 x i64> @test_zext_4i8_4i64_undef() { ; X32-LABEL: test_zext_4i8_4i64_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,255,0,2,0,0,0] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [0,0,255,0,2,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i64_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,0] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [0,255,2,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -315,12 +315,12 @@ define <8 x i16> @test_zext_8i8_8i16() { ; X32-LABEL: test_zext_8i8_8i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253,4,251,6,249] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [0,255,2,253,4,251,6,249] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_8i8_8i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253,4,251,6,249] +; X64-NEXT: vmovups {{.*#+}} xmm0 = 
[0,255,2,253,4,251,6,249] ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -337,12 +337,12 @@ define <8 x i32> @test_zext_8i8_8i32() { ; X32-LABEL: test_zext_8i8_8i32: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_8i8_8i32: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249] ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -359,12 +359,12 @@ define <8 x i16> @test_zext_8i8_8i16_undef() { ; X32-LABEL: test_zext_8i8_8i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253,0,251,0,249] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [0,255,0,253,0,251,0,249] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_8i8_8i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253,0,251,0,249] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [0,255,0,253,0,251,0,249] ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 undef, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -381,12 +381,12 @@ define <8 x i32> @test_zext_8i8_8i32_undef() { ; X32-LABEL: test_zext_8i8_8i32_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,2,253,4,0,6,0] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [0,0,2,253,4,0,6,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_8i8_8i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,2,253,4,0,6,0] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [0,0,2,253,4,0,6,0] ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = insertelement <8 x i8> %1, i8 undef, i32 1 diff --git a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll --- a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll +++ b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll @@ -7,7 +7,7 @@ define <4 x float> @test1() { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = trunc <4 x i3> to <4 x i1> %2 = sitofp <4 x i1> %1 to <4 x float> diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -441,9 +441,9 @@ ; ; AVX1-LABEL: round_v16f32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] ; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vroundps $11, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -29,7 +29,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: callq 
__extendsftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq ; @@ -77,7 +77,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: callq __extenddftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq ; @@ -129,7 +129,7 @@ ; X64-AVX-NEXT: fstpt (%rsp) ; X64-AVX-NEXT: wait ; X64-AVX-NEXT: callq __extendxftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: addq $24, %rsp ; X64-AVX-NEXT: retq ; @@ -175,7 +175,7 @@ ; X64-AVX-LABEL: TestFPTruncF128_F32: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __trunctfsf2 ; X64-AVX-NEXT: vmovss %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -214,7 +214,7 @@ ; X64-AVX-LABEL: TestFPTruncF128_F64: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __trunctfdf2 ; X64-AVX-NEXT: vmovsd %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -254,7 +254,7 @@ ; X64-AVX-LABEL: TestFPTruncF128_F80: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __trunctfxf2 ; X64-AVX-NEXT: fstpt {{.*}}(%rip) ; X64-AVX-NEXT: wait diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -58,7 +58,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: callq __extendsftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -105,7 +105,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-AVX-NEXT: callq __extenddftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -154,7 +154,7 @@ ; X64-AVX-NEXT: fldt {{.*}}(%rip) ; X64-AVX-NEXT: fstpt (%rsp) ; X64-AVX-NEXT: callq __extendxftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: addq $24, %rsp ; X64-AVX-NEXT: retq entry: @@ -190,7 +190,7 @@ ; X64-AVX-LABEL: TestFPToSIF128_I16: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixtfsi ; X64-AVX-NEXT: movw %ax, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -228,7 +228,7 @@ ; X64-AVX-LABEL: TestFPToUIF128_I16: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixtfsi ; X64-AVX-NEXT: movw %ax, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -266,7 +266,7 @@ ; X64-AVX-LABEL: TestFPToSIF128_I32: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixtfsi ; X64-AVX-NEXT: movl %eax, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -304,7 +304,7 @@ ; X64-AVX-LABEL: TestFPToUIF128_U32: ; 
X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixunstfsi ; X64-AVX-NEXT: movl %eax, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -345,7 +345,7 @@ ; X64-AVX-LABEL: TestFPToSIF128_I64: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixtfsi ; X64-AVX-NEXT: cltq ; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) @@ -387,7 +387,7 @@ ; X64-AVX-LABEL: TestFPToUIF128_U64: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixunstfsi ; X64-AVX-NEXT: movl %eax, %eax ; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) @@ -439,7 +439,7 @@ ; X64-AVX-LABEL: TestFPToSIF128_I128: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixtfti ; X64-AVX-NEXT: movq %rdx, vi128+{{.*}}(%rip) ; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) @@ -490,7 +490,7 @@ ; X64-AVX-LABEL: TestFPToUIF128_U128: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __fixunstfti ; X64-AVX-NEXT: movq %rdx, vu128+{{.*}}(%rip) ; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) @@ -529,7 +529,7 @@ ; X64-AVX-LABEL: TestFPTruncF128_F32: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __trunctfsf2 ; X64-AVX-NEXT: vmovss %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -567,7 +567,7 @@ ; X64-AVX-LABEL: TestFPTruncF128_F64: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __trunctfdf2 ; X64-AVX-NEXT: vmovsd %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -605,7 +605,7 @@ ; X64-AVX-LABEL: TestFPTruncF128_F80: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 ; X64-AVX-NEXT: callq __trunctfxf2 ; X64-AVX-NEXT: fstpt {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax @@ -655,7 +655,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: movswl {{.*}}(%rip), %edi ; X64-AVX-NEXT: callq __floatsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -703,7 +703,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: movzwl {{.*}}(%rip), %edi ; X64-AVX-NEXT: callq __floatsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -749,7 +749,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: movl {{.*}}(%rip), %edi ; X64-AVX-NEXT: callq __floatsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -795,7 +795,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: movl {{.*}}(%rip), %edi ; X64-AVX-NEXT: callq __floatunsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -842,7 +842,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: movq 
{{.*}}(%rip), %rdi ; X64-AVX-NEXT: callq __floatditf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -889,7 +889,7 @@ ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: movq {{.*}}(%rip), %rdi ; X64-AVX-NEXT: callq __floatunditf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -940,7 +940,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rdi ; X64-AVX-NEXT: movq vi128+{{.*}}(%rip), %rsi ; X64-AVX-NEXT: callq __floattitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -991,7 +991,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rdi ; X64-AVX-NEXT: movq vu128+{{.*}}(%rip), %rsi ; X64-AVX-NEXT: callq __floatuntitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovups %xmm0, {{.*}}(%rip) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: retq entry: @@ -1037,7 +1037,7 @@ ; X64-AVX-LABEL: TestConst128: ; X64-AVX: # %bb.0: # %entry ; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 +; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm1 ; X64-AVX-NEXT: callq __gttf2 ; X64-AVX-NEXT: xorl %ecx, %ecx ; X64-AVX-NEXT: testl %eax, %eax @@ -1168,7 +1168,7 @@ ; X64-AVX-NEXT: subq $24, %rsp ; X64-AVX-NEXT: vmovaps %xmm0, %xmm1 ; X64-AVX-NEXT: callq __multf3 -; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) +; X64-AVX-NEXT: vmovups %xmm0, (%rsp) ; X64-AVX-NEXT: movq (%rsp), %rcx ; X64-AVX-NEXT: movq %rcx, %rdx ; X64-AVX-NEXT: shrq $32, %rdx @@ -1239,7 +1239,7 @@ ; X64-AVX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: adcq $0, %rdi ; X64-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; X64-AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm0 ; X64-AVX-NEXT: retq entry: %conv = zext i64 %a to i128 @@ -1371,7 +1371,7 @@ ; X64-AVX-LABEL: PR34866: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi ; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi ; X64-AVX-NEXT: orq %rsi, %rdi @@ -1406,7 +1406,7 @@ ; X64-AVX-LABEL: PR34866_commute: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi ; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi ; X64-AVX-NEXT: orq %rsi, %rdi diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -62,7 +62,7 @@ ; ; AVX-LABEL: TestUnionLD1: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; AVX-NEXT: shlq $48, %rax ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx @@ -71,7 +71,7 @@ ; AVX-NEXT: orq %rax, %rdx ; AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp foo # TAILCALL entry: %0 = bitcast fp128 %s to i128 @@ -106,11 +106,11 @@ ; ; AVX-LABEL: TestUnionLD2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq 
%rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: retq entry: %0 = bitcast fp128 %s to i128 @@ -151,20 +151,20 @@ ; AVX-LABEL: TestI128_1: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF ; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX-NEXT: movq %rcx, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %xmm0 -; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 +; AVX-NEXT: vmovups (%rsp), %xmm0 +; AVX-NEXT: vmovups {{.*}}(%rip), %xmm1 ; AVX-NEXT: callq __lttf2 ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sets %cl ; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: vmovaps {{\.LCPI.*}}(%rcx), %xmm0 +; AVX-NEXT: vmovups {{\.LCPI.*}}(%rcx), %xmm0 ; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: retq entry: @@ -198,7 +198,7 @@ ; ; AVX-LABEL: TestI128_2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: jns .LBB3_2 ; AVX-NEXT: # %bb.1: # %entry @@ -254,7 +254,7 @@ ; AVX-LABEL: TestI128_3: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 ; AVX-NEXT: testq %rcx, %rax @@ -263,9 +263,9 @@ ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: jmp .LBB4_3 ; AVX-NEXT: .LBB4_2: # %if.then -; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 +; AVX-NEXT: vmovups {{.*}}(%rip), %xmm1 ; AVX-NEXT: callq __multf3 -; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF ; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rdx @@ -274,7 +274,7 @@ ; AVX-NEXT: .LBB4_3: # %if.end ; AVX-NEXT: movq %rcx, (%rsp) ; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps (%rsp), %xmm0 +; AVX-NEXT: vmovups (%rsp), %xmm0 ; AVX-NEXT: addq $56, %rsp ; AVX-NEXT: retq entry: @@ -320,11 +320,11 @@ ; AVX-LABEL: TestI128_4: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovaps %xmm0, %xmm1 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp __addtf3 # TAILCALL entry: %0 = bitcast fp128 %x to i128 @@ -375,11 +375,11 @@ ; AVX-LABEL: acosl: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovaps %xmm0, %xmm1 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp __addtf3 # TAILCALL entry: %0 = bitcast fp128 %x to i128 @@ -403,7 +403,7 @@ ; ; AVX-LABEL: TestComp: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: jns .LBB8_2 ; AVX-NEXT: # %bb.1: 
# %entry @@ -484,30 +484,30 @@ ; AVX-NEXT: pushq %rbx ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm1 +; AVX-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __gttf2 ; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vmovaps %xmm0, %xmm1 ; AVX-NEXT: callq __subtf3 ; AVX-NEXT: testl %ebp, %ebp ; AVX-NEXT: jle .LBB10_1 ; AVX-NEXT: # %bb.2: # %if.then ; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: jmp .LBB10_3 ; AVX-NEXT: .LBB10_1: -; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm2 # 16-byte Reload ; AVX-NEXT: .LBB10_3: # %cleanup -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm2, (%rbx) -; AVX-NEXT: vmovaps %xmm0, 16(%rbx) +; AVX-NEXT: vmovups %xmm2, (%rbx) +; AVX-NEXT: vmovups %xmm0, 16(%rbx) ; AVX-NEXT: movq %rbx, %rax ; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/gpr-to-mask.ll b/llvm/test/CodeGen/X86/gpr-to-mask.ll --- a/llvm/test/CodeGen/X86/gpr-to-mask.ll +++ b/llvm/test/CodeGen/X86/gpr-to-mask.ll @@ -268,7 +268,7 @@ ; X86-64-NEXT: kmovb (%rdx), %k1 ; X86-64-NEXT: .LBB5_3: # %exit ; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vmovups %ymm1, (%rcx) ; X86-64-NEXT: vzeroupper ; X86-64-NEXT: retq ; @@ -289,7 +289,7 @@ ; X86-32-NEXT: kmovb (%ecx), %k1 ; X86-32-NEXT: .LBB5_3: # %exit ; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vmovups %ymm1, (%eax) ; X86-32-NEXT: vzeroupper ; X86-32-NEXT: retl entry: @@ -328,7 +328,7 @@ ; X86-64-NEXT: .LBB6_3: # %exit ; X86-64-NEXT: kmovd %eax, %k1 ; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vmovups %ymm1, (%rcx) ; X86-64-NEXT: vzeroupper ; X86-64-NEXT: retq ; @@ -350,7 +350,7 @@ ; X86-32-NEXT: .LBB6_3: # %exit ; X86-32-NEXT: kmovd %ecx, %k1 ; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vmovups %ymm1, (%eax) ; X86-32-NEXT: vzeroupper ; X86-32-NEXT: retl entry: @@ -388,7 +388,7 @@ ; X86-64-NEXT: kmovb (%rdx), %k1 ; X86-64-NEXT: .LBB7_3: # %exit ; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vmovups %ymm1, (%rcx) ; X86-64-NEXT: vzeroupper ; X86-64-NEXT: retq ; @@ -409,7 +409,7 @@ ; X86-32-NEXT: kmovb (%ecx), %k1 ; X86-32-NEXT: .LBB7_3: # %exit ; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vmovups %ymm1, (%eax) ; X86-32-NEXT: vzeroupper ; X86-32-NEXT: retl entry: @@ -447,7 +447,7 @@ ; X86-64-NEXT: kmovb (%rdx), %k1 ; X86-64-NEXT: .LBB8_3: # %exit ; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; 
X86-64-NEXT: vmovups %ymm1, (%rcx) ; X86-64-NEXT: vzeroupper ; X86-64-NEXT: retq ; @@ -468,7 +468,7 @@ ; X86-32-NEXT: kmovb (%ecx), %k1 ; X86-32-NEXT: .LBB8_3: # %exit ; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vmovups %ymm1, (%eax) ; X86-32-NEXT: vzeroupper ; X86-32-NEXT: retl entry: @@ -507,7 +507,7 @@ ; X86-64-NEXT: kaddb %k1, %k0, %k1 ; X86-64-NEXT: .LBB9_3: # %exit ; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-64-NEXT: vmovaps %ymm1, (%rcx) +; X86-64-NEXT: vmovups %ymm1, (%rcx) ; X86-64-NEXT: vzeroupper ; X86-64-NEXT: retq ; @@ -529,7 +529,7 @@ ; X86-32-NEXT: kaddb %k1, %k0, %k1 ; X86-32-NEXT: .LBB9_3: # %exit ; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1} -; X86-32-NEXT: vmovaps %ymm1, (%eax) +; X86-32-NEXT: vmovups %ymm1, (%eax) ; X86-32-NEXT: vzeroupper ; X86-32-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/haddsub-shuf-undef-operand.ll b/llvm/test/CodeGen/X86/haddsub-shuf-undef-operand.ll --- a/llvm/test/CodeGen/X86/haddsub-shuf-undef-operand.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf-undef-operand.ll @@ -6,8 +6,8 @@ define void @PR43225(<4 x double>* %p0, <4 x double>* %p1, <4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind { ; CHECK-LABEL: PR43225: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rsi), %ymm0 ; CHECK-NEXT: vhsubpd %ymm2, %ymm2, %ymm0 ; CHECK-NEXT: vmovapd %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -754,7 +754,7 @@ ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: callq __truncdfhf2 ; BWON-F16C-NEXT: movl %eax, %ebp -; BWON-F16C-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; BWON-F16C-NEXT: callq __truncdfhf2 ; BWON-F16C-NEXT: movw %ax, 4(%rbx) ; BWON-F16C-NEXT: movw %bp, (%rbx) diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll --- a/llvm/test/CodeGen/X86/i64-mem-copy.ll +++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll @@ -157,8 +157,8 @@ ; X32AVX-NEXT: andl $7, %eax ; X32AVX-NEXT: movl 48(%ebp), %ecx ; X32AVX-NEXT: vmovups 8(%ebp), %ymm1 -; X32AVX-NEXT: vmovaps %ymm1, (%esp) -; X32AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X32AVX-NEXT: vmovups %ymm1, (%esp) +; X32AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32AVX-NEXT: vmovsd %xmm0, (%ecx) ; X32AVX-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -192,13 +192,13 @@ ; ; X32AVX-LABEL: elt1_v4f32: ; X32AVX: # %bb.0: -; X32AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X32AVX-NEXT: vmovups {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> ; X32AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; X32AVX-NEXT: retl ; ; X64AVX-LABEL: elt1_v4f32: ; X64AVX: # %bb.0: -; X64AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64AVX-NEXT: vmovups {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> ; X64AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; X64AVX-NEXT: retq %ins = insertelement <4 x float> , float %x, i32 1 @@ -221,13 +221,13 @@ ; ; X32AVX-LABEL: elt1_v2f64: ; X32AVX: # %bb.0: -; 
X32AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u> +; X32AVX-NEXT: vmovups {{.*#+}} xmm0 = <4.2E+1,u> ; X32AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X32AVX-NEXT: retl ; ; X64AVX-LABEL: elt1_v2f64: ; X64AVX: # %bb.0: -; X64AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u> +; X64AVX-NEXT: vmovups {{.*#+}} xmm1 = <4.2E+1,u> ; X64AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64AVX-NEXT: retq %ins = insertelement <2 x double> , double %x, i32 1 @@ -391,10 +391,10 @@ ; X32AVX1-LABEL: elt5_v8i64: ; X32AVX1: # %bb.0: ; X32AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0] +; X32AVX1-NEXT: vmovups {{.*#+}} xmm1 = [4,0,0,0] ; X32AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X32AVX1-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1 -; X32AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] +; X32AVX1-NEXT: vmovups {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] ; X32AVX1-NEXT: retl ; ; X64AVX1-LABEL: elt5_v8i64: @@ -402,16 +402,16 @@ ; X64AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <4,u,6,7> ; X64AVX1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 ; X64AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; X64AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] +; X64AVX1-NEXT: vmovups {{.*#+}} ymm0 = [42,1,2,3] ; X64AVX1-NEXT: retq ; ; X32AVX2-LABEL: elt5_v8i64: ; X32AVX2: # %bb.0: ; X32AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0] +; X32AVX2-NEXT: vmovups {{.*#+}} xmm1 = [4,0,0,0] ; X32AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X32AVX2-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1 -; X32AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] +; X32AVX2-NEXT: vmovups {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] ; X32AVX2-NEXT: retl ; ; X64AVX2-LABEL: elt5_v8i64: @@ -419,14 +419,14 @@ ; X64AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <4,u,6,7> ; X64AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 ; X64AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; X64AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] +; X64AVX2-NEXT: vmovups {{.*#+}} ymm0 = [42,1,2,3] ; X64AVX2-NEXT: retq ; ; X32AVX512F-LABEL: elt5_v8i64: ; X32AVX512F: # %bb.0: -; X32AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] +; X32AVX512F-NEXT: vmovups {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] ; X32AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0] +; X32AVX512F-NEXT: vmovups {{.*#+}} xmm2 = [4,0,0,0] ; X32AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X32AVX512F-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm1, %ymm1 ; X32AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -465,49 +465,49 @@ ; ; X32AVX1-LABEL: elt1_v8f64: ; X32AVX1: # %bb.0: -; X32AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X32AVX1-NEXT: vmovups {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> ; X32AVX1-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X32AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; X32AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X32AVX1-NEXT: vmovups {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X32AVX1-NEXT: retl ; ; X64AVX1-LABEL: elt1_v8f64: ; X64AVX1: # %bb.0: -; X64AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64AVX1-NEXT: vmovups {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> ; X64AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; X64AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X64AVX1-NEXT: vmovups {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; 
X64AVX1-NEXT: retq ; ; X32AVX2-LABEL: elt1_v8f64: ; X32AVX2: # %bb.0: -; X32AVX2-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X32AVX2-NEXT: vmovups {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> ; X32AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X32AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; X32AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X32AVX2-NEXT: vmovups {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X32AVX2-NEXT: retl ; ; X64AVX2-LABEL: elt1_v8f64: ; X64AVX2: # %bb.0: -; X64AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64AVX2-NEXT: vmovups {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> ; X64AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; X64AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X64AVX2-NEXT: vmovups {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64AVX2-NEXT: retq ; ; X32AVX512F-LABEL: elt1_v8f64: ; X32AVX512F: # %bb.0: -; X32AVX512F-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X32AVX512F-NEXT: vmovups {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> ; X32AVX512F-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; X32AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X32AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> ; X32AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X32AVX512F-NEXT: retl ; ; X64AVX512F-LABEL: elt1_v8f64: ; X64AVX512F: # %bb.0: -; X64AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X64AVX512F-NEXT: vmovups {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> ; X64AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X64AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> ; X64AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X64AVX512F-NEXT: retq %ins = insertelement <8 x double> , double %x, i32 1 diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -156,12 +156,12 @@ define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X32-LABEL: knownbits_mask_or_shuffle_uitofp: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_or_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] ; X64-NEXT: retq %1 = and <4 x i32> %a0, %2 = or <4 x i32> %1, @@ -600,7 +600,7 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp -; X32-NEXT: vmovaps 8(%ebp), %xmm3 +; X32-NEXT: vmovups 8(%ebp), %xmm3 ; X32-NEXT: vandps {{\.LCPI.*}}, %xmm2, %xmm2 ; X32-NEXT: vandps {{\.LCPI.*}}, %xmm3, %xmm3 ; X32-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -636,7 +636,7 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp -; X32-NEXT: vmovaps 8(%ebp), %xmm3 +; X32-NEXT: vmovups 8(%ebp), %xmm3 ; X32-NEXT: vpsrld $5, %xmm2, %xmm2 ; X32-NEXT: vandps 
{{\.LCPI.*}}, %xmm3, %xmm3 ; X32-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -758,7 +758,7 @@ ; X86-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; X86-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm1, %ymm0, %ymm0 -; X86-NEXT: vmovaps %ymm0, (%eax) +; X86-NEXT: vmovups %ymm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -772,7 +772,7 @@ ; X64-AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; X64-AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; X64-AVX1-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdi) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/legalize-vaarg.ll b/llvm/test/CodeGen/X86/legalize-vaarg.ll --- a/llvm/test/CodeGen/X86/legalize-vaarg.ll +++ b/llvm/test/CodeGen/X86/legalize-vaarg.ll @@ -20,7 +20,7 @@ ; CHECK-NEXT: leaq 32(%rax), %rcx ; CHECK-NEXT: movq %rcx, (%rsp) ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmovaps (%rax), %ymm0 +; CHECK-NEXT: vmovups (%rax), %ymm0 ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: cmpl $24, %ecx ; CHECK-NEXT: jae .LBB0_5 @@ -37,7 +37,7 @@ ; CHECK-NEXT: leaq 32(%rax), %rcx ; CHECK-NEXT: movq %rcx, (%rsp) ; CHECK-NEXT: .LBB0_6: -; CHECK-NEXT: vmovaps (%rax), %ymm1 +; CHECK-NEXT: vmovups (%rax), %ymm1 ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: cmpl $24, %ecx ; CHECK-NEXT: jae .LBB0_8 @@ -54,7 +54,7 @@ ; CHECK-NEXT: leaq 32(%rax), %rcx ; CHECK-NEXT: movq %rcx, (%rsp) ; CHECK-NEXT: .LBB0_9: -; CHECK-NEXT: vmovaps (%rax), %ymm2 +; CHECK-NEXT: vmovups (%rax), %ymm2 ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: cmpl $24, %ecx ; CHECK-NEXT: jae .LBB0_11 @@ -63,7 +63,7 @@ ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: addl $8, %ecx ; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps (%rax), %ymm3 +; CHECK-NEXT: vmovups (%rax), %ymm3 ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB0_11: ; CHECK-NEXT: movq (%rsp), %rax @@ -71,7 +71,7 @@ ; CHECK-NEXT: andq $-32, %rax ; CHECK-NEXT: leaq 32(%rax), %rcx ; CHECK-NEXT: movq %rcx, (%rsp) -; CHECK-NEXT: vmovaps (%rax), %ymm3 +; CHECK-NEXT: vmovups (%rax), %ymm3 ; CHECK-NEXT: retq %args = alloca i8*, align 4 %x = va_arg i8** %args, <32 x i32> diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -164,7 +164,7 @@ ; ; AVX-LABEL: load_float4_float3_trunc: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* %3 = load i64, i64* %2, align 16 @@ -195,7 +195,7 @@ ; AVX-LABEL: load_float4_float3_trunc_0122: ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovups (%rdi), %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* @@ -239,7 +239,7 @@ ; ; AVX-LABEL: load_float4_float3_trunc_0123: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; AVX-NEXT: retq @@ -365,7 +365,7 @@ ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 
$1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX1-NEXT: vmovups %ymm0, 672(%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -1226,7 +1226,7 @@ ; AVX2-LABEL: compressstore_v16f32_const: ; AVX2: ## %bb.0: ; AVX2-NEXT: vmovups %ymm0, (%rdi) -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,4] +; AVX2-NEXT: vmovups {{.*#+}} xmm0 = [0,1,2,4] ; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovups %xmm0, 32(%rdi) ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -3091,7 +3091,7 @@ define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %foo) { ; KNL_64-LABEL: test_sext_cse: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vmovaps %zmm0, (%rsi) +; KNL_64-NEXT: vmovups %zmm0, (%rsi) ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0 @@ -3101,7 +3101,7 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; KNL_32-NEXT: vmovaps %zmm0, (%ecx) +; KNL_32-NEXT: vmovups %zmm0, (%ecx) ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} ; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0 @@ -3109,7 +3109,7 @@ ; ; SKX-LABEL: test_sext_cse: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps %zmm0, (%rsi) +; SKX-NEXT: vmovups %zmm0, (%rsi) ; SKX-NEXT: kxnorw %k0, %k0, %k1 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0 @@ -3119,7 +3119,7 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SKX_32-NEXT: vmovaps %zmm0, (%ecx) +; SKX_32-NEXT: vmovups %zmm0, (%ecx) ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} ; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -1275,14 +1275,14 @@ ; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 -; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} +; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1} {z} ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: load_v8f32_v8i1_zero: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1 -; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} +; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1} {z} ; AVX512VLBW-NEXT: retq ; ; X86-AVX512-LABEL: load_v8f32_v8i1_zero: @@ -1290,7 +1290,7 @@ ; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0 ; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z} +; X86-AVX512-NEXT: vmovups (%eax), %ymm0 {%k1} {z} ; X86-AVX512-NEXT: retl %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %res @@ -6437,7 +6437,7 @@ ; ; AVX1-LABEL: mload_constmask_v4i32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = 
[0,4294967295,4294967295,4294967295] +; AVX1-NEXT: vmovups {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295] ; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX1-NEXT: retq @@ -6534,7 +6534,7 @@ ; ; AVX1OR2-LABEL: mload_constmask_v8f32: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0] +; AVX1OR2-NEXT: vmovups {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0] ; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1OR2-NEXT: retq @@ -6591,7 +6591,7 @@ ; ; AVX1OR2-LABEL: mload_constmask_v8f32_zero: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0] +; AVX1OR2-NEXT: vmovups {{.*#+}} ymm0 = [4294967295,4294967295,4294967295,0,0,0,0,0] ; AVX1OR2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 ; AVX1OR2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -376,7 +376,7 @@ ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -763,7 +763,7 @@ ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm3 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -2223,7 +2223,7 @@ ; ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -2901,7 +2901,7 @@ ; ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -4887,7 +4887,7 @@ ; ; AVX1-LABEL: truncstore_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -28,7 +28,7 @@ ; ; AVX-LABEL: memset_16_nonzero_bytes: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX-NEXT: vmovups %xmm0, (%rdi) ; AVX-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1) @@ -54,7 +54,7 @@ ; ; AVX-LABEL: 
memset_32_nonzero_bytes: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX-NEXT: vmovups %ymm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -87,7 +87,7 @@ ; ; AVX1-LABEL: memset_64_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) ; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper @@ -95,7 +95,7 @@ ; ; AVX2-LABEL: memset_64_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) ; AVX2-NEXT: vmovups %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper @@ -110,7 +110,7 @@ ; ; AVX512BW-LABEL: memset_64_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovups {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -156,7 +156,7 @@ ; ; AVX1-LABEL: memset_128_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 96(%rdi) ; AVX1-NEXT: vmovups %ymm0, 64(%rdi) ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) @@ -166,7 +166,7 @@ ; ; AVX2-LABEL: memset_128_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 96(%rdi) ; AVX2-NEXT: vmovups %ymm0, 64(%rdi) ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) @@ -184,7 +184,7 @@ ; ; AVX512BW-LABEL: memset_128_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovups {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper @@ -228,7 +228,7 @@ ; ; AVX1-LABEL: 
memset_256_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 224(%rdi) ; AVX1-NEXT: vmovups %ymm0, 192(%rdi) ; AVX1-NEXT: vmovups %ymm0, 160(%rdi) @@ -242,7 +242,7 @@ ; ; AVX2-LABEL: memset_256_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 224(%rdi) ; AVX2-NEXT: vmovups %ymm0, 192(%rdi) ; AVX2-NEXT: vmovups %ymm0, 160(%rdi) @@ -266,7 +266,7 @@ ; ; AVX512BW-LABEL: memset_256_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovups {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) diff --git a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll --- a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll +++ b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll @@ -66,7 +66,7 @@ ; AVX-NEXT: movl %esp, %esi ; AVX-NEXT: movl 8(%ebp), %eax ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %ymm0, (%esi) +; AVX-NEXT: vmovups %ymm0, (%esi) ; AVX-NEXT: addl $3, %eax ; AVX-NEXT: andl $-4, %eax ; AVX-NEXT: calll __alloca @@ -138,7 +138,7 @@ ; AVX-NEXT: movl %esp, %esi ; AVX-NEXT: movl 8(%ebp), %eax ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%esi) +; AVX-NEXT: vmovups %xmm0, (%esi) ; AVX-NEXT: addl $3, %eax ; AVX-NEXT: andl $-4, %eax ; AVX-NEXT: calll __alloca diff --git a/llvm/test/CodeGen/X86/memset-zero.ll b/llvm/test/CodeGen/X86/memset-zero.ll --- a/llvm/test/CodeGen/X86/memset-zero.ll +++ b/llvm/test/CodeGen/X86/memset-zero.ll @@ -608,21 +608,21 @@ ; SANDYBRIDGE-LABEL: memset_32_align32: ; SANDYBRIDGE: # %bb.0: # %entry ; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SANDYBRIDGE-NEXT: vmovaps %ymm0, (%rdi) +; SANDYBRIDGE-NEXT: vmovups %ymm0, (%rdi) ; SANDYBRIDGE-NEXT: vzeroupper ; SANDYBRIDGE-NEXT: retq ; ; SKYLAKE-LABEL: memset_32_align32: ; SKYLAKE: # %bb.0: # %entry ; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovaps %ymm0, (%rdi) +; SKYLAKE-NEXT: vmovups %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_32_align32: ; KNL: # %bb.0: # %entry ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovaps %ymm0, (%rdi) +; KNL-NEXT: vmovups %ymm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 false) @@ -802,23 +802,23 @@ ; SANDYBRIDGE-LABEL: memset_64_align64: ; SANDYBRIDGE: # %bb.0: # %entry ; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SANDYBRIDGE-NEXT: vmovaps %ymm0, 32(%rdi) -; SANDYBRIDGE-NEXT: vmovaps %ymm0, (%rdi) +; 
SANDYBRIDGE-NEXT: vmovups %ymm0, 32(%rdi) +; SANDYBRIDGE-NEXT: vmovups %ymm0, (%rdi) ; SANDYBRIDGE-NEXT: vzeroupper ; SANDYBRIDGE-NEXT: retq ; ; SKYLAKE-LABEL: memset_64_align64: ; SKYLAKE: # %bb.0: # %entry ; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovaps %ymm0, 32(%rdi) -; SKYLAKE-NEXT: vmovaps %ymm0, (%rdi) +; SKYLAKE-NEXT: vmovups %ymm0, 32(%rdi) +; SKYLAKE-NEXT: vmovups %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_64_align64: ; KNL: # %bb.0: # %entry ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovaps %zmm0, (%rdi) +; KNL-NEXT: vmovups %zmm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 false) diff --git a/llvm/test/CodeGen/X86/memset.ll b/llvm/test/CodeGen/X86/memset.ll --- a/llvm/test/CodeGen/X86/memset.ll +++ b/llvm/test/CodeGen/X86/memset.ll @@ -43,7 +43,7 @@ ; YMM-NEXT: subl $96, %esp ; YMM-NEXT: leal {{[0-9]+}}(%esp), %eax ; YMM-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; YMM-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; YMM-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; YMM-NEXT: movl %eax, (%esp) ; YMM-NEXT: vzeroupper ; YMM-NEXT: calll _foo diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1071,7 +1071,7 @@ ; AVX-LABEL: merge_4i32_i32_combine: ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovaps %xmm0, (%rdi) +; AVX-NEXT: vmovups %xmm0, (%rdi) ; AVX-NEXT: retq ; ; X32-SSE1-LABEL: merge_4i32_i32_combine: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -28,13 +28,13 @@ define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_4f64_2f64_2z: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX-NEXT: vmovups 32(%rdi), %xmm0 ; AVX-NEXT: retq ; ; X32-AVX-LABEL: merge_4f64_2f64_2z: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0 +; X32-AVX-NEXT: vmovups 32(%eax), %xmm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 %val0 = load <2 x double>, <2 x double>* %ptr0 @@ -155,13 +155,13 @@ define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_4i64_2i64_3z: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX-NEXT: vmovups 48(%rdi), %xmm0 ; AVX-NEXT: retq ; ; X32-AVX-LABEL: merge_4i64_2i64_3z: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps 48(%eax), %xmm0 +; X32-AVX-NEXT: vmovups 48(%eax), %xmm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3 %val0 = load <2 x i64>, <2 x i64>* %ptr0 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -316,7 +316,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: vmovups (%rdi), %zmm1 ; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> +; ALL-NEXT: vmovups {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> ; 
ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; ALL-NEXT: retq ; @@ -325,7 +325,7 @@ ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovups (%eax), %zmm1 ; X32-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> +; X32-AVX512F-NEXT: vmovups {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15> ; X32-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds float, float* %ptr, i64 0 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -110,9 +110,9 @@ ; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 ; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) +; X64-AVX-NEXT: vmovups %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -147,10 +147,10 @@ ; ; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 ; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) +; X64-AVX-NEXT: vmovups %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -241,8 +241,8 @@ ; ; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 ; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) ; X64-AVX-NEXT: vmovntps %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll --- a/llvm/test/CodeGen/X86/merge-store-constants.ll +++ b/llvm/test/CodeGen/X86/merge-store-constants.ll @@ -6,13 +6,13 @@ ; X32-LABEL: big_nonzero_16_bytes: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [1,2,3,4] ; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: big_nonzero_16_bytes: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4] +; X64-NEXT: vmovups {{.*#+}} xmm0 = [1,2,3,4] ; X64-NEXT: vmovups %xmm0, (%rdi) ; X64-NEXT: retq %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1 @@ -34,7 +34,7 @@ ; X32-LABEL: big_nonzero_16_bytes_big64bit_constants: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,3] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,3] ; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: retl ; @@ -58,14 +58,14 @@ ; X32-LABEL: big_nonzero_32_bytes_splat: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] ; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: big_nonzero_32_bytes_splat: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X64-NEXT: vmovups {{.*#+}} ymm0 
= [42,42,42,42,42,42,42,42] ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -94,9 +94,9 @@ ; X32-LABEL: big_nonzero_63_bytes: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0] +; X32-NEXT: vmovups {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0] ; X32-NEXT: vmovups %ymm0, (%eax) -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [5,0,6,0] +; X32-NEXT: vmovups {{.*#+}} xmm0 = [5,0,6,0] ; X32-NEXT: vmovups %xmm0, 32(%eax) ; X32-NEXT: movl $0, 52(%eax) ; X32-NEXT: movl $7, 48(%eax) @@ -108,7 +108,7 @@ ; ; X64-LABEL: big_nonzero_63_bytes: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,3,4] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [1,2,3,4] ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: movq $5, 32(%rdi) ; X64-NEXT: movq $6, 40(%rdi) diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -440,8 +440,8 @@ ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: vpmovm2d %k0, %ymm1 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-NEXT: vmovups %ymm1, (%rdi) +; CHECK-NEXT: vmovups %ymm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -456,7 +456,7 @@ ; CHECK-NEXT: vpmovw2m %ymm0, %k0 ; CHECK-NEXT: vpmovm2d %k0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovups %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -478,10 +478,10 @@ ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovups %ymm2, 32(%rdi) +; CHECK-NEXT: vmovups %ymm3, (%rdi) +; CHECK-NEXT: vmovups %ymm0, 96(%rdi) +; CHECK-NEXT: vmovups %ymm1, 64(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -498,8 +498,8 @@ ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vmovups %zmm0, 64(%rdi) +; CHECK-NEXT: vmovups %zmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -519,8 +519,8 @@ ; CHECK-NEXT: vpmovm2d %k0, %ymm1 ; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-NEXT: vmovups %ymm1, (%rdi) +; CHECK-NEXT: vmovups %ymm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -536,7 +536,7 @@ ; CHECK-NEXT: vpmovm2d %k0, %zmm0 ; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovups %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -560,10 +560,10 @@ ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: 
vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovups %ymm2, 32(%rdi) +; CHECK-NEXT: vmovups %ymm3, (%rdi) +; CHECK-NEXT: vmovups %ymm0, 96(%rdi) +; CHECK-NEXT: vmovups %ymm1, 64(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -581,8 +581,8 @@ ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vmovups %zmm0, 64(%rdi) +; CHECK-NEXT: vmovups %zmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer diff --git a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll @@ -209,7 +209,7 @@ ; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 +; AVX-NEXT: vmovups (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq @@ -240,7 +240,7 @@ ; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 +; AVX-NEXT: vmovups (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq @@ -271,7 +271,7 @@ ; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 +; AVX-NEXT: vmovups (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq @@ -302,7 +302,7 @@ ; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 +; AVX-NEXT: vmovups (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq @@ -333,7 +333,7 @@ ; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 +; AVX-NEXT: vmovups (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq @@ -364,7 +364,7 @@ ; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 +; AVX-NEXT: vmovups (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq @@ -579,8 +579,8 @@ ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovaps (%rsp), %ymm0 -; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX1-NEXT: vmovups (%rsp), %ymm0 +; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -599,8 +599,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovups (%rsp), %ymm0 +; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -619,7 +619,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ 
-658,8 +658,8 @@ ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovaps (%rsp), %ymm0 -; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX1-NEXT: vmovups (%rsp), %ymm0 +; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -678,8 +678,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovups (%rsp), %ymm0 +; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -698,7 +698,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -737,8 +737,8 @@ ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovaps (%rsp), %ymm0 -; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX1-NEXT: vmovups (%rsp), %ymm0 +; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -757,8 +757,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovups (%rsp), %ymm0 +; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -777,7 +777,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -816,8 +816,8 @@ ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovaps (%rsp), %ymm0 -; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX1-NEXT: vmovups (%rsp), %ymm0 +; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -836,8 +836,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovups (%rsp), %ymm0 +; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -856,7 +856,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -895,8 +895,8 @@ ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovaps (%rsp), %ymm0 -; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX1-NEXT: vmovups (%rsp), %ymm0 +; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; 
AVX1-NEXT: retq @@ -915,8 +915,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovups (%rsp), %ymm0 +; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -935,7 +935,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -974,8 +974,8 @@ ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovaps (%rsp), %ymm0 -; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX1-NEXT: vmovups (%rsp), %ymm0 +; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -994,8 +994,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovups (%rsp), %ymm0 +; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -1014,7 +1014,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1065,7 +1065,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1116,7 +1116,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1167,7 +1167,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1218,7 +1218,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1269,7 +1269,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1320,7 +1320,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovups (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -51,14 +51,14 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX-NEXT: vmovhps %xmm0, 16(%rdi) -; AVX-NEXT: vmovaps %xmm1, (%rdi) +; AVX-NEXT: vmovups %xmm1, (%rdi) ; AVX-NEXT: retq ; ; XOP-LABEL: v3f64: ; XOP: # %bb.0: ; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; XOP-NEXT: vmovhps %xmm0, 16(%rdi) -; XOP-NEXT: vmovaps %xmm1, (%rdi) +; XOP-NEXT: vmovups %xmm1, (%rdi) ; XOP-NEXT: retq %r = shufflevector <2 x double> %a, <2 x double> %b, <3 x i32> store <3 x double> %r, <3 x double>* %p @@ -209,7 +209,7 @@ ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2] ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi) -; AVX-NEXT: vmovaps %xmm1, (%rdi) +; AVX-NEXT: vmovups %xmm1, (%rdi) ; AVX-NEXT: retq ; ; XOP-LABEL: v5f32: @@ -217,7 +217,7 @@ ; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2] ; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; XOP-NEXT: vextractps $3, %xmm0, 16(%rdi) -; XOP-NEXT: vmovaps %xmm1, (%rdi) +; XOP-NEXT: vmovups %xmm1, (%rdi) ; XOP-NEXT: retq %r = shufflevector <4 x float> %a, <4 x float> %b, <5 x i32> store <5 x float> %r, <5 x float>* %p @@ -354,7 +354,7 @@ ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovss %xmm1, 24(%rdi) ; AVX-NEXT: vmovlps %xmm0, 16(%rdi) -; AVX-NEXT: vmovaps %xmm2, (%rdi) +; AVX-NEXT: vmovups %xmm2, (%rdi) ; AVX-NEXT: retq ; ; XOP-LABEL: v7i32: @@ -365,7 +365,7 @@ ; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; XOP-NEXT: vmovss %xmm1, 24(%rdi) ; XOP-NEXT: vmovlps %xmm0, 16(%rdi) -; XOP-NEXT: vmovaps %xmm2, (%rdi) +; XOP-NEXT: vmovups %xmm2, (%rdi) ; XOP-NEXT: retq %r = shufflevector <4 x i32> %a, <4 x i32> %b, <7 x i32> store <7 x i32> %r, <7 x i32>* %p @@ -562,14 +562,14 @@ ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: vmovups %xmm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: v12i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-SLOW-NEXT: vmovups {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -577,23 +577,23 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi) -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi) +; AVX2-SLOW-NEXT: vmovups %xmm0, 32(%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm2, (%rdi) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: v12i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, 
%ymm0 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi) +; AVX2-FAST-NEXT: vmovups %xmm0, 32(%rdi) +; AVX2-FAST-NEXT: vmovups %ymm2, (%rdi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -608,8 +608,8 @@ ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] ; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; XOP-NEXT: vmovaps %xmm0, 32(%rdi) -; XOP-NEXT: vmovaps %ymm2, (%rdi) +; XOP-NEXT: vmovups %xmm0, 32(%rdi) +; XOP-NEXT: vmovups %ymm2, (%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %r = shufflevector <8 x i32> %a, <8 x i32> %b, <12 x i32> @@ -1329,18 +1329,18 @@ ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vmovups {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vmovups {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vmovups {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -1359,20 +1359,20 @@ ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi) @@ -1586,7 +1586,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] @@ -2077,8 +2077,8 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: vmovups %ymm1, 32(%rdi) +; AVX1-NEXT: vmovups %ymm1, (%rdi) ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2086,8 +2086,8 @@ ; AVX2-LABEL: wrongorder: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, 32(%rdi) -; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2096,8 +2096,8 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; XOP-NEXT: vmovaps %ymm1, 32(%rdi) -; XOP-NEXT: vmovaps %ymm1, (%rdi) +; XOP-NEXT: vmovups %ymm1, 32(%rdi) +; XOP-NEXT: vmovups %ymm1, (%rdi) ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -77,20 +77,20 @@ ; ; AVX-LABEL: PR40815: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX-NEXT: vmovaps %xmm3, (%rsi) -; AVX-NEXT: vmovaps %xmm0, 48(%rsi) -; AVX-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups 16(%rdi), %xmm1 +; AVX-NEXT: vmovups 32(%rdi), %xmm2 +; AVX-NEXT: vmovups 48(%rdi), %xmm3 +; AVX-NEXT: vmovups %xmm2, 16(%rsi) +; AVX-NEXT: vmovups %xmm3, (%rsi) +; AVX-NEXT: vmovups %xmm0, 48(%rsi) +; AVX-NEXT: vmovups %xmm1, 32(%rsi) ; AVX-NEXT: retq ; ; AVX512-LABEL: PR40815: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX512-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX512-NEXT: vmovups 16(%rdi), %xmm0 +; AVX512-NEXT: vmovups 48(%rdi), %xmm1 ; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 ; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 @@ -162,7 +162,7 @@ ; SSE2-NEXT: movdqa c+{{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa c+{{.*}}(%rip), %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: addl b(%rip), %eax +; SSE2-NEXT: addl {{.*}}(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 @@ -198,7 +198,7 @@ ; SSE42-NEXT: movdqa c+{{.*}}(%rip), %xmm0 ; SSE42-NEXT: movdqa c+{{.*}}(%rip), %xmm1 ; SSE42-NEXT: movd %xmm1, %eax -; SSE42-NEXT: addl b(%rip), %eax +; SSE42-NEXT: addl {{.*}}(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 ; SSE42-NEXT: paddd %xmm1, %xmm2 ; SSE42-NEXT: 
movdqa d+{{.*}}(%rip), %xmm3 @@ -232,7 +232,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa c+{{.*}}(%rip), %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: addl b(%rip), %eax +; AVX1-NEXT: addl {{.*}}(%rip), %eax ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 @@ -265,7 +265,7 @@ ; ; AVX2-LABEL: PR42833: ; AVX2: # %bb.0: -; AVX2-NEXT: movl b(%rip), %eax +; AVX2-NEXT: movl {{.*}}(%rip), %eax ; AVX2-NEXT: vmovdqu c+{{.*}}(%rip), %ymm0 ; AVX2-NEXT: addl c+{{.*}}(%rip), %eax ; AVX2-NEXT: vmovd %eax, %xmm1 @@ -288,7 +288,7 @@ ; ; AVX512-LABEL: PR42833: ; AVX512: # %bb.0: -; AVX512-NEXT: movl b(%rip), %eax +; AVX512-NEXT: movl {{.*}}(%rip), %eax ; AVX512-NEXT: vmovdqu c+{{.*}}(%rip), %ymm0 ; AVX512-NEXT: vmovdqu64 c+{{.*}}(%rip), %zmm1 ; AVX512-NEXT: addl c+{{.*}}(%rip), %eax @@ -314,7 +314,7 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa c+{{.*}}(%rip), %xmm0 ; XOP-NEXT: vmovd %xmm0, %eax -; XOP-NEXT: addl b(%rip), %eax +; XOP-NEXT: addl {{.*}}(%rip), %eax ; XOP-NEXT: vmovd %eax, %xmm1 ; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -186,6 +186,7 @@ ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup +; CHECK-NEXT: Replace movaps instruction to movups ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -613,7 +613,7 @@ ; ; AVX1-LABEL: test17: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 @@ -1422,7 +1422,7 @@ ; ; AVX1-LABEL: test35: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -308,8 +308,8 @@ ; AVX-LABEL: mul_v4i32spill: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: callq foo ; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -347,8 +347,8 @@ ; AVX-LABEL: mul_v2i64spill: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: callq foo ; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX-NEXT: vpsrlq $32, %xmm3, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll --- a/llvm/test/CodeGen/X86/pr11334.ll +++ b/llvm/test/CodeGen/X86/pr11334.ll @@ -95,7 +95,7 @@ ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vmovaps %ymm0, (%rax) +; AVX-NEXT: vmovups %ymm0, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2 diff --git a/llvm/test/CodeGen/X86/pr22774.ll b/llvm/test/CodeGen/X86/pr22774.ll --- a/llvm/test/CodeGen/X86/pr22774.ll +++ b/llvm/test/CodeGen/X86/pr22774.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: _Z3foov: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; CHECK-NEXT: vmovups %xmm0, {{.*}}(%rip) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -19,9 +19,9 @@ ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10 ; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7 -; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29] +; CHECK-NEXT: vmovups {{.*#+}} xmm8 = [4,28,1,29] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8 -; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7] +; CHECK-NEXT: vmovups {{.*#+}} xmm4 = [4,21,1,7] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6 @@ -29,7 +29,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1] -; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4 ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3] @@ -46,18 +46,18 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2] ; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2 ; CHECK-NEXT: vmovaps %xmm13, %xmm1 -; CHECK-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm10, %xmm13, %xmm10 ; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm3 ; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0 -; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %xmm10, (%rsp) +; CHECK-NEXT: vmovups %xmm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm10, (%rsp) ; CHECK-NEXT: vmovaps %xmm9, %xmm3 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: 
vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/pr30290.ll b/llvm/test/CodeGen/X86/pr30290.ll --- a/llvm/test/CodeGen/X86/pr30290.ll +++ b/llvm/test/CodeGen/X86/pr30290.ll @@ -20,12 +20,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovups %xmm0, (%rsp) ; CHECK-NEXT: callq bar ; CHECK-NEXT: addq $40, %rsp diff --git a/llvm/test/CodeGen/X86/pr30430.ll b/llvm/test/CodeGen/X86/pr30430.ll --- a/llvm/test/CodeGen/X86/pr30430.ll +++ b/llvm/test/CodeGen/X86/pr30430.ll @@ -96,8 +96,8 @@ ; CHECK-NEXT: # implicit-def: $zmm2 ; CHECK-NEXT: vmovaps %ymm1, %ymm2 ; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 +; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr31956.ll b/llvm/test/CodeGen/X86/pr31956.ll --- a/llvm/test/CodeGen/X86/pr31956.ll +++ b/llvm/test/CodeGen/X86/pr31956.ll @@ -9,7 +9,7 @@ define <4 x float> @foo() { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: vmovups {{.*}}(%rip), %xmm0 ; CHECK-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] diff --git a/llvm/test/CodeGen/X86/pr32368.ll b/llvm/test/CodeGen/X86/pr32368.ll --- a/llvm/test/CodeGen/X86/pr32368.ll +++ b/llvm/test/CodeGen/X86/pr32368.ll @@ -21,19 +21,19 @@ ; ; AVX2-LABEL: PR32368_128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967004,4294967004,4294967004,4294967004] ; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [291,291,291,291] ; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR32368_128: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967004,4294967004,4294967004,4294967004] ; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vaddps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [291,291,291,291] ; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %2 = bitcast <4 x float> %0 to <4 x i32> @@ -68,19 +68,19 @@ ; ; AVX2-LABEL: PR32368_256: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] ; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [291,291,291,291,291,291,291,291] ; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR32368_256: ; AVX512: # 
%bb.0: -; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [291,291,291,291,291,291,291,291] ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %2 = bitcast <8 x float> %0 to <8 x i32> @@ -114,24 +114,24 @@ ; ; AVX1-LABEL: PR32368_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR32368_512: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] ; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] ; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -11,17 +11,17 @@ ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $160, %rsp -; CHECK-NEXT: vmovaps 240(%rbp), %ymm8 -; CHECK-NEXT: vmovaps 208(%rbp), %ymm9 -; CHECK-NEXT: vmovaps 176(%rbp), %ymm10 -; CHECK-NEXT: vmovaps 144(%rbp), %ymm11 -; CHECK-NEXT: vmovaps 112(%rbp), %ymm12 -; CHECK-NEXT: vmovaps 80(%rbp), %ymm13 -; CHECK-NEXT: vmovaps 48(%rbp), %ymm14 -; CHECK-NEXT: vmovaps 16(%rbp), %ymm15 +; CHECK-NEXT: vmovups 240(%rbp), %ymm8 +; CHECK-NEXT: vmovups 208(%rbp), %ymm9 +; CHECK-NEXT: vmovups 176(%rbp), %ymm10 +; CHECK-NEXT: vmovups 144(%rbp), %ymm11 +; CHECK-NEXT: vmovups 112(%rbp), %ymm12 +; CHECK-NEXT: vmovups 80(%rbp), %ymm13 +; CHECK-NEXT: vmovups 48(%rbp), %ymm14 +; CHECK-NEXT: vmovups 16(%rbp), %ymm15 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vmovaps %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # implicit-def: $ymm0 ; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 ; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] @@ -31,7 +31,7 @@ ; CHECK-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] ; CHECK-NEXT: # implicit-def: $ymm9 ; CHECK-NEXT: vmovaps %xmm2, %xmm9 -; 
CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,3] ; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] @@ -44,9 +44,9 @@ ; CHECK-NEXT: # implicit-def: $ymm8 ; CHECK-NEXT: vmovaps %xmm7, %xmm8 ; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[0,1],ymm6[0,1] -; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm5, %ymm1 -; CHECK-NEXT: vmovaps %ymm3, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm9, %ymm3 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/pr34657.ll b/llvm/test/CodeGen/X86/pr34657.ll --- a/llvm/test/CodeGen/X86/pr34657.ll +++ b/llvm/test/CodeGen/X86/pr34657.ll @@ -8,9 +8,9 @@ ; CHECK-NEXT: vmovups (%rsi), %zmm0 ; CHECK-NEXT: vmovups 64(%rsi), %ymm1 ; CHECK-NEXT: vmovups 96(%rsi), %xmm2 -; CHECK-NEXT: vmovaps %xmm2, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm2, 96(%rdi) +; CHECK-NEXT: vmovups %ymm1, 64(%rdi) +; CHECK-NEXT: vmovups %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll --- a/llvm/test/CodeGen/X86/pr38639.ll +++ b/llvm/test/CodeGen/X86/pr38639.ll @@ -4,11 +4,11 @@ define <8 x double> @test(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = ; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] +; CHECK-NEXT: vmovups {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> , <8 x i32> diff --git a/llvm/test/CodeGen/X86/pr38738.ll b/llvm/test/CodeGen/X86/pr38738.ll --- a/llvm/test/CodeGen/X86/pr38738.ll +++ b/llvm/test/CodeGen/X86/pr38738.ll @@ -55,7 +55,7 @@ ; X64AVX-LABEL: pr38738: ; X64AVX: # %bb.0: # %entry ; X64AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; X64AVX-NEXT: movl $0, -{{[0-9]+}}(%rsp) ; X64AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; X64AVX-NEXT: retq @@ -65,7 +65,7 @@ ; X86AVX-NEXT: subl $44, %esp ; X86AVX-NEXT: .cfi_def_cfa_offset 48 ; X86AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) ; X86AVX-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86AVX-NEXT: addl $44, %esp diff --git a/llvm/test/CodeGen/X86/pr40811.ll b/llvm/test/CodeGen/X86/pr40811.ll --- a/llvm/test/CodeGen/X86/pr40811.ll +++ b/llvm/test/CodeGen/X86/pr40811.ll @@ -4,7 +4,7 @@ define <8 x i32> @_Z6test70v(<4 x i32>* %id14793) { ; CHECK-LABEL: _Z6test70v: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), 
%xmm0 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3] ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,1,0] ; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,0] diff --git a/llvm/test/CodeGen/X86/pr43866.ll b/llvm/test/CodeGen/X86/pr43866.ll --- a/llvm/test/CodeGen/X86/pr43866.ll +++ b/llvm/test/CodeGen/X86/pr43866.ll @@ -21,7 +21,7 @@ ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4] -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr44140.ll b/llvm/test/CodeGen/X86/pr44140.ll --- a/llvm/test/CodeGen/X86/pr44140.ll +++ b/llvm/test/CodeGen/X86/pr44140.ll @@ -15,7 +15,7 @@ ; CHECK: # %bb.0: # %start ; CHECK-NEXT: subq $584, %rsp # imm = 0x248 ; CHECK-NEXT: .cfi_def_cfa_offset 592 -; CHECK-NEXT: vmovaps {{.*#+}} xmm6 = [1010101010101010101,2020202020202020202] +; CHECK-NEXT: vmovups {{.*#+}} xmm6 = [1010101010101010101,2020202020202020202] ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_1: # %fake-loop @@ -42,7 +42,7 @@ ; CHECK-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq opaque -; CHECK-NEXT: vmovaps %xmm6, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm6, {{[0-9]+}}(%rsp) ; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -46,7 +46,7 @@ ; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; CHECK-NEXT: vmovss %xmm1, 32(%rax) -; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vmovups %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <9 x float> @llvm.masked.load.v9f32.p0v9f32(<9 x float>* %addr, i32 4, <9 x i1>%mask, <9 x float> %dst) @@ -105,11 +105,11 @@ ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm5 ; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm5, %ymm5 ; CHECK-NEXT: vblendvps %xmm4, %xmm5, %xmm1, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, 32(%rax) +; CHECK-NEXT: vmovups %xmm1, 32(%rax) ; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm1 ; CHECK-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 48(%rax) -; CHECK-NEXT: vmovaps %ymm2, (%rax) +; CHECK-NEXT: vmovups %ymm2, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <13 x float> @llvm.masked.load.v13f32.p0v13f32(<13 x float>* %addr, i32 4, <13 x i1>%mask, <13 x float> %dst) @@ -174,8 +174,8 @@ ; CHECK-NEXT: vblendvps %xmm3, %xmm6, %xmm1, %xmm1 ; CHECK-NEXT: vmovlps %xmm1, 48(%rax) ; CHECK-NEXT: vblendvps %xmm4, %xmm5, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 32(%rax) -; CHECK-NEXT: vmovaps %ymm2, (%rax) +; CHECK-NEXT: vmovups %xmm0, 32(%rax) +; CHECK-NEXT: vmovups %ymm2, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <14 x float> @llvm.masked.load.v14f32.p0v14f32(<14 x float>* %addr, i32 4, <14 x i1>%mask, <14 x float> %dst) @@ -241,8 +241,8 @@ ; CHECK-NEXT: vmaskmovps 64(%r10), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %xmm3, %xmm4, %xmm0, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 64(%rax) -; CHECK-NEXT: vmovaps %ymm1, 32(%rax) -; CHECK-NEXT: vmovaps %ymm2, (%rax) +; CHECK-NEXT: vmovups %ymm1, 32(%rax) +; CHECK-NEXT: vmovups %ymm2, (%rax) ; CHECK-NEXT: vzeroupper 
; CHECK-NEXT: retq %res = call <17 x float> @llvm.masked.load.v17f32.p0v17f32(<17 x float>* %addr, i32 4, <17 x i1>%mask, <17 x float> %dst) @@ -321,14 +321,14 @@ ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 ; CHECK-NEXT: vmaskmovps 64(%r10), %ymm6, %ymm6 -; CHECK-NEXT: vmovaps %ymm2, 32(%rax) +; CHECK-NEXT: vmovups %ymm2, 32(%rax) ; CHECK-NEXT: vextractf128 $1, %ymm6, %xmm2 ; CHECK-NEXT: vblendvps %xmm4, %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vextractps $2, %xmm1, 88(%rax) ; CHECK-NEXT: vmovlps %xmm1, 80(%rax) ; CHECK-NEXT: vblendvps %xmm5, %xmm6, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 64(%rax) -; CHECK-NEXT: vmovaps %ymm3, (%rax) +; CHECK-NEXT: vmovups %xmm0, 64(%rax) +; CHECK-NEXT: vmovups %ymm3, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <23 x float> @llvm.masked.load.v23f32.p0v23f32(<23 x float>* %addr, i32 4, <23 x i1>%mask, <23 x float> %dst) diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll --- a/llvm/test/CodeGen/X86/pr46532.ll +++ b/llvm/test/CodeGen/X86/pr46532.ll @@ -6,8 +6,8 @@ ; CHECK: # %bb.0: # %while.1.body.preheader ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 32(%rax) -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,0,0,0,0,0,0] +; CHECK-NEXT: vmovups %xmm0, 32(%rax) +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [4294967295,4294967295,0,0,0,0,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) while.1.body.preheader: %0 = load i8*, i8** undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2 diff --git a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll --- a/llvm/test/CodeGen/X86/pr46820.ll +++ b/llvm/test/CodeGen/X86/pr46820.ll @@ -13,11 +13,11 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vmovups 64(%rsi), %ymm0 ; CHECK-NEXT: vmovups (%rsi), %zmm1 -; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 +; CHECK-NEXT: vmovups 64(%rsi), %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss %xmm3, 88(%rdi) -; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vmovups %xmm2, 64(%rdi) +; CHECK-NEXT: vmovups %zmm1, (%rdi) ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) ; CHECK-NEXT: vzeroupper @@ -38,8 +38,8 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss %xmm2, 88(%rdi) ; CHECK-NEXT: movq %rcx, 80(%rdi) -; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm1, 64(%rdi) +; CHECK-NEXT: vmovups %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = load <23 x float>, <23 x float>* %p, align 1 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -757,7 +757,7 @@ ; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -335,31 +335,31 @@ ; ; AVX-RECIP-LABEL: v4f32_no_estimate: ; AVX-RECIP: # %bb.0: 
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_no_estimate: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v4f32_no_estimate: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v4f32_no_estimate: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_no_estimate: ; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: retq ; @@ -400,7 +400,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -422,7 +422,7 @@ ; ; BTVER2-LABEL: v4f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -434,7 +434,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -587,7 +587,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -600,7 +600,7 @@ ; FMA-RECIP-LABEL: v4f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 @@ -611,7 +611,7 @@ ; BDVER2-LABEL: v4f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} 
xmm1 = -(xmm1 * xmm3) + xmm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 @@ -620,7 +620,7 @@ ; ; BTVER2-LABEL: v4f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 @@ -636,7 +636,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -698,31 +698,31 @@ ; ; AVX-RECIP-LABEL: v8f32_no_estimate: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_no_estimate: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v8f32_no_estimate: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v8f32_no_estimate: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_no_estimate: ; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; @@ -770,7 +770,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -792,7 +792,7 @@ ; ; BTVER2-LABEL: v8f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -804,7 +804,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm2 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -883,7 +883,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -896,7 +896,7 @@ ; FMA-RECIP-LABEL: v8f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1 @@ -907,7 +907,7 @@ ; BDVER2-LABEL: v8f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 @@ -916,7 +916,7 @@ ; ; BTVER2-LABEL: v8f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 @@ -932,7 +932,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -1000,35 +1000,35 @@ ; ; AVX-RECIP-LABEL: v16f32_no_estimate: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_no_estimate: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_no_estimate: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; BDVER2-NEXT: vdivps %ymm1, %ymm2, 
%ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_no_estimate: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_no_estimate: ; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: retq @@ -1093,7 +1093,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1107,7 +1107,7 @@ ; FMA-RECIP-LABEL: v16f32_one_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 @@ -1118,7 +1118,7 @@ ; BDVER2-LABEL: v16f32_one_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vrcpps %ymm1, %ymm4 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3 @@ -1128,7 +1128,7 @@ ; ; BTVER2-LABEL: v16f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vrcpps %ymm1, %ymm4 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -1145,7 +1145,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1253,7 +1253,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 @@ -1275,7 +1275,7 @@ ; FMA-RECIP-LABEL: v16f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; 
FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 @@ -1292,7 +1292,7 @@ ; BDVER2-LABEL: v16f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 @@ -1306,7 +1306,7 @@ ; ; BTVER2-LABEL: v16f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 @@ -1331,7 +1331,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -378,7 +378,7 @@ ; AVX-RECIP-LABEL: v4f32_one_step2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -389,7 +389,7 @@ ; FMA-RECIP-LABEL: v4f32_one_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; FMA-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 @@ -398,7 +398,7 @@ ; BDVER2-LABEL: v4f32_one_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 @@ -406,7 +406,7 @@ ; ; BTVER2-LABEL: v4f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0 @@ -418,7 +418,7 @@ ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps 
%xmm0, %xmm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -429,7 +429,7 @@ ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; HASWELL-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 @@ -438,7 +438,7 @@ ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -449,7 +449,7 @@ ; AVX512-LABEL: v4f32_one_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %xmm0, %xmm1 -; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3 @@ -476,7 +476,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -504,7 +504,7 @@ ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -518,7 +518,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -595,11 +595,11 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -610,10 +610,10 @@ ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; FMA-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 @@ -623,7 +623,7 @@ ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm2 = (xmm0 * xmm1) - mem -; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1 ; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4 @@ -632,9 +632,9 @@ ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 @@ -650,11 +650,11 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -668,7 +668,7 @@ ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 ; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 -; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; HASWELL-NEXT: vmulps %xmm1, %xmm2, %xmm3 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 @@ -682,7 +682,7 @@ ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 ; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -696,7 +696,7 @@ ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX512-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 ; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; 
AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm3 ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 @@ -731,7 +731,7 @@ ; AVX-RECIP-LABEL: v8f32_one_step2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -742,7 +742,7 @@ ; FMA-RECIP-LABEL: v8f32_one_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; FMA-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 @@ -751,7 +751,7 @@ ; BDVER2-LABEL: v8f32_one_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 @@ -759,7 +759,7 @@ ; ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0 @@ -771,7 +771,7 @@ ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -782,7 +782,7 @@ ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 @@ -791,7 +791,7 @@ ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -802,7 +802,7 @@ ; AVX512-LABEL: v8f32_one_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %ymm0, %ymm1 -; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3 @@ -838,7 +838,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -866,7 +866,7 @@ ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -880,7 +880,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -972,11 +972,11 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -987,10 +987,10 @@ ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 @@ -1000,7 +1000,7 @@ ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm0 * ymm1) - mem -; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm4 = 
[1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm1 ; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm4 @@ -1009,9 +1009,9 @@ ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 @@ -1027,11 +1027,11 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -1045,7 +1045,7 @@ ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 ; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 -; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm3 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 @@ -1059,7 +1059,7 @@ ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -1073,7 +1073,7 @@ ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX512-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 ; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 -; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; AVX512-NEXT: vmulps %ymm1, %ymm2, %ymm3 ; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 @@ -1159,14 +1159,14 @@ ; AVX-RECIP-LABEL: v16f32_one_step2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = 
[1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; AVX-RECIP-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm4, %ymm0 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 @@ -1177,12 +1177,12 @@ ; FMA-RECIP-LABEL: v16f32_one_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4 @@ -1191,11 +1191,11 @@ ; BDVER2-LABEL: v16f32_one_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BDVER2-NEXT: vrcpps %ymm1, %ymm5 ; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm3 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BDVER2-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4 ; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3 @@ -1204,12 +1204,12 @@ ; ; BTVER2-LABEL: v16f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BTVER2-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 @@ -1223,14 +1223,14 @@ ; SANDY-LABEL: v16f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; 
SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmovups {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; SANDY-NEXT: vsubps %ymm1, %ymm2, %ymm1 @@ -1241,12 +1241,12 @@ ; HASWELL-LABEL: v16f32_one_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 -; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4 @@ -1255,14 +1255,14 @@ ; HASWELL-NO-FMA-LABEL: v16f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm4, %ymm0 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm4 ; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm2, %ymm1 @@ -1273,7 +1273,7 @@ ; AVX512-LABEL: v16f32_one_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 -; AVX512-NEXT: vmovaps {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX512-NEXT: vmovups {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm3 ; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm2 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm3 @@ -1327,7 +1327,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1345,7 +1345,7 @@ ; FMA-RECIP-LABEL: 
v16f32_one_step_2_divs: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 @@ -1360,7 +1360,7 @@ ; BDVER2-LABEL: v16f32_one_step_2_divs: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 @@ -1374,7 +1374,7 @@ ; ; BTVER2-LABEL: v16f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 @@ -1395,7 +1395,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm4 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 @@ -1526,11 +1526,11 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm5 ; AVX-RECIP-NEXT: vmulps %ymm5, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 @@ -1541,7 +1541,7 @@ ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 @@ -1552,18 +1552,18 @@ ; FMA-RECIP-LABEL: v16f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 -; 
FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; FMA-RECIP-NEXT: vmulps %ymm2, %ymm4, %ymm5 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; FMA-RECIP-NEXT: vmovups {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; FMA-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4 @@ -1572,15 +1572,15 @@ ; BDVER2-LABEL: v16f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm4 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm5 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BDVER2-NEXT: vmovups {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm3) + ymm2 ; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 @@ -1590,13 +1590,13 @@ ; ; BTVER2-LABEL: v16f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 ; BTVER2-NEXT: vmulps %ymm5, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 @@ -1604,7 +1604,7 @@ ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0 -; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BTVER2-NEXT: vmovups {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 @@ -1619,11 +1619,11 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5 ; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 @@ -1634,7 +1634,7 @@ ; SANDY-NEXT: vsubps %ymm2, %ymm4, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 @@ -1649,14 +1649,14 @@ ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 -; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NEXT: vmulps %ymm2, %ymm4, %ymm5 ; HASWELL-NEXT: vrcpps %ymm1, %ymm6 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5 ; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm6) - ymm3 ; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm6) + ymm6 -; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NEXT: vmovups {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; HASWELL-NEXT: vmulps %ymm2, %ymm3, %ymm4 ; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4 @@ -1670,7 +1670,7 @@ ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm5 ; HASWELL-NO-FMA-NEXT: vmulps %ymm5, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 @@ -1681,7 +1681,7 @@ ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm4, %ymm2 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm2 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm3, %ymm2 -; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NO-FMA-NEXT: vmovups {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 @@ -1695,7 +1695,7 @@ ; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX512-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2 ; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm2 = -(zmm2 * zmm1) + zmm1 -; 
AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX512-NEXT: vmovups {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; AVX512-NEXT: vmulps %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm1 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm2 * zmm0) + zmm3 diff --git a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll --- a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll +++ b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll @@ -13,7 +13,7 @@ ; ; AVX2-LABEL: trunc_shl_7_v4i32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %xmm0 +; AVX2-NEXT: vmovups (%rsi), %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX2-NEXT: vpslld $7, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -697,7 +697,7 @@ ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm2, {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -798,7 +798,7 @@ ; AVX1-LABEL: v4i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm2, {{.*}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -944,7 +944,7 @@ ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5 -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %ymm5, {{.*}}(%rip), %ymm6, %ymm6 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 @@ -1170,8 +1170,8 @@ ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} ymm8 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX1-NEXT: vmovups {{.*#+}} ymm9 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %ymm7, %ymm8, %ymm9, %ymm10 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sandybridge-loads.ll b/llvm/test/CodeGen/X86/sandybridge-loads.ll --- a/llvm/test/CodeGen/X86/sandybridge-loads.ll +++ b/llvm/test/CodeGen/X86/sandybridge-loads.ll @@ -4,15 +4,15 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, 
<8 x float>* %c) nounwind { ; CHECK-LABEL: wideloads: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovups (%rsi), %ymm1 ; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovaps (%rdx), %ymm2 +; CHECK-NEXT: vmovups (%rdx), %ymm2 ; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vmovups %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 ; <---- unaligned! @@ -29,11 +29,11 @@ define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind { ; CHECK-LABEL: widestores: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %ymm1 -; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rsi), %ymm1 +; CHECK-NEXT: vmovups %ymm0, (%rsi) ; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi) -; CHECK-NEXT: vmovaps %xmm1, (%rdi) +; CHECK-NEXT: vmovups %xmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 32 @@ -46,12 +46,12 @@ define void @widestores_unaligned_load(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind { ; CHECK-LABEL: widestores_unaligned_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %xmm1 -; CHECK-NEXT: vmovaps 16(%rsi), %xmm2 -; CHECK-NEXT: vmovaps %ymm0, (%rsi) -; CHECK-NEXT: vmovaps %xmm2, 16(%rdi) -; CHECK-NEXT: vmovaps %xmm1, (%rdi) +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rsi), %xmm1 +; CHECK-NEXT: vmovups 16(%rsi), %xmm2 +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vmovups %xmm2, 16(%rdi) +; CHECK-NEXT: vmovups %xmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 32 ; <--- aligned diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll @@ -740,7 +740,7 @@ ; AVX512_32_LIN-LABEL: t_to_u32: ; AVX512_32_LIN: # %bb.0: ; AVX512_32_LIN-NEXT: subl $28, %esp -; AVX512_32_LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; AVX512_32_LIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; AVX512_32_LIN-NEXT: vmovups %xmm0, (%esp) ; AVX512_32_LIN-NEXT: calll __fixunstfsi ; AVX512_32_LIN-NEXT: addl $28, %esp @@ -888,7 +888,7 @@ ; AVX512_32_LIN-LABEL: t_to_s32: ; AVX512_32_LIN: # %bb.0: ; AVX512_32_LIN-NEXT: subl $28, %esp -; AVX512_32_LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; AVX512_32_LIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; AVX512_32_LIN-NEXT: vmovups %xmm0, (%esp) ; AVX512_32_LIN-NEXT: calll __fixtfsi ; AVX512_32_LIN-NEXT: addl $28, %esp diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -1531,7 +1531,7 @@ ; AVX512_32_LIN-LABEL: t_to_u64: ; AVX512_32_LIN: # %bb.0: ; AVX512_32_LIN-NEXT: subl $28, %esp -; AVX512_32_LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; AVX512_32_LIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; AVX512_32_LIN-NEXT: vmovups %xmm0, (%esp) ; AVX512_32_LIN-NEXT: calll __fixunstfdi ; AVX512_32_LIN-NEXT: addl $28, %esp @@ -1658,7 +1658,7 @@ ; AVX512_32_LIN-LABEL: t_to_s64: ; AVX512_32_LIN: # %bb.0: ; 
AVX512_32_LIN-NEXT: subl $28, %esp -; AVX512_32_LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; AVX512_32_LIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; AVX512_32_LIN-NEXT: vmovups %xmm0, (%esp) ; AVX512_32_LIN-NEXT: calll __fixtfdi ; AVX512_32_LIN-NEXT: addl $28, %esp diff --git a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll --- a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -19,7 +19,7 @@ ; AVX2-LABEL: foo4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> %res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> @@ -32,14 +32,14 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rdi) ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: foo8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-FAST-NEXT: vmovups %ymm0, (%rdi) ; AVX2-FAST-NEXT: retq %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> @@ -82,7 +82,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3] ; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: vmovups %xmm0, (%rdi) ; AVX2-NEXT: vmovaps %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> @@ -95,7 +95,7 @@ ; AVX2-LABEL: undef_splatmask5: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: vmovups %xmm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -95,16 +95,16 @@ define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX-LABEL: shuffle_v8i32_to_v4i32_1: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] -; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vmovups %xmm0, (%rsi) ; AVX-NEXT: retq ; ; AVX512-LABEL: shuffle_v8i32_to_v4i32_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovups (%rdi), %xmm0 ; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] -; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: vmovups %xmm0, (%rsi) ; AVX512-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %L %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -111,10 +111,10 @@ define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind { ; AVX512F-LABEL: 
shuffle_v16i32_to_v8i32_1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vmovups (%rdi), %ymm0 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7] ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-NEXT: vmovups %ymm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -70,7 +70,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX1-LABEL: trunc_v16i16_to_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovups (%rdi), %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -249,16 +249,16 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX-LABEL: shuffle_v8i32_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vmovups %xmm0, (%rsi) ; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v8i32_to_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512F-NEXT: vmovaps %xmm0, (%rsi) +; AVX512F-NEXT: vmovups %xmm0, (%rsi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: @@ -270,9 +270,9 @@ ; ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BW-NEXT: vmovups (%rdi), %xmm0 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovups %xmm0, (%rsi) ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: @@ -297,23 +297,23 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovups (%rdi), %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX1-NEXT: vmovaps %xmm0, (%rsi) +; AVX1-NEXT: vmovups %xmm0, (%rsi) ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovups (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vmovups %xmm0, (%rsi) ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> ; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-FAST-NEXT: vmovups %xmm0, (%rsi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/splat-const.ll b/llvm/test/CodeGen/X86/splat-const.ll --- a/llvm/test/CodeGen/X86/splat-const.ll +++ b/llvm/test/CodeGen/X86/splat-const.ll @@ -29,7 +29,7 @@ ; SSE: movaps {{.*}}, %xmm0 # xmm0 = [42,42,42,42] ; SSE-NEXT: retq ; AVX-LABEL: const_vector: -; AVX: vmovaps {{.*}}, %xmm0 # xmm0 = [42,42,42,42] +; AVX: vmovups {{.*}}, %xmm0 # xmm0 = [42,42,42,42] ; AVX-NEXT: retq ; AVX2-LABEL: const_vector: ; AVX2: vbroadcastss {{[^%].*}}, 
%xmm0 diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -55,7 +55,7 @@ ; SNB-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 ; SNB-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 -; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vmovups {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 ; SNB-NEXT: retq @@ -125,7 +125,7 @@ ; SNB-NEXT: vaddps {{.*}}(%rip), %ymm1, %ymm1 ; SNB-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 -; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vmovups {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 ; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 ; SNB-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -307,7 +307,7 @@ ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX1-NEXT: vmovups {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -395,7 +395,7 @@ ; AVX1-LABEL: v4f32_no_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtps %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX1-NEXT: vmovups {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX1-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -460,7 +460,7 @@ ; AVX1-LABEL: v8f32_no_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtps %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX1-NEXT: vmovups {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -540,7 +540,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtps %ymm1, %ymm1 ; AVX1-NEXT: vsqrtps %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq @@ -591,11 +591,11 @@ ; AVX1-LABEL: v16f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %ymm0, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; AVX1-NEXT: vmovups {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; 
AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0 ; AVX1-NEXT: vrsqrtps %ymm1, %ymm4 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -577,7 +577,7 @@ ; ; CHECK-AVX1-LABEL: test_srem_one_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_one_eq: @@ -733,7 +733,7 @@ ; ; CHECK-AVX1-LABEL: test_srem_allones: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_allones: diff --git a/llvm/test/CodeGen/X86/sse-fsignum.ll b/llvm/test/CodeGen/X86/sse-fsignum.ll --- a/llvm/test/CodeGen/X86/sse-fsignum.ll +++ b/llvm/test/CodeGen/X86/sse-fsignum.ll @@ -12,14 +12,14 @@ define void @signum32a(<4 x float>*) { ; AVX-LABEL: signum32a: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vcvtdq2ps %xmm2, %xmm2 ; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rdi) +; AVX-NEXT: vmovups %xmm0, (%rdi) ; AVX-NEXT: retq entry: %1 = load <4 x float>, <4 x float>* %0 @@ -64,14 +64,14 @@ define void @signum32b(<8 x float>*) { ; AVX-LABEL: signum32b: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vmovups (%rdi), %ymm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vmovups %ymm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq entry: @@ -152,14 +152,14 @@ define void @signum32c(<8 x float>*) { ; AVX-LABEL: signum32c: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vmovups (%rdi), %ymm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vmovups %ymm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq entry: @@ -188,7 +188,7 @@ ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -202,7 +202,7 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vmovups %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -216,7 +216,7 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512F-NEXT: vmovaps %ymm0, (%rdi) +; AVX512F-NEXT: vmovups %ymm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll 
b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1167,13 +1167,13 @@ ; X86-AVX1-LABEL: test_mm_load_ps: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX1-NEXT: vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_load_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX512-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load_ps: @@ -1183,12 +1183,12 @@ ; ; X64-AVX1-LABEL: test_mm_load_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_load_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %res = load <4 x float>, <4 x float>* %arg0, align 16 @@ -2583,13 +2583,13 @@ ; X86-AVX1-LABEL: test_mm_store_ps: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_store_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_ps: @@ -2599,12 +2599,12 @@ ; ; X64-AVX1-LABEL: test_mm_store_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_store_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 16 @@ -2625,14 +2625,14 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_store_ps1: ; X86-AVX512: # %bb.0: ; 
X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_ps1: @@ -2646,13 +2646,13 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_store_ps1: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer @@ -2712,14 +2712,14 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_store1_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store1_ps: @@ -2733,13 +2733,13 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_store1_ps: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer @@ -2974,7 +2974,7 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] ; X86-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: 
vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_storer_ps: @@ -2982,7 +2982,7 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] ; X86-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storer_ps: @@ -2996,14 +2996,14 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] ; X64-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0] -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_storer_ps: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] ; X64-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> @@ -3174,10 +3174,10 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-AVX1-NEXT: vmovaps (%esi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x06] -; X86-AVX1-NEXT: vmovaps (%edx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a] -; X86-AVX1-NEXT: vmovaps (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x11] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm3 # encoding: [0xc5,0xf8,0x28,0x18] +; X86-AVX1-NEXT: vmovups (%esi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x06] +; X86-AVX1-NEXT: vmovups (%edx), %xmm1 # encoding: [0xc5,0xf8,0x10,0x0a] +; X86-AVX1-NEXT: vmovups (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x10,0x11] +; X86-AVX1-NEXT: vmovups (%eax), %xmm3 # encoding: [0xc5,0xf8,0x10,0x18] ; X86-AVX1-NEXT: vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1] ; X86-AVX1-NEXT: # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-AVX1-NEXT: vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb] @@ -3194,10 +3194,10 @@ ; X86-AVX1-NEXT: # xmm4 = xmm0[0],xmm1[0] ; X86-AVX1-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1] ; X86-AVX1-NEXT: # xmm0 = xmm0[1],xmm1[1] -; X86-AVX1-NEXT: vmovaps %xmm2, (%esi) # encoding: [0xc5,0xf8,0x29,0x16] -; X86-AVX1-NEXT: vmovaps %xmm3, (%edx) # encoding: [0xc5,0xf8,0x29,0x1a] -; X86-AVX1-NEXT: vmovaps %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x29,0x21] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm2, (%esi) # encoding: [0xc5,0xf8,0x11,0x16] +; X86-AVX1-NEXT: vmovups %xmm3, (%edx) # encoding: [0xc5,0xf8,0x11,0x1a] +; X86-AVX1-NEXT: vmovups %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x11,0x21] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: popl %esi # encoding: [0x5e] ; X86-AVX1-NEXT: retl # encoding: 
[0xc3] ; @@ -3208,10 +3208,10 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-AVX512-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06] -; X86-AVX512-NEXT: vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a] -; X86-AVX512-NEXT: vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18] +; X86-AVX512-NEXT: vmovups (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x06] +; X86-AVX512-NEXT: vmovups (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0a] +; X86-AVX512-NEXT: vmovups (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x11] +; X86-AVX512-NEXT: vmovups (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x18] ; X86-AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1] ; X86-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-AVX512-NEXT: vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb] @@ -3228,10 +3228,10 @@ ; X86-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0] ; X86-AVX512-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] ; X86-AVX512-NEXT: # xmm0 = xmm0[1],xmm1[1] -; X86-AVX512-NEXT: vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16] -; X86-AVX512-NEXT: vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a] -; X86-AVX512-NEXT: vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x16] +; X86-AVX512-NEXT: vmovups %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x1a] +; X86-AVX512-NEXT: vmovups %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x21] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: popl %esi # encoding: [0x5e] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -3269,10 +3269,10 @@ ; ; X64-AVX1-LABEL: test_MM_TRANSPOSE4_PS: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07] -; X64-AVX1-NEXT: vmovaps (%rsi), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0e] -; X64-AVX1-NEXT: vmovaps (%rdx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x12] -; X64-AVX1-NEXT: vmovaps (%rcx), %xmm3 # encoding: [0xc5,0xf8,0x28,0x19] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07] +; X64-AVX1-NEXT: vmovups (%rsi), %xmm1 # encoding: [0xc5,0xf8,0x10,0x0e] +; X64-AVX1-NEXT: vmovups (%rdx), %xmm2 # encoding: [0xc5,0xf8,0x10,0x12] +; X64-AVX1-NEXT: vmovups (%rcx), %xmm3 # encoding: [0xc5,0xf8,0x10,0x19] ; X64-AVX1-NEXT: vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1] ; X64-AVX1-NEXT: # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-AVX1-NEXT: vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb] @@ -3289,18 +3289,18 @@ ; X64-AVX1-NEXT: # xmm4 = xmm0[0],xmm1[0] ; X64-AVX1-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1] ; X64-AVX1-NEXT: # xmm0 = xmm0[1],xmm1[1] -; 
X64-AVX1-NEXT: vmovaps %xmm2, (%rdi) # encoding: [0xc5,0xf8,0x29,0x17] -; X64-AVX1-NEXT: vmovaps %xmm3, (%rsi) # encoding: [0xc5,0xf8,0x29,0x1e] -; X64-AVX1-NEXT: vmovaps %xmm4, (%rdx) # encoding: [0xc5,0xf8,0x29,0x22] -; X64-AVX1-NEXT: vmovaps %xmm0, (%rcx) # encoding: [0xc5,0xf8,0x29,0x01] +; X64-AVX1-NEXT: vmovups %xmm2, (%rdi) # encoding: [0xc5,0xf8,0x11,0x17] +; X64-AVX1-NEXT: vmovups %xmm3, (%rsi) # encoding: [0xc5,0xf8,0x11,0x1e] +; X64-AVX1-NEXT: vmovups %xmm4, (%rdx) # encoding: [0xc5,0xf8,0x11,0x22] +; X64-AVX1-NEXT: vmovups %xmm0, (%rcx) # encoding: [0xc5,0xf8,0x11,0x01] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_MM_TRANSPOSE4_PS: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] -; X64-AVX512-NEXT: vmovaps (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0e] -; X64-AVX512-NEXT: vmovaps (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x12] -; X64-AVX512-NEXT: vmovaps (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x19] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; X64-AVX512-NEXT: vmovups (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0e] +; X64-AVX512-NEXT: vmovups (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x12] +; X64-AVX512-NEXT: vmovups (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x19] ; X64-AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1] ; X64-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-AVX512-NEXT: vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb] @@ -3317,10 +3317,10 @@ ; X64-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0] ; X64-AVX512-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] ; X64-AVX512-NEXT: # xmm0 = xmm0[1],xmm1[1] -; X64-AVX512-NEXT: vmovaps %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x17] -; X64-AVX512-NEXT: vmovaps %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1e] -; X64-AVX512-NEXT: vmovaps %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x22] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x01] +; X64-AVX512-NEXT: vmovups %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x17] +; X64-AVX512-NEXT: vmovups %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x1e] +; X64-AVX512-NEXT: vmovups %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x22] +; X64-AVX512-NEXT: vmovups %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x01] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %row0 = load <4 x float>, <4 x float>* %a0, align 16 %row1 = load <4 x float>, <4 x float>* %a1, align 16 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1882,13 +1882,13 @@ ; X86-AVX1-LABEL: test_mm_load_pd: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX1-NEXT: vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: 
test_mm_load_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX512-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load_pd: @@ -1898,12 +1898,12 @@ ; ; X64-AVX1-LABEL: test_mm_load_pd: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_load_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double* %a0 to <2 x double>* %res = load <2 x double>, <2 x double>* %arg0, align 16 @@ -1965,13 +1965,13 @@ ; X86-AVX1-LABEL: test_mm_load_si128: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX1-NEXT: vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_load_si128: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX512-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load_si128: @@ -1981,12 +1981,12 @@ ; ; X64-AVX1-LABEL: test_mm_load_si128: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_load_si128: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res = load <2 x i64>, <2 x i64>* %a0, align 16 ret <2 x i64> %res @@ -5311,13 +5311,13 @@ ; X86-AVX1-LABEL: test_mm_store_pd: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_store_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_pd: @@ -5327,12 +5327,12 @@ ; ; X64-AVX1-LABEL: test_mm_store_pd: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; 
X64-AVX512-LABEL: test_mm_store_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double* %a0 to <2 x double>* store <2 x double> %a1, <2 x double>* %arg0, align 16 @@ -5353,7 +5353,7 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX1-NEXT: vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_store_pd1: @@ -5361,7 +5361,7 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] ; X86-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_pd1: @@ -5375,14 +5375,14 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0] -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_store_pd1: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] ; X64-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double * %a0 to <2 x double>* %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer @@ -5438,13 +5438,13 @@ ; X86-AVX1-LABEL: test_mm_store_si128: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_store_si128: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_si128: @@ -5454,12 +5454,12 @@ ; ; X64-AVX1-LABEL: test_mm_store_si128: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_store_si128: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX 
TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] store <2 x i64> %a1, <2 x i64>* %a0, align 16 ret void @@ -5479,7 +5479,7 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX1-NEXT: vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0] -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_store1_pd: @@ -5487,7 +5487,7 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] ; X86-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store1_pd: @@ -5501,14 +5501,14 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0] -; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_store1_pd: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] ; X64-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double * %a0 to <2 x double>* %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -773,15 +773,15 @@ ; ; X86-AVX1-LABEL: test_x86_sse2_packssdw_128_fold: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768] -; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768] +; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI30_0, kind: FK_Data_4 ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: test_x86_sse2_packssdw_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI30_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovups LCPI30_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI30_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -794,15 +794,15 @@ ; ; X64-AVX1-LABEL: test_x86_sse2_packssdw_128_fold: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768] -; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX1-NEXT: vmovups {{.*#+}} xmm0 = 
[0,0,0,0,32767,32767,65535,32768] +; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI30_0-4, kind: reloc_riprel_4byte ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse2_packssdw_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovups {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI30_0-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> ) @@ -841,15 +841,15 @@ ; ; X86-AVX1-LABEL: test_x86_sse2_packsswb_128_fold: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI32_0, kind: FK_Data_4 ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: test_x86_sse2_packsswb_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI32_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovups LCPI32_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI32_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -862,15 +862,15 @@ ; ; X64-AVX1-LABEL: test_x86_sse2_packsswb_128_fold: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI32_0-4, kind: reloc_riprel_4byte ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse2_packsswb_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovups {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI32_0-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> , <8 x i16> zeroinitializer) @@ -909,15 +909,15 @@ ; ; X86-AVX1-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX1-NEXT: ## encoding: 
[0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI34_0, kind: FK_Data_4 ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI34_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovups LCPI34_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI34_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -930,15 +930,15 @@ ; ; X64-AVX1-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI34_0-4, kind: reloc_riprel_4byte ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovups {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI34_0-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> zeroinitializer) diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -22,9 +22,9 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX-NEXT: vmovups (%ecx), %xmm0 ; X86-AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test1: @@ -36,7 +36,7 @@ ; X64-AVX-LABEL: test1: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp3 = load <2 x double>, <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 @@ -59,9 +59,9 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX-NEXT: vmovups (%ecx), %xmm0 ; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test2: @@ -73,9 +73,9 @@ ; ; X64-AVX-LABEL: test2: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rsi), %xmm1 +; X64-AVX-NEXT: vmovups (%rsi), %xmm1 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp3 = load <2 x double>, <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 @@ -101,9 
+101,9 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: vmovaps (%edx), %xmm0 +; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test3: @@ -115,9 +115,9 @@ ; ; X64-AVX-LABEL: test3: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 +; X64-AVX-NEXT: vmovups (%rsi), %xmm0 ; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2] %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2] @@ -145,7 +145,7 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test4: @@ -157,7 +157,7 @@ ; X64-AVX-LABEL: test4: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp5, <4 x float>* %res @@ -230,8 +230,8 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test6: @@ -242,8 +242,8 @@ ; ; X64-AVX-LABEL: test6: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: vmovups (%rsi), %xmm0 +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1] %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1] @@ -261,7 +261,7 @@ ; AVX-LABEL: test7: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, 0 +; AVX-NEXT: vmovups %xmm0, 0 ; AVX-NEXT: ret{{[l|q]}} bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1] shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1] @@ -404,23 +404,23 @@ ; ; AVX1-LABEL: test12: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 0, %xmm0 +; AVX1-NEXT: vmovups 0, %xmm0 ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovaps %xmm0, 0 +; AVX1-NEXT: vmovups %xmm0, 0 ; AVX1-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test12: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps 0, %xmm0 +; AVX512-NEXT: vmovups 0, %xmm0 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, 0 +; AVX512-NEXT: vmovups %xmm0, 0 ; AVX512-NEXT: ret{{[l|q]}} %tmp1 = load 
<4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2] %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] @@ -447,10 +447,10 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: vmovaps (%edx), %xmm0 +; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] ; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test13: @@ -463,10 +463,10 @@ ; ; X64-AVX-LABEL: test13: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdx), %xmm0 +; X64-AVX-NEXT: vmovups (%rdx), %xmm0 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] ; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp3 = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=1] %tmp5 = load <4 x float>, <4 x float>* %C ; <<4 x float>> [#uses=1] @@ -492,8 +492,8 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 -; X86-AVX-NEXT: vmovaps (%eax), %xmm1 +; X86-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-AVX-NEXT: vmovups (%eax), %xmm1 ; X86-AVX-NEXT: vaddps %xmm0, %xmm1, %xmm2 ; X86-AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -511,8 +511,8 @@ ; ; X64-AVX-LABEL: test14: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 -; X64-AVX-NEXT: vmovaps (%rdi), %xmm1 +; X64-AVX-NEXT: vmovups (%rsi), %xmm0 +; X64-AVX-NEXT: vmovups (%rdi), %xmm1 ; X64-AVX-NEXT: vaddps %xmm0, %xmm1, %xmm2 ; X64-AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] @@ -538,7 +538,7 @@ ; X86-AVX: # %bb.0: # %entry ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX-NEXT: vmovups (%ecx), %xmm0 ; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; X86-AVX-NEXT: retl ; @@ -550,7 +550,7 @@ ; ; X64-AVX-LABEL: test15: ; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 ; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; X64-AVX-NEXT: retq entry: @@ -573,7 +573,7 @@ ; X86-AVX-LABEL: test16: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps 96(%eax), %xmm0 +; X86-AVX-NEXT: vmovups 96(%eax), %xmm0 ; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X86-AVX-NEXT: retl ; @@ -585,7 +585,7 @@ ; ; X64-AVX-LABEL: test16: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps 96(%rdi), %xmm0 +; X64-AVX-NEXT: vmovups 96(%rdi), %xmm0 ; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X64-AVX-NEXT: retq %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3 @@ -604,14 +604,14 @@ ; ; X86-AVX1-LABEL: test17: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: vmovups {{.*#+}} xmm0 = +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ; X86-AVX1-NEXT: retl ; ; X86-AVX512-LABEL: test17: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X86-AVX512-NEXT: 
vmovaps %xmm0, (%eax) +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test17: @@ -622,14 +622,14 @@ ; ; X64-AVX1-LABEL: test17: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = -; X64-AVX1-NEXT: vmovaps %xmm0, (%rax) +; X64-AVX1-NEXT: vmovups {{.*#+}} xmm0 = +; X64-AVX1-NEXT: vmovups %xmm0, (%rax) ; X64-AVX1-NEXT: retq ; ; X64-AVX512-LABEL: test17: ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rax) +; X64-AVX512-NEXT: vmovups %xmm0, (%rax) ; X64-AVX512-NEXT: retq entry: %0 = insertelement <4 x i32> undef, i32 undef, i32 1 diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -148,15 +148,15 @@ ; ; X86-AVX1-LABEL: test_x86_sse41_packusdw_fold: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0] -; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0] +; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4 ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: test_x86_sse41_packusdw_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovups LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -169,15 +169,15 @@ ; ; X64-AVX1-LABEL: test_x86_sse41_packusdw_fold: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0] -; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0] +; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: LCPI7_0-4, kind: reloc_riprel_4byte ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse41_packusdw_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovups {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI7_0-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> ) diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -568,7 +568,7 @@ ; X86-AVX1-LABEL: insertps_from_shufflevector_1: ; X86-AVX1: ## %bb.0: ## %entry ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX1-NEXT: vmovups (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -576,7 +576,7 @@ ; X86-AVX512-LABEL: insertps_from_shufflevector_1: ; X86-AVX512: ## %bb.0: ## %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX512-NEXT: vmovups (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -590,14 +590,14 @@ ; ; X64-AVX1-LABEL: insertps_from_shufflevector_1: ; X64-AVX1: ## %bb.0: ## %entry -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_shufflevector_1: ; X64-AVX512: ## %bb.0: ## %entry -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] @@ -1382,7 +1382,7 @@ ; X86-AVX1-LABEL: insertps_from_vector_load: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX1-NEXT: vmovups (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -1390,7 +1390,7 @@ ; X86-AVX512-LABEL: insertps_from_vector_load: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX512-NEXT: vmovups (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -1404,14 +1404,14 @@ ; ; X64-AVX1-LABEL: insertps_from_vector_load: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_vector_load: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression 
encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] @@ -1434,7 +1434,7 @@ ; X86-AVX1-LABEL: insertps_from_vector_load_offset: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX1-NEXT: vmovups (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] ; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -1442,7 +1442,7 @@ ; X86-AVX512-LABEL: insertps_from_vector_load_offset: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX512-NEXT: vmovups (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -1456,14 +1456,14 @@ ; ; X64-AVX1-LABEL: insertps_from_vector_load_offset: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] ; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_vector_load_offset: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] @@ -1489,7 +1489,7 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-AVX1-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] -; X86-AVX1-NEXT: vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08] +; X86-AVX1-NEXT: vmovups (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0c,0x08] ; X86-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] ; X86-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -1499,7 +1499,7 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-AVX512-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] -; X86-AVX512-NEXT: vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08] +; X86-AVX512-NEXT: vmovups (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0c,0x08] ; X86-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] ; X86-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -1515,7 +1515,7 @@ ; X64-AVX1-LABEL: 
insertps_from_vector_load_offset_2: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] -; X64-AVX1-NEXT: vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37] +; X64-AVX1-NEXT: vmovups (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0c,0x37] ; X64-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] ; X64-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] @@ -1523,7 +1523,7 @@ ; X64-AVX512-LABEL: insertps_from_vector_load_offset_2: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] -; X64-AVX512-NEXT: vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37] +; X64-AVX512-NEXT: vmovups (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0c,0x37] ; X64-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] ; X64-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] @@ -1829,7 +1829,7 @@ ; X86-AVX1-LABEL: pr20087: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX1-NEXT: vmovups (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] ; X86-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -1837,7 +1837,7 @@ ; X86-AVX512-LABEL: pr20087: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] +; X86-AVX512-NEXT: vmovups (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x08] ; X86-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -1851,14 +1851,14 @@ ; ; X64-AVX1-LABEL: pr20087: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] ; X64-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: pr20087: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] +; X64-AVX512-NEXT: vmovups (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x0f] ; X64-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -699,7 +699,7 @@ ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} xmm2 = 
[2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq @@ -808,7 +808,7 @@ ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm1, {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq @@ -967,7 +967,7 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %ymm1, {{.*}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -1205,8 +1205,8 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] -; AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX1-NEXT: vmovups {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm6, %ymm7 ; AVX1-NEXT: vblendvps %ymm0, %ymm7, %ymm2, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll b/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll --- a/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll @@ -32,7 +32,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -85,7 +85,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmovups (%rdi), %ymm1 ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -137,7 +137,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: kmovd %edx, %k1 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 @@ -194,7 +194,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovups (%rdi), %ymm2 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload ; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq @@ -248,7 +248,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), 
%xmm1 +; CHECK-NEXT: vmovups (%rdi), %xmm1 ; CHECK-NEXT: vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 32-byte Folded Reload ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper @@ -302,7 +302,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovups (%rdi), %ymm2 ; CHECK-NEXT: kmovd %edx, %k1 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload ; CHECK-NEXT: vmovaps %ymm2, %ymm0 @@ -341,7 +341,7 @@ define <8 x i16> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -356,12 +356,12 @@ define <8 x i16> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, <8 x i16>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovups (%rdi), %xmm2 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload ; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -377,7 +377,7 @@ define <8 x i16> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -394,7 +394,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -409,11 +409,11 @@ define <8 x i16> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, <8 x i16>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmovups (%rdi), %xmm1 ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -429,7 +429,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -446,7 +446,7 @@ define <4 x float> @stack_fold_vdpbf16ps_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vdpbf16ps_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -461,11 +461,11 @@ define <4 x float> @stack_fold_vdpbf16ps_mask_xmm(<4 x float>* %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x float>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_mask_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovups (%rdi), %xmm2 ; CHECK-NEXT: kmovd %edx, %k1 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -483,7 +483,7 @@ define <4 x float> @stack_fold_vdpbf16ps_maskz_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2, i8* %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll --- a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll +++ b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload ; CHECK-NEXT: kmovw %k0, (%rsi) ; CHECK-NEXT: kmovw %k1, (%rdx) @@ -35,7 +35,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: kmovw %k0, %ecx @@ -61,7 +61,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: kmovw %k0, %ecx @@ -87,7 +87,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: kmovw %k0, %ecx @@ -109,11 +109,11 @@ define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: kmovw %k0, %ecx @@ -134,11 +134,11 @@ define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* nocapture %m0, <2 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectq_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovups (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: kmovw %k0, %ecx diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -12,7 +12,7 @@ define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_addpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -40,7 +40,7 @@ define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_addps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -82,7 +82,7 @@ define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_addsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -114,7 +114,7 @@ define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_addss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -132,7 +132,7 @@ define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_addsubpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -162,7 +162,7 @@ define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_addsubps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -192,7 +192,7 @@ define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_andnpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -236,7 +236,7 @@ define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_andnps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -280,7 +280,7 @@ define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_andpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -322,7 +322,7 @@ define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_andps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -364,7 +364,7 @@ define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_blendpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -428,7 +428,7 @@ define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) { ; CHECK-LABEL: stack_fold_blendvpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -458,7 +458,7 @@ define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) { ; CHECK-LABEL: stack_fold_blendvps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -488,7 +488,7 @@ define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_cmppd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -518,7 +518,7 @@ define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_cmpps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -566,7 +566,7 @@ define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_cmpsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -598,7 +598,7 @@ define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_cmpss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -615,7 +615,7 @@ define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_comisd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -636,7 +636,7 @@ define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_comiss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -655,7 +655,7 @@ define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -669,7 +669,7 @@ define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -684,7 +684,7 @@ define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -698,7 +698,7 @@ define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -712,7 +712,7 @@ define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -740,7 +740,7 @@ define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2dq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -771,7 +771,7 @@ define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -800,7 +800,7 @@ define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_cvtph2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -815,7 +815,7 @@ define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_cvtph2ps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -830,7 +830,7 @@ define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtps2dq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -860,7 +860,7 @@ define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) { ; 
CHECK-LABEL: stack_fold_cvtps2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -875,7 +875,7 @@ define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtps2pd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -890,7 +890,7 @@ define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtps2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -904,7 +904,7 @@ define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtps2pd_ymm_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -922,7 +922,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) @@ -936,7 +936,7 @@ define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtsd2si_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -953,7 +953,7 @@ define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtsd2si64_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1332,7 +1332,7 @@ define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtss2si_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1349,7 +1349,7 @@ define i64 @stack_fold_cvtss2si64_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtss2si64_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1364,7 +1364,7 @@ define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvttpd2dq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1394,7 +1394,7 @@ define <4 x i32> @stack_fold_cvttps2dq(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvttps2dq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1436,7 +1436,7 @@ define i32 @stack_fold_cvttsd2si_int(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvttsd2si_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1465,7 +1465,7 @@ define i64 @stack_fold_cvttsd2si64_int(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvttsd2si64_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1494,7 +1494,7 @@ define i32 @stack_fold_cvttss2si_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvttss2si_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1523,7 +1523,7 @@ define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvttss2si64_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1538,7 +1538,7 @@ define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_divpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1566,7 +1566,7 @@ define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_divps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1608,7 +1608,7 @@ define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_divsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1639,7 +1639,7 @@ define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_divss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1656,7 +1656,7 @@ define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_dppd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1671,7 +1671,7 @@ define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_dpps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; 
CHECK-NEXT: #NO_APP @@ -1705,7 +1705,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> @@ -1764,7 +1764,7 @@ define <2 x double> @stack_fold_haddpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_haddpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1794,7 +1794,7 @@ define <4 x float> @stack_fold_haddps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_haddps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1824,7 +1824,7 @@ define <2 x double> @stack_fold_hsubpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_hsubpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1854,7 +1854,7 @@ define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_hsubps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1884,7 +1884,7 @@ define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_insertf128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1899,7 +1899,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_insertps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1915,7 +1915,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_maxpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1930,7 +1930,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 { ; CHECK-LABEL: stack_fold_maxpd_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1973,7 +1973,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_maxps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; 
CHECK-NEXT: #NO_APP @@ -1988,7 +1988,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_maxps_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2061,7 +2061,7 @@ define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_maxsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2106,7 +2106,7 @@ define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_maxss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2121,7 +2121,7 @@ define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_minpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2136,7 +2136,7 @@ define <2 x double> @stack_fold_minpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 { ; CHECK-LABEL: stack_fold_minpd_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2179,7 +2179,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_minps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2194,7 +2194,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_minps_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2267,7 +2267,7 @@ define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_minsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2312,7 +2312,7 @@ define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_minss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2327,7 +2327,7 @@ define <2 x double> @stack_fold_movddup(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_movddup: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2363,7 +2363,7 @@ define <4 x float> @stack_fold_movshdup(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_movshdup: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2393,7 +2393,7 @@ define <4 x float> @stack_fold_movsldup(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_movsldup: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2423,7 +2423,7 @@ define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_mulpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2451,7 +2451,7 @@ define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_mulps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2493,7 +2493,7 @@ define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_mulsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2524,7 +2524,7 @@ define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_mulss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2541,7 +2541,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_orpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2583,7 +2583,7 @@ define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_orps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2640,7 +2640,7 @@ define <2 x double> @stack_fold_permilpd(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_permilpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2670,7 +2670,7 @@ define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_permilpdvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2700,7 +2700,7 @@ 
define <4 x float> @stack_fold_permilps(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_permilps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2730,7 +2730,7 @@ define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_permilpsvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2762,7 +2762,7 @@ define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_rcpps_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2797,7 +2797,7 @@ define <2 x double> @stack_fold_roundpd(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_roundpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2827,7 +2827,7 @@ define <4 x float> @stack_fold_roundps(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_roundps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2887,8 +2887,8 @@ define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize { ; CHECK-LABEL: stack_fold_roundsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2920,12 +2920,12 @@ define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize { ; CHECK-LABEL: stack_fold_roundss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vroundss $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() @@ -2939,7 +2939,7 @@ define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_rsqrtps_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2974,7 +2974,7 @@ define <2 x double> @stack_fold_shufpd(<2 x 
double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_shufpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3012,7 +3012,7 @@ define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_shufps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3042,7 +3042,7 @@ define <2 x double> @stack_fold_sqrtpd(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_sqrtpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3072,7 +3072,7 @@ define <4 x float> @stack_fold_sqrtps(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_sqrtps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3138,7 +3138,7 @@ define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_subpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3166,7 +3166,7 @@ define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_subps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3208,7 +3208,7 @@ define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_subsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3239,7 +3239,7 @@ define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_subss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3256,7 +3256,7 @@ define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_testpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3291,7 +3291,7 @@ define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_testps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3344,7 +3344,7 @@ define i32 @stack_fold_ucomisd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_ucomisd_int: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3381,7 +3381,7 @@ define i32 @stack_fold_ucomiss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_ucomiss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3400,7 +3400,7 @@ define <2 x double> @stack_fold_unpckhpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_unpckhpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3438,7 +3438,7 @@ define <4 x float> @stack_fold_unpckhps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_unpckhps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3476,7 +3476,7 @@ define <2 x double> @stack_fold_unpcklpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_unpcklpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3514,7 +3514,7 @@ define <4 x float> @stack_fold_unpcklps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_unpcklps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3552,7 +3552,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_xorpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3594,7 +3594,7 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_xorps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -102,7 +102,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -122,7 +122,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -168,7 +168,7 @@ define <2 x double> 
@stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_addsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -199,7 +199,7 @@ define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_addss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -472,7 +472,7 @@ define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_divsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -503,7 +503,7 @@ define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_divss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -592,7 +592,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_insertps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -728,7 +728,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -748,7 +748,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vmaxps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -900,7 +900,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -920,7 +920,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vminps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1042,7 +1042,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1062,7 +1062,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm2 +; CHECK-NEXT: vmovups (%rsi), %zmm2 ; CHECK-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps 
%zmm2, %zmm0 ; CHECK-NEXT: retq @@ -1108,7 +1108,7 @@ define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_mulsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1139,7 +1139,7 @@ define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_mulss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1279,7 +1279,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps (%rsi), %zmm1 +; CHECK-NEXT: vmovups (%rsi), %zmm1 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] @@ -1365,7 +1365,7 @@ define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_subsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1396,7 +1396,7 @@ define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_subss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1504,7 +1504,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <16 x float> %a0, <16 x float> undef, <4 x i32> @@ -1519,7 +1519,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <8 x double> %a0, <8 x double> undef, <2 x i32> @@ -1660,7 +1660,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpermi2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 @@ -1681,7 +1681,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq @@ -1700,7 +1700,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpermt2ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq @@ -1969,7 +1969,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %esi, %k1 -; 
CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmovups (%rdi), %zmm1 ; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-NEXT: vmovaps %zmm1, %zmm0 @@ -2024,7 +2024,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm2 +; CHECK-NEXT: vmovups (%rdi), %zmm2 ; CHECK-NEXT: vpermilps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll @@ -12,7 +12,7 @@ define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_addpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -40,7 +40,7 @@ define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_addps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -68,7 +68,7 @@ define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_andnpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -112,7 +112,7 @@ define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_andnps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -156,7 +156,7 @@ define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_andpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -198,7 +198,7 @@ define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_andps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -240,7 +240,7 @@ define i8 @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_cmppd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -279,7 +279,7 @@ define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_cmpps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; 
CHECK-NEXT: #NO_APP @@ -317,7 +317,7 @@ define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_divpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -345,7 +345,7 @@ define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_divps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -373,7 +373,7 @@ define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -388,7 +388,7 @@ define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -402,7 +402,7 @@ define <2 x double> @stack_fold_cvtudq2pd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtudq2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -417,7 +417,7 @@ define <4 x double> @stack_fold_cvtudq2pd_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtudq2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -431,7 +431,7 @@ define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -460,7 +460,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_maxpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -475,7 +475,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 { ; CHECK-LABEL: stack_fold_maxpd_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -518,7 +518,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_maxps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -533,7 +533,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> 
%a0, <4 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_maxps_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -576,7 +576,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_minps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -591,7 +591,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_minps_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -634,7 +634,7 @@ define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_mulpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -662,7 +662,7 @@ define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_mulps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -690,7 +690,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_orpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -732,7 +732,7 @@ define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_orps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -815,7 +815,7 @@ define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_shufps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -830,12 +830,12 @@ define <4 x float> @stack_fold_shufps_mask(<4 x float>* %passthru, <4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_shufps_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovups (%rdi), %xmm2 ; CHECK-NEXT: vshufps $200, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload ; CHECK-NEXT: # xmm2 {%k1} = xmm0[0,2],mem[0,3] ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -852,7 +852,7 @@ define <4 x float> @stack_fold_shufps_maskz(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: 
stack_fold_shufps_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -886,7 +886,7 @@ define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_subpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -914,7 +914,7 @@ define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_subps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -942,7 +942,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_xorpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -984,7 +984,7 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_xorps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1030,7 +1030,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> @@ -1045,7 +1045,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <2 x i32> @@ -1056,7 +1056,7 @@ define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_insertf32x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1071,7 +1071,7 @@ define <4 x double> @stack_fold_insertf64x2(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_insertf64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1086,7 +1086,7 @@ define <4 x float> @stack_fold_vpermt2ps(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { ; CHECK-LABEL: stack_fold_vpermt2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1100,7 +1100,7 @@ define <4 x float> @stack_fold_vpermi2ps(<4 x i32> %x0, <4 x float> %x1, <4 
x float> %x2) { ; CHECK-LABEL: stack_fold_vpermi2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1114,7 +1114,7 @@ define <2 x double> @stack_fold_vpermt2pd(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermt2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1128,7 +1128,7 @@ define <2 x double> @stack_fold_vpermi2pd(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermi2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1253,7 +1253,7 @@ define <2 x double> @stack_fold_permilpd(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_permilpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1283,7 +1283,7 @@ define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_permilpdvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1313,7 +1313,7 @@ define <4 x float> @stack_fold_permilps(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_permilps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1343,7 +1343,7 @@ define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_permilpsvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx1.ll @@ -12,7 +12,7 @@ define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_aesdec: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -27,7 +27,7 @@ define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_aesdeclast: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -42,7 +42,7 @@ define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_aesenc: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -57,7 +57,7 @@ define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_aesenclast: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -72,7 +72,7 @@ define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_aesimc: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -87,7 +87,7 @@ define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_aeskeygenassist: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -198,7 +198,7 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_movq_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -264,7 +264,7 @@ define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_mpsadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -279,7 +279,7 @@ define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -295,7 +295,7 @@ define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -311,7 +311,7 @@ define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -327,7 +327,7 @@ define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -342,7 +342,7 @@ define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -357,7 +357,7 @@ define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: 
stack_fold_packusdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -372,7 +372,7 @@ define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -387,7 +387,7 @@ define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -401,7 +401,7 @@ define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -415,7 +415,7 @@ define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -429,7 +429,7 @@ define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -444,7 +444,7 @@ define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -459,7 +459,7 @@ define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -474,7 +474,7 @@ define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -489,7 +489,7 @@ define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -503,7 +503,7 @@ define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_palignr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -518,7 +518,7 @@ define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pand: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -536,7 +536,7 @@ define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pandn: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -555,7 +555,7 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -574,7 +574,7 @@ define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -593,7 +593,7 @@ define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) { ; CHECK-LABEL: stack_fold_pblendvb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -608,7 +608,7 @@ define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pblendw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -623,7 +623,7 @@ define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pclmulqdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -638,7 +638,7 @@ define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -653,7 +653,7 @@ define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -668,7 +668,7 @@ define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -683,7 +683,7 @@ 
define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -698,7 +698,7 @@ define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpestri: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -716,7 +716,7 @@ define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpestrm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -733,7 +733,7 @@ define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -748,7 +748,7 @@ define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -763,7 +763,7 @@ define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -778,7 +778,7 @@ define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -793,7 +793,7 @@ define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpistri: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -809,7 +809,7 @@ define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpistrm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -919,7 +919,7 @@ define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_phaddd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -934,7 +934,7 @@ define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_phaddsw: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -949,7 +949,7 @@ define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_phaddw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -964,7 +964,7 @@ define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_phminposuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -979,7 +979,7 @@ define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_phsubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -994,7 +994,7 @@ define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_phsubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1009,7 +1009,7 @@ define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_phsubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1200,7 +1200,7 @@ define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaddubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1215,7 +1215,7 @@ define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1230,7 +1230,7 @@ define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1245,7 +1245,7 @@ define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1260,7 +1260,7 @@ define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1275,7 +1275,7 @@ define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1290,7 +1290,7 @@ define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1305,7 +1305,7 @@ define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1320,7 +1320,7 @@ define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1335,7 +1335,7 @@ define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1350,7 +1350,7 @@ define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1365,7 +1365,7 @@ define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1380,7 +1380,7 @@ define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1395,7 +1395,7 @@ define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1410,7 +1410,7 @@ define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1430,7 +1430,7 @@ define <8 x i16> 
@stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmulhrsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1445,7 +1445,7 @@ define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmulhuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1460,7 +1460,7 @@ define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmulhw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1475,7 +1475,7 @@ define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmulld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1489,7 +1489,7 @@ define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmullw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1503,7 +1503,7 @@ define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1521,7 +1521,7 @@ define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_por: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1539,7 +1539,7 @@ define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1554,7 +1554,7 @@ define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1569,7 +1569,7 @@ define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pshufd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1588,7 +1588,7 @@ define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshufhw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1603,7 +1603,7 @@ define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshuflw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1618,7 +1618,7 @@ define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psignb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1633,7 +1633,7 @@ define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psignd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1648,7 +1648,7 @@ define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psignw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1663,7 +1663,7 @@ define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1678,7 +1678,7 @@ define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1693,7 +1693,7 @@ define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1708,7 +1708,7 @@ define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1723,7 +1723,7 @@ define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1738,7 +1738,7 @@ define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop 
; CHECK-NEXT: #NO_APP @@ -1753,7 +1753,7 @@ define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1768,7 +1768,7 @@ define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1783,7 +1783,7 @@ define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1797,7 +1797,7 @@ define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1811,7 +1811,7 @@ define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1825,7 +1825,7 @@ define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1840,7 +1840,7 @@ define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1855,7 +1855,7 @@ define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1870,7 +1870,7 @@ define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1885,7 +1885,7 @@ define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1899,7 +1899,7 @@ define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: 
stack_fold_ptest: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1934,7 +1934,7 @@ define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpckhbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1949,7 +1949,7 @@ define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_punpckhdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1968,7 +1968,7 @@ define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_punpckhqdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1987,7 +1987,7 @@ define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_punpckhwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2002,7 +2002,7 @@ define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpcklbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2017,7 +2017,7 @@ define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_punpckldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2036,7 +2036,7 @@ define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_punpcklqdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2055,7 +2055,7 @@ define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_punpcklwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2070,7 +2070,7 @@ define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pxor: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll +++ 
b/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll @@ -12,7 +12,7 @@ define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_broadcastsd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -30,7 +30,7 @@ define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_broadcastss: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -48,7 +48,7 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_broadcastss_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -71,7 +71,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -84,7 +84,7 @@ define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_inserti128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -429,7 +429,7 @@ define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pblendd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -496,7 +496,7 @@ define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -510,7 +510,7 @@ define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -524,7 +524,7 @@ define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -541,7 +541,7 @@ define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -558,7 +558,7 @@ define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) { ; CHECK-LABEL: 
stack_fold_pbroadcastq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -575,7 +575,7 @@ define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -592,7 +592,7 @@ define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -606,7 +606,7 @@ define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1128,7 +1128,7 @@ define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1143,7 +1143,7 @@ define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1158,7 +1158,7 @@ define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1172,7 +1172,7 @@ define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovsxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1186,7 +1186,7 @@ define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1200,7 +1200,7 @@ define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1215,7 +1215,7 @@ define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1231,7 +1231,7 @@ define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1247,7 +1247,7 @@ define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1262,7 +1262,7 @@ define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovzxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1277,7 +1277,7 @@ define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1292,7 +1292,7 @@ define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1559,7 +1559,7 @@ define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1574,7 +1574,7 @@ define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1589,7 +1589,7 @@ define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psllvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1619,7 +1619,7 @@ define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1649,7 +1649,7 @@ define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1664,7 +1664,7 @@ define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1679,7 +1679,7 @@ define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psravd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1709,7 +1709,7 @@ define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1724,7 +1724,7 @@ define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1739,7 +1739,7 @@ define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1754,7 +1754,7 @@ define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrlvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1784,7 +1784,7 @@ define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1814,7 +1814,7 @@ define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -371,7 +371,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -389,7 +389,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -1749,7 +1749,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; 
CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1770,7 +1770,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -4712,7 +4712,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) @@ -4739,7 +4739,7 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_movq_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4777,7 +4777,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) @@ -4808,7 +4808,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) @@ -4854,7 +4854,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) @@ -4881,7 +4881,7 @@ define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4895,7 +4895,7 @@ define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4952,7 
+4952,7 @@ define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4966,7 +4966,7 @@ define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4983,7 +4983,7 @@ define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5004,7 +5004,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) @@ -5050,7 +5050,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) @@ -5077,7 +5077,7 @@ define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5092,7 +5092,7 @@ define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5153,7 +5153,7 @@ define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5168,7 +5168,7 @@ define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5186,7 +5186,7 @@ define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; 
CHECK-NEXT: #NO_APP @@ -5827,7 +5827,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -5848,7 +5848,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6256,7 +6256,7 @@ define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6271,7 +6271,7 @@ define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pslld_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6291,7 +6291,7 @@ define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pslld_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6375,7 +6375,7 @@ define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6487,7 +6487,7 @@ define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6517,7 +6517,7 @@ define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6547,7 +6547,7 @@ define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psraq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6622,7 +6622,7 @@ define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6652,7 +6652,7 @@ define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6697,7 +6697,7 @@ define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6772,7 +6772,7 @@ define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7159,7 +7159,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -7180,7 +7180,7 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll @@ -89,7 +89,7 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -127,7 +127,7 @@ define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -165,7 +165,7 @@ define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpconflictd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -193,7 +193,7 @@ define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpconflictq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -226,7 +226,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -244,7 +244,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -257,7 +257,7 @@ define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_inserti32x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -276,7 +276,7 @@ define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_inserti64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -295,7 +295,7 @@ define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -327,7 +327,7 @@ define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -359,7 +359,7 @@ define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -391,7 +391,7 @@ define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -423,7 +423,7 @@ define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -453,7 +453,7 @@ define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -483,7 +483,7 @@ define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_packusdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -513,7 +513,7 @@ define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -543,7 +543,7 @@ define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -557,7 +557,7 @@ define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_paddb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -578,7 +578,7 @@ define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_paddb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -647,7 +647,7 @@ define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -675,7 +675,7 @@ define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -703,7 +703,7 @@ define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -733,7 +733,7 @@ define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -763,7 +763,7 @@ define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -793,7 +793,7 @@ define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -823,7 +823,7 @@ define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -911,7 +911,7 @@ define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -928,7 +928,7 @@ define i8 @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -946,7 +946,7 @@ define i8 @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -964,7 +964,7 @@ define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1021,7 +1021,7 @@ define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermi2b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1049,7 +1049,7 @@ define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermi2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1077,7 +1077,7 @@ define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermi2q: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: 
#NO_APP @@ -1105,7 +1105,7 @@ define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermi2w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1173,7 +1173,7 @@ define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermt2b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1203,7 +1203,7 @@ define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermt2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1233,7 +1233,7 @@ define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermt2q: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1263,7 +1263,7 @@ define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermt2w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1314,7 +1314,7 @@ define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vplzcntd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1342,7 +1342,7 @@ define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_vplzcntq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1370,7 +1370,7 @@ define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaddubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1385,7 +1385,7 @@ define <8 x i16> @stack_fold_pmaddubsw_mask(<8 x i16>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1406,7 +1406,7 @@ define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1476,7 +1476,7 @@ define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1491,7 +1491,7 @@ define <4 x i32> @stack_fold_pmaddwd_mask(<4 x i32>* %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1513,7 +1513,7 @@ define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1584,7 +1584,7 @@ define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1614,7 +1614,7 @@ define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1644,7 +1644,7 @@ define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1674,7 +1674,7 @@ define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1704,7 +1704,7 @@ define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1734,7 +1734,7 @@ define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1764,7 +1764,7 @@ define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxuq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1779,7 
+1779,7 @@ define <2 x i64> @stack_fold_pmaxuq_mask(<2 x i64>* %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1801,7 +1801,7 @@ define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1876,7 +1876,7 @@ define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1907,7 +1907,7 @@ define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1937,7 +1937,7 @@ define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1967,7 +1967,7 @@ define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1997,7 +1997,7 @@ define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2027,7 +2027,7 @@ define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2057,7 +2057,7 @@ define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2087,7 +2087,7 @@ define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminuq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2117,7 +2117,7 @@ define <8 x i16> @stack_fold_pminuw(<8 x i16> 
%a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2151,7 +2151,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) @@ -2167,7 +2167,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = trunc <4 x i64> %a0 to <4 x i32> @@ -2183,7 +2183,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = trunc <16 x i16> %a0 to <16 x i8> @@ -2199,7 +2199,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) @@ -2215,7 +2215,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) @@ -2231,7 +2231,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) @@ -2243,7 +2243,7 @@ define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2258,7 +2258,7 @@ define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2273,7 +2273,7 @@ define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2288,7 +2288,7 @@ define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2303,7 +2303,7 @@ define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2318,7 +2318,7 @@ define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2332,7 +2332,7 @@ define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovsxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2347,7 +2347,7 @@ define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovsxdq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2361,7 +2361,7 @@ define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2376,7 +2376,7 @@ define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2390,7 +2390,7 @@ define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2405,7 +2405,7 @@ define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2424,7 +2424,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) @@ -2440,7 +2440,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) 
@@ -2456,7 +2456,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) @@ -2468,7 +2468,7 @@ define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2484,7 +2484,7 @@ define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2500,7 +2500,7 @@ define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2516,7 +2516,7 @@ define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2532,7 +2532,7 @@ define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2548,7 +2548,7 @@ define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2563,7 +2563,7 @@ define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovzxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2579,7 +2579,7 @@ define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovzxdq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2594,7 +2594,7 @@ define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2610,7 +2610,7 @@ define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2625,7 +2625,7 @@ define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2641,7 +2641,7 @@ define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2657,7 +2657,7 @@ define <4 x i64> @stack_fold_pmovzxwq_maskz_ymm(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2677,7 +2677,7 @@ define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2697,7 +2697,7 @@ define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2737,7 +2737,7 @@ define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2824,7 +2824,7 @@ define <4 x i32> @stack_fold_vpopcntd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpopcntd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2854,7 +2854,7 @@ define <2 x i64> @stack_fold_vpopcntq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpopcntq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2884,7 +2884,7 @@ define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2899,7 +2899,7 @@ define <2 x i64> @stack_fold_psadbw_commute(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2942,7 +2942,7 @@ define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2957,7 +2957,7 @@ define <16 x i8> @stack_fold_pshufb_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2977,7 +2977,7 @@ define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3046,7 +3046,7 @@ define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pshufd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3064,8 +3064,8 @@ define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3085,7 +3085,7 @@ define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3160,7 +3160,7 @@ define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshufhw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3175,8 +3175,8 @@ define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3195,7 +3195,7 @@ define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: 
nop ; CHECK-NEXT: #NO_APP @@ -3266,7 +3266,7 @@ define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshuflw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3281,8 +3281,8 @@ define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3301,7 +3301,7 @@ define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3372,7 +3372,7 @@ define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3387,7 +3387,7 @@ define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3402,7 +3402,7 @@ define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) { ; CHECK-LABEL: stack_fold_pslldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3432,7 +3432,7 @@ define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3447,7 +3447,7 @@ define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3462,7 +3462,7 @@ define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psllvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3492,7 +3492,7 @@ define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3522,7 +3522,7 @@ define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllvw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3552,7 +3552,7 @@ define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3567,7 +3567,7 @@ define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3582,7 +3582,7 @@ define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3597,7 +3597,7 @@ define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3612,7 +3612,7 @@ define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psraq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3627,7 +3627,7 @@ define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psraq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3642,7 +3642,7 @@ define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psravd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3672,7 +3672,7 @@ define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psravq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3702,7 +3702,7 @@ define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psravw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3732,7 +3732,7 @@ define <8 x i16> 
@stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3747,7 +3747,7 @@ define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3762,7 +3762,7 @@ define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3777,7 +3777,7 @@ define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -3794,7 +3794,7 @@ define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) { ; CHECK-LABEL: stack_fold_psrldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3824,7 +3824,7 @@ define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3839,7 +3839,7 @@ define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3854,7 +3854,7 @@ define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrlvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3884,7 +3884,7 @@ define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3914,7 +3914,7 @@ define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlvw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3944,7 +3944,7 @@ define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: 
stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3959,7 +3959,7 @@ define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3974,7 +3974,7 @@ define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4002,7 +4002,7 @@ define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4030,7 +4030,7 @@ define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4058,7 +4058,7 @@ define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4088,7 +4088,7 @@ define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4118,7 +4118,7 @@ define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4148,7 +4148,7 @@ define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4178,7 +4178,7 @@ define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4206,7 +4206,7 @@ define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpckhbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4221,7 +4221,7 @@ define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4243,7 +4243,7 @@ define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-xop.ll b/llvm/test/CodeGen/X86/stack-folding-xop.ll --- a/llvm/test/CodeGen/X86/stack-folding-xop.ll +++ b/llvm/test/CodeGen/X86/stack-folding-xop.ll @@ -12,7 +12,7 @@ define <2 x double> @stack_fold_vfrczpd(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_vfrczpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -42,7 +42,7 @@ define <4 x float> @stack_fold_vfrczps(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_vfrczps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -72,7 +72,7 @@ define <2 x double> @stack_fold_vfrczsd(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_vfrczsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -87,7 +87,7 @@ define <4 x float> @stack_fold_vfrczss(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_vfrczss: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -102,7 +102,7 @@ define <2 x i64> @stack_fold_vpcmov_rm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) { ; CHECK-LABEL: stack_fold_vpcmov_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -115,7 +115,7 @@ define <2 x i64> @stack_fold_vpcmov_mr(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) { ; CHECK-LABEL: stack_fold_vpcmov_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -158,7 +158,7 @@ define <16 x i8> @stack_fold_vpcomb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vpcomb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -173,7 +173,7 @@ define <4 x i32> 
@stack_fold_vpcomd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vpcomd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -188,7 +188,7 @@ define <2 x i64> @stack_fold_vpcomq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vpcomq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -203,7 +203,7 @@ define <16 x i8> @stack_fold_vpcomub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vpcomub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -218,7 +218,7 @@ define <4 x i32> @stack_fold_vpcomud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vpcomud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -233,7 +233,7 @@ define <2 x i64> @stack_fold_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vpcomuq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -248,7 +248,7 @@ define <8 x i16> @stack_fold_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vpcomuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -263,7 +263,7 @@ define <8 x i16> @stack_fold_vpcomw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vpcomw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -278,7 +278,7 @@ define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) { ; CHECK-LABEL: stack_fold_vpermil2pd_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -291,7 +291,7 @@ define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) { ; CHECK-LABEL: stack_fold_vpermil2pd_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -334,7 +334,7 @@ define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpermil2ps_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -347,7 +347,7 @@ define <4 x float> 
@stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) { ; CHECK-LABEL: stack_fold_vpermil2ps_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -390,7 +390,7 @@ define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vphaddbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -405,7 +405,7 @@ define <2 x i64> @stack_fold_vphaddbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vphaddbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -420,7 +420,7 @@ define <8 x i16> @stack_fold_vphaddbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vphaddbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -435,7 +435,7 @@ define <2 x i64> @stack_fold_vphadddq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vphadddq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -450,7 +450,7 @@ define <4 x i32> @stack_fold_vphaddubd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vphaddubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -465,7 +465,7 @@ define <2 x i64> @stack_fold_vphaddubq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vphaddubq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -480,7 +480,7 @@ define <8 x i16> @stack_fold_vphaddubw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vphaddubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -495,7 +495,7 @@ define <2 x i64> @stack_fold_vphaddudq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vphaddudq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -510,7 +510,7 @@ define <4 x i32> @stack_fold_vphadduwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_vphadduwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -525,7 +525,7 @@ define <2 x i64> @stack_fold_vphadduwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_vphadduwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -540,7 +540,7 @@ define <4 x i32> @stack_fold_vphaddwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_vphaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -555,7 +555,7 @@ define <2 x i64> @stack_fold_vphaddwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_vphaddwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -570,7 +570,7 @@ define <8 x i16> @stack_fold_vphsubbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vphsubbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -585,7 +585,7 @@ define <2 x i64> @stack_fold_vphsubdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vphsubdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -600,7 +600,7 @@ define <4 x i32> @stack_fold_vphsubwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_vphsubwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -615,8 +615,8 @@ define <4 x i32> @stack_fold_vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpmacsdd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -632,8 +632,8 @@ define <2 x i64> @stack_fold_vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { ; CHECK-LABEL: stack_fold_vpmacsdqh: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -649,8 +649,8 @@ define <2 x i64> @stack_fold_vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { ; CHECK-LABEL: stack_fold_vpmacsdql: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -666,8 +666,8 @@ define <4 x i32> @stack_fold_vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpmacssdd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -683,8 +683,8 @@ define <2 x i64> @stack_fold_vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { ; CHECK-LABEL: stack_fold_vpmacssdqh: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -700,8 +700,8 @@ define <2 x i64> @stack_fold_vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { ; CHECK-LABEL: stack_fold_vpmacssdql: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -717,8 +717,8 @@ define <4 x i32> @stack_fold_vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpmacsswd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -734,8 +734,8 @@ define <8 x i16> @stack_fold_vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpmacssww: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -751,8 +751,8 @@ define <4 x i32> @stack_fold_vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpmacswd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -768,8 +768,8 @@ define <8 x i16> @stack_fold_vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpmacsww: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -785,8 +785,8 @@ define <4 x i32> @stack_fold_vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpmadcsswd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -802,8 +802,8 @@ define <4 x i32> @stack_fold_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpmadcswd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -819,7 +819,7 @@ define <16 x i8> @stack_fold_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpperm_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -832,7 +832,7 @@ define <16 x i8> @stack_fold_vpperm_mr(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpperm_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -847,7 +847,7 @@ define <16 x i8> @stack_fold_vprotb(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_vprotb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -862,7 +862,7 @@ define <16 x i8> @stack_fold_vprotb_rm(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vprotb_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -875,7 +875,7 @@ define <16 x i8> @stack_fold_vprotb_mr(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vprotb_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -890,7 +890,7 @@ define <4 x i32> @stack_fold_vprotd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vprotd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -905,7 +905,7 @@ define <4 x i32> @stack_fold_vprotd_rm(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vprotd_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -918,7 +918,7 @@ define <4 x i32> @stack_fold_vprotd_mr(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vprotd_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -933,7 +933,7 @@ define <2 x i64> @stack_fold_vprotq(<2 x i64> %a0) { ; CHECK-LABEL: 
stack_fold_vprotq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -948,7 +948,7 @@ define <2 x i64> @stack_fold_vprotq_rm(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vprotq_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -961,7 +961,7 @@ define <2 x i64> @stack_fold_vprotq_mr(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vprotq_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -976,7 +976,7 @@ define <8 x i16> @stack_fold_vprotw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_vprotw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -991,7 +991,7 @@ define <8 x i16> @stack_fold_vprotw_rm(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vprotw_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1004,7 +1004,7 @@ define <8 x i16> @stack_fold_vprotw_mr(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vprotw_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1019,7 +1019,7 @@ define <16 x i8> @stack_fold_vpshab_rm(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vpshab_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1032,7 +1032,7 @@ define <16 x i8> @stack_fold_vpshab_mr(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vpshab_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1047,7 +1047,7 @@ define <4 x i32> @stack_fold_vpshad_rm(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vpshad_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1060,7 +1060,7 @@ define <4 x i32> @stack_fold_vpshad_mr(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vpshad_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1075,7 +1075,7 @@ define <2 x i64> @stack_fold_vpshaq_rm(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vpshaq_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1088,7 +1088,7 @@ define <2 x i64> @stack_fold_vpshaq_mr(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vpshaq_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1103,7 +1103,7 @@ define <8 x i16> @stack_fold_vpshaw_rm(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vpshaw_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1116,7 +1116,7 @@ define <8 x i16> @stack_fold_vpshaw_mr(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vpshaw_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1131,7 +1131,7 @@ define <16 x i8> @stack_fold_vpshlb_rm(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vpshlb_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1144,7 +1144,7 @@ define <16 x i8> @stack_fold_vpshlb_mr(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_vpshlb_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1159,7 +1159,7 @@ define <4 x i32> @stack_fold_vpshld_rm(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vpshld_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1172,7 +1172,7 @@ define <4 x i32> @stack_fold_vpshld_mr(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_vpshld_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1187,7 +1187,7 @@ define <2 x i64> @stack_fold_vpshlq_rm(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vpshlq_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1200,7 +1200,7 @@ define <2 x i64> @stack_fold_vpshlq_mr(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_vpshlq_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1215,7 +1215,7 @@ define <8 x i16> @stack_fold_vpshlw_rm(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vpshlw_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1228,7 +1228,7 @@ define <8 x i16> @stack_fold_vpshlw_mr(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_vpshlw_mr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll b/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll --- a/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll +++ b/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll @@ -21,8 +21,8 @@ ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: .Ltmp0: @@ -40,7 +40,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups (%rdi), %ymm0 ; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo @@ -66,11 +66,11 @@ ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq do_safepoint ; CHECK-NEXT: .Ltmp2: -; CHECK-NEXT: vmovaps (%rsp), %ymm0 +; CHECK-NEXT: vmovups (%rsp), %ymm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -63,7 +63,7 @@ ; X32-AVX-LABEL: test_broadcast_4f64_8f64: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovups (%eax), %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -75,7 +75,7 @@ ; ; X64-AVX-LABEL: test_broadcast_4f64_8f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; @@ -148,7 +148,7 @@ ; X32-AVX-LABEL: test_broadcast_4i64_8i64: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovups (%eax), %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -160,7 +160,7 @@ ; ; X64-AVX-LABEL: test_broadcast_4i64_8i64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; @@ -222,7 +222,7 @@ ; X32-AVX-LABEL: test_broadcast_8f32_16f32: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovups (%eax), %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -234,7 +234,7 @@ ; ; X64-AVX-LABEL: test_broadcast_8f32_16f32: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; @@ -307,7 +307,7 @@ ; X32-AVX-LABEL: test_broadcast_8i32_16i32: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps (%eax), 
%ymm0 +; X32-AVX-NEXT: vmovups (%eax), %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -319,7 +319,7 @@ ; ; X64-AVX-LABEL: test_broadcast_8i32_16i32: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; @@ -392,7 +392,7 @@ ; X32-AVX-LABEL: test_broadcast_16i16_32i16: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovups (%eax), %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -404,7 +404,7 @@ ; ; X64-AVX-LABEL: test_broadcast_16i16_32i16: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; @@ -477,7 +477,7 @@ ; X32-AVX-LABEL: test_broadcast_32i8_64i8: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vmovaps (%eax), %ymm0 +; X32-AVX-NEXT: vmovups (%eax), %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -489,7 +489,7 @@ ; ; X64-AVX-LABEL: test_broadcast_32i8_64i8: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; @@ -511,15 +511,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovups (%ecx), %xmm0 +; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2f64_4f64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 @@ -533,15 +533,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovups (%ecx), %xmm0 +; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_2i64_4i64_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 @@ -555,15 +555,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovups (%ecx), %xmm0 +; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_4f32_8f32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 @@ -577,15 +577,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovups (%ecx), %xmm0 +; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: 
test_broadcast_4i32_8i32_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 @@ -599,15 +599,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovups (%ecx), %xmm0 +; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_8i16_16i16_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p0 @@ -621,15 +621,15 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovups (%ecx), %xmm0 +; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_broadcast_16i8_32i8_reuse: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: vmovups %xmm0, (%rsi) ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 @@ -649,7 +649,7 @@ ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vmovups %xmm1, (%eax) ; X32-AVX-NEXT: retl ; ; X32-AVX512-LABEL: test_broadcast_4i32_8i32_chain: @@ -658,21 +658,21 @@ ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX512-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512-NEXT: vmovups %xmm1, (%eax) ; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vmovups %xmm1, (%rsi) ; X64-AVX-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512-NEXT: vmovups %xmm1, (%rsi) ; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -687,7 +687,7 @@ ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vmovups %xmm1, (%eax) ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -697,14 +697,14 @@ ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X32-AVX512-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512-NEXT: vmovups %xmm1, (%eax) ; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: 
vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vmovups %xmm1, (%rsi) ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; @@ -712,7 +712,7 @@ ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512-NEXT: vmovups %xmm1, (%rsi) ; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -736,7 +736,7 @@ ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0] ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0] +; X32-AVX1-NEXT: vmovups {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0] ; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 ; X32-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 @@ -787,7 +787,7 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4] ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,2,3,4] +; X64-AVX1-NEXT: vmovups {{.*#+}} ymm6 = [1,2,3,4] ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/swap.ll b/llvm/test/CodeGen/X86/swap.ll --- a/llvm/test/CodeGen/X86/swap.ll +++ b/llvm/test/CodeGen/X86/swap.ll @@ -12,10 +12,10 @@ ; NOAA-LABEL: _Z4SwapP1SS0_: ; NOAA: # %bb.0: # %entry ; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; NOAA-NEXT: vmovups (%rsi), %xmm0 ; NOAA-NEXT: vmovups %xmm0, (%rdi) -; NOAA-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; NOAA-NEXT: vmovups -{{[0-9]+}}(%rsp), %xmm0 ; NOAA-NEXT: vmovups %xmm0, (%rsi) ; NOAA-NEXT: retq ; @@ -87,14 +87,14 @@ ; NOAA-LABEL: twoallocs: ; NOAA: # %bb.0: # %entry ; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; NOAA-NEXT: vmovups %xmm0, (%rsi) ; NOAA-NEXT: retq ; ; AA-LABEL: twoallocs: ; AA: # %bb.0: # %entry ; AA-NEXT: vmovups (%rdi), %xmm0 -; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AA-NEXT: vmovups %xmm0, (%rsi) ; AA-NEXT: retq entry: @@ -116,16 +116,16 @@ ; NOAA-LABEL: onealloc_readback_1: ; NOAA: # %bb.0: # %entry ; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; NOAA-NEXT: vmovups (%rsi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; NOAA-NEXT: vmovups %xmm0, (%rdi) ; NOAA-NEXT: retq ; ; AA-LABEL: onealloc_readback_1: ; AA: # %bb.0: # %entry ; AA-NEXT: vmovups (%rsi), %xmm0 -; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AA-NEXT: vmovups %xmm0, (%rdi) ; AA-NEXT: retq entry: @@ -146,16 +146,16 @@ ; NOAA-LABEL: onealloc_readback_2: ; NOAA: # %bb.0: # %entry ; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; NOAA-NEXT: vmovups (%rsi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; NOAA-NEXT: vmovups %xmm0, (%rdi) ; NOAA-NEXT: retq ; ; AA-LABEL: onealloc_readback_2: ; AA: # %bb.0: # %entry ; AA-NEXT: vmovups (%rsi), %xmm0 -; AA-NEXT: 
vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AA-NEXT: vmovups %xmm0, (%rdi) ; AA-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/swizzle-avx2.ll b/llvm/test/CodeGen/X86/swizzle-avx2.ll --- a/llvm/test/CodeGen/X86/swizzle-avx2.ll +++ b/llvm/test/CodeGen/X86/swizzle-avx2.ll @@ -14,7 +14,7 @@ define <8 x i32> @swizzle_1(<8 x i32> %v) { ; CHECK-LABEL: swizzle_1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> @@ -45,7 +45,7 @@ define <8 x i32> @swizzle_4(<8 x i32> %v) { ; CHECK-LABEL: swizzle_4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> @@ -56,7 +56,7 @@ define <8 x i32> @swizzle_5(<8 x i32> %v) { ; CHECK-LABEL: swizzle_5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> @@ -67,7 +67,7 @@ define <8 x i32> @swizzle_6(<8 x i32> %v) { ; CHECK-LABEL: swizzle_6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> @@ -78,7 +78,7 @@ define <8 x i32> @swizzle_7(<8 x i32> %v) { ; CHECK-LABEL: swizzle_7: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll --- a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll +++ b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll @@ -8,7 +8,7 @@ define <8 x float> @load32bytes(<8 x float>* %Ap) { ; AVXSLOW-LABEL: load32bytes: ; AVXSLOW: # %bb.0: -; AVXSLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVXSLOW-NEXT: vmovups (%rdi), %xmm0 ; AVXSLOW-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 ; AVXSLOW-NEXT: retq ; @@ -31,7 +31,7 @@ ; AVXSLOW-LABEL: store32bytes: ; AVXSLOW: # %bb.0: ; AVXSLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi) -; AVXSLOW-NEXT: vmovaps %xmm0, (%rdi) +; AVXSLOW-NEXT: vmovups %xmm0, (%rdi) ; AVXSLOW-NEXT: vzeroupper ; AVXSLOW-NEXT: retq ; @@ -81,17 +81,17 @@ define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) { ; AVXSLOW-LABEL: combine_16_byte_loads_aligned: ; AVXSLOW: # %bb.0: -; AVXSLOW-NEXT: vmovaps 48(%rdi), %ymm0 +; AVXSLOW-NEXT: vmovups 48(%rdi), %ymm0 ; AVXSLOW-NEXT: retq ; ; AVXFAST-LABEL: combine_16_byte_loads_aligned: ; AVXFAST: # %bb.0: -; AVXFAST-NEXT: vmovaps 48(%rdi), %ymm0 +; AVXFAST-NEXT: vmovups 48(%rdi), %ymm0 ; AVXFAST-NEXT: retq ; ; AVX2-LABEL: combine_16_byte_loads_aligned: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps 48(%rdi), %ymm0 +; AVX2-NEXT: vmovups 48(%rdi), %ymm0 ; AVX2-NEXT: retq %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3 %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 diff --git 
a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -98,7 +98,7 @@ ; ; CHECK-XOP-LABEL: out_constant_varx_mone_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovups (%rdi), %xmm0 ; CHECK-XOP-NEXT: vorps (%rdx), %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 @@ -319,7 +319,7 @@ ; ; CHECK-XOP-LABEL: out_constant_mone_vary: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0 +; CHECK-XOP-NEXT: vmovups (%rsi), %xmm0 ; CHECK-XOP-NEXT: vorps (%rdx), %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 @@ -351,7 +351,7 @@ ; ; CHECK-XOP-LABEL: in_constant_mone_vary: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0 +; CHECK-XOP-NEXT: vmovups (%rsi), %xmm0 ; CHECK-XOP-NEXT: vorps (%rdx), %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, <4 x i32> *%px, align 16 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -488,7 +488,7 @@ ; ; CHECK-AVX1-LABEL: test_urem_one_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vmovups {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_one_eq: diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -17,7 +17,7 @@ ; X32-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X32-NEXT: vandps LCPI0_0, %ymm0, %ymm0 -; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -30,7 +30,7 @@ ; X64-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovaps %ymm0, (%rax) +; X64-NEXT: vmovups %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -81,7 +81,7 @@ ; X32-NEXT: vmovups (%ecx), %ymm0 ; X32-NEXT: vcmpnltps (%eax), %ymm0, %ymm0 ; X32-NEXT: vandps LCPI1_0, %ymm0, %ymm0 -; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -90,7 +90,7 @@ ; X64-NEXT: vmovups (%rsi), %ymm0 ; X64-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0 ; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovaps %ymm0, (%rax) +; X64-NEXT: vmovups %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -102,7 +102,7 @@ ; X32-AVX2-NEXT: vcmpnltps (%eax), %ymm0, %ymm0 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] ; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X32-AVX2-NEXT: vmovaps %ymm0, (%eax) +; X32-AVX2-NEXT: vmovups %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl ; @@ -112,7 +112,7 @@ ; X64-AVX2-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] ; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX2-NEXT: vmovups %ymm0, (%rax) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 @@ -167,7 +167,7 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; 
X32-LABEL: two_ands: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI3_1, %ymm0, %ymm0 ; X32-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -175,7 +175,7 @@ ; ; X64-LABEL: two_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -209,7 +209,7 @@ define <8 x i32> @three_ands(<8 x float> %x) { ; X32-LABEL: three_ands: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI4_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -220,7 +220,7 @@ ; ; X64-LABEL: three_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -265,7 +265,7 @@ define <8 x i32> @four_ands(<8 x float> %x) { ; X32-LABEL: four_ands: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI5_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -278,7 +278,7 @@ ; ; X64-LABEL: four_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -333,7 +333,7 @@ define <8 x i32> @five_ands(<8 x float> %x) { ; X32-LABEL: five_ands: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI6_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -348,7 +348,7 @@ ; ; X64-LABEL: five_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -413,7 +413,7 @@ define <8 x i32> @two_or(<8 x float> %x) { ; X32-LABEL: two_or: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI7_1, %ymm0, %ymm0 ; X32-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -421,7 +421,7 @@ ; ; X64-LABEL: two_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -455,7 +455,7 @@ define <8 x i32> @three_or(<8 x float> %x) { ; X32-LABEL: three_or: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI8_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -466,7 +466,7 @@ ; ; X64-LABEL: three_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -511,7 +511,7 @@ define <8 x i32> @four_or(<8 x float> %x) { ; X32-LABEL: four_or: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI9_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -524,7 +524,7 @@ ; ; X64-LABEL: four_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -579,7 +579,7 @@ define <8 x i32> @five_or(<8 x float> %x) { ; X32-LABEL: five_or: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI10_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -594,7 +594,7 @@ ; ; X64-LABEL: five_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -659,7 +659,7 @@ define <8 x i32> @three_or_and(<8 x float> %x) { ; X32-LABEL: three_or_and: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI11_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -670,7 +670,7 @@ ; ; X64-LABEL: three_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -715,7 +715,7 @@ define <8 x i32> @four_or_and(<8 x float> %x) { ; X32-LABEL: four_or_and: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI12_1, %ymm0, %ymm2 ; X32-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -728,7 +728,7 @@ ; ; X64-LABEL: four_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -783,7 +783,7 @@ define <8 x i32> @five_or_and(<8 x float> %x) { ; X32-LABEL: five_or_and: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI13_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -798,7 +798,7 @@ ; ; X64-LABEL: five_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -863,7 +863,7 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) { ; X32-LABEL: four_or_and_xor: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI14_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %ymm2, %ymm1, %ymm1 @@ -876,7 +876,7 @@ ; ; X64-LABEL: four_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1 @@ -931,7 +931,7 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; X32-LABEL: five_or_and_xor: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, 
%ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI15_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -946,7 +946,7 @@ ; ; X64-LABEL: five_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1010,7 +1010,7 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; X32-LABEL: six_or_and_xor: ; X32: ## %bb.0: ## %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X32-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-NEXT: vcmpltps LCPI16_1, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1027,7 +1027,7 @@ ; ; X64-LABEL: six_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vmovups {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/vaargs.ll b/llvm/test/CodeGen/X86/vaargs.ll --- a/llvm/test/CodeGen/X86/vaargs.ll +++ b/llvm/test/CodeGen/X86/vaargs.ll @@ -9,14 +9,14 @@ ; CHECK: testb %al, %al ; CHECK-NEXT: je ; CHECK-NEXT: ## %bb.{{[0-9]+}}: -; CHECK-NEXT: vmovaps %xmm0, 48(%rsp) -; CHECK-NEXT: vmovaps %xmm1, 64(%rsp) -; CHECK-NEXT: vmovaps %xmm2, 80(%rsp) -; CHECK-NEXT: vmovaps %xmm3, 96(%rsp) -; CHECK-NEXT: vmovaps %xmm4, 112(%rsp) -; CHECK-NEXT: vmovaps %xmm5, 128(%rsp) -; CHECK-NEXT: vmovaps %xmm6, 144(%rsp) -; CHECK-NEXT: vmovaps %xmm7, 160(%rsp) +; CHECK-NEXT: vmovups %xmm0, 48(%rsp) +; CHECK-NEXT: vmovups %xmm1, 64(%rsp) +; CHECK-NEXT: vmovups %xmm2, 80(%rsp) +; CHECK-NEXT: vmovups %xmm3, 96(%rsp) +; CHECK-NEXT: vmovups %xmm4, 112(%rsp) +; CHECK-NEXT: vmovups %xmm5, 128(%rsp) +; CHECK-NEXT: vmovups %xmm6, 144(%rsp) +; CHECK-NEXT: vmovups %xmm7, 160(%rsp) ; Check that [EFLAGS] hasn't been pulled in. 
; NO-FLAGS-NOT: %flags diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -102,7 +102,7 @@ ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: vmovups %zmm0, (%rsp) ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 @@ -331,7 +331,7 @@ ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: vmovups %zmm0, (%rsp) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 @@ -556,7 +556,7 @@ ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512BW-NEXT: vmovd %xmm4, %eax -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovups %zmm0, (%rsp) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm0 @@ -1069,7 +1069,7 @@ ; AVX512F-NEXT: vpbroadcastd %esi, %zmm2 ; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm1 ; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: vmovups %zmm0, (%rsp) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 @@ -1301,10 +1301,10 @@ ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 ; AVX512F-NEXT: vpmovsxbd %xmm8, %zmm3 ; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512F-NEXT: vmovaps %zmm3, 192(%rdi) -; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512F-NEXT: vmovaps %zmm2, 64(%rdi) -; AVX512F-NEXT: vmovaps %zmm0, (%rdi) +; AVX512F-NEXT: vmovups %zmm3, 192(%rdi) +; AVX512F-NEXT: vmovups %zmm1, 128(%rdi) +; AVX512F-NEXT: vmovups %zmm2, 64(%rdi) +; AVX512F-NEXT: vmovups %zmm0, (%rdi) ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp ; AVX512F-NEXT: vzeroupper @@ -1320,7 +1320,7 @@ ; AVX512BW-NEXT: vpbroadcastd %esi, %zmm2 ; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm1 ; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovups %zmm0, (%rsp) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm0 @@ -1552,10 +1552,10 @@ ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovsxbd %xmm8, %zmm3 ; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512BW-NEXT: vmovaps %zmm3, 192(%rdi) -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdi) -; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) +; AVX512BW-NEXT: vmovups %zmm3, 192(%rdi) +; AVX512BW-NEXT: vmovups %zmm1, 128(%rdi) +; AVX512BW-NEXT: vmovups %zmm2, 64(%rdi) +; AVX512BW-NEXT: vmovups %zmm0, (%rdi) ; AVX512BW-NEXT: movq %rbp, %rsp ; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper @@ -1639,10 +1639,10 @@ ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm3, %zmm0 ; AVX512VBMI-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm0, %zmm0 -; AVX512VBMI-NEXT: vmovaps %zmm0, 128(%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm4, 64(%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm1, (%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm2, 192(%rdi) +; AVX512VBMI-NEXT: vmovups %zmm0, 128(%rdi) +; AVX512VBMI-NEXT: vmovups %zmm4, 64(%rdi) +; AVX512VBMI-NEXT: vmovups %zmm1, (%rdi) +; AVX512VBMI-NEXT: vmovups %zmm2, 192(%rdi) ; AVX512VBMI-NEXT: movq %rbp, %rsp ; 
AVX512VBMI-NEXT: popq %rbp ; AVX512VBMI-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll --- a/llvm/test/CodeGen/X86/vec-libcalls.ll +++ b/llvm/test/CodeGen/X86/vec-libcalls.ll @@ -51,13 +51,13 @@ ; CHECK-LABEL: sin_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -69,19 +69,19 @@ ; CHECK-LABEL: sin_v3f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -93,25 +93,25 @@ ; CHECK-LABEL: sin_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; 
CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -127,23 +127,23 @@ ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -164,37 +164,37 @@ ; CHECK-NEXT: subq $88, %rsp ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 
16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: callq sinf -; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $88, %rsp @@ -211,7 +211,7 @@ ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sin -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq sin @@ -252,13 +252,13 @@ ; CHECK-LABEL: cos_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq cosf -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq cosf -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -270,13 +270,13 @@ ; CHECK-LABEL: exp_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq expf -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq expf -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -288,13 +288,13 @@ ; CHECK-LABEL: exp2_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq exp2f -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq exp2f -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -315,13 +315,13 @@ ; CHECK-LABEL: log_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq logf -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq logf -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -333,13 +333,13 @@ ; CHECK-LABEL: log10_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq log10f -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq log10f -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -351,13 +351,13 @@ ; CHECK-LABEL: log2_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq log2f -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq log2f -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll @@ -189,7 +189,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmplt_oqps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -228,7 +228,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmplt_oqps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -343,7 +343,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmple_oqps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -382,7 +382,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmple_oqps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd 
%zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -1398,7 +1398,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmpnle_uqps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -1437,7 +1437,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmpnle_uqps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -1552,7 +1552,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmpnlt_uqps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -1591,7 +1591,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmpnlt_uqps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -3456,7 +3456,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmpltps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -3495,7 +3495,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmpltps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -3552,7 +3552,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmpleps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -3591,7 +3591,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmpleps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -4326,7 +4326,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmpnleps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -4365,7 +4365,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; 
AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmpnleps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -4422,7 +4422,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX-32-NEXT: vcmpnltps %xmm2, %xmm3, %xmm2 ; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -4461,7 +4461,7 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %xmm3 ; AVX512F-32-NEXT: vcmpnltps %xmm2, %xmm3, %xmm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-256.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-cmp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-256.ll @@ -82,7 +82,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmplt_oqps %ymm2, %ymm3, %ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -121,7 +121,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmplt_oqps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -153,7 +153,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmple_oqps %ymm2, %ymm3, %ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -192,7 +192,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmple_oqps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -707,7 +707,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmpnle_uqps %ymm2, %ymm3, %ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -746,7 +746,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmpnle_uqps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -778,7 +778,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmpnlt_uqps %ymm2, %ymm3, 
%ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -817,7 +817,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmpnlt_uqps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -2030,7 +2030,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmpltps %ymm2, %ymm3, %ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -2069,7 +2069,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmpltps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -2101,7 +2101,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmpleps %ymm2, %ymm3, %ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -2140,7 +2140,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmpleps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -2655,7 +2655,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmpnleps %ymm2, %ymm3, %ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -2694,7 +2694,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmpnleps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} @@ -2726,7 +2726,7 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-32, %esp ; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX-32-NEXT: vcmpnltps %ymm2, %ymm3, %ymm2 ; AVX-32-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX-32-NEXT: movl %ebp, %esp @@ -2765,7 +2765,7 @@ ; AVX512F-32-NEXT: subl $32, %esp ; AVX512F-32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %ymm3 +; AVX512F-32-NEXT: vmovups 8(%ebp), %ymm3 ; AVX512F-32-NEXT: vcmpnltps %ymm2, %ymm3, %ymm2 ; AVX512F-32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll --- 
a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -764,7 +764,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $16, %esp ; AVX-32-NEXT: movl 8(%ebp), %eax -; AVX-32-NEXT: vmovaps (%eax), %xmm0 +; AVX-32-NEXT: vmovups (%eax), %xmm0 ; AVX-32-NEXT: vmovss %xmm0, (%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: flds (%esp) @@ -3101,10 +3101,10 @@ ; ; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX-NEXT: vmovups {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vmovups {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1180,7 +1180,7 @@ ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2] ; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vmovups {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] ; AVX-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3 ; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 @@ -1441,10 +1441,10 @@ define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v8f32_to_v8i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX-NEXT: vmovups {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vmovups {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm4 ; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vsubps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll --- a/llvm/test/CodeGen/X86/vec_cast3.ll +++ b/llvm/test/CodeGen/X86/vec_cast3.ll @@ -115,7 +115,7 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) { ; CHECK-LABEL: cvt_v2f32_v2u32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; CHECK-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm1 ; CHECK-NEXT: vcvttps2dq %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll --- a/llvm/test/CodeGen/X86/vec_extract-avx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll @@ -12,13 +12,13 @@ ; X32-LABEL: low_v8f32_to_v4f32: ; X32: 
# %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovups %xmm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: low_v8f32_to_v4f32: ; X64: # %bb.0: -; X64-NEXT: vmovaps %xmm0, (%rdi) +; X64-NEXT: vmovups %xmm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %ext0 = extractelement <8 x float> %v, i32 0 @@ -120,7 +120,7 @@ ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -129,7 +129,7 @@ ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X64-NEXT: vmovaps %ymm0, (%rsi) +; X64-NEXT: vmovups %ymm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %ld = load <2 x i32>, <2 x i32>* %in, align 8 @@ -145,14 +145,14 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: legal_vzmovl_2i64_4i64: ; X64: # %bb.0: ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovaps %ymm0, (%rsi) +; X64-NEXT: vmovups %ymm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %ld = load <2 x i64>, <2 x i64>* %in, align 8 @@ -170,7 +170,7 @@ ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; @@ -179,7 +179,7 @@ ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X64-NEXT: vmovaps %ymm0, (%rsi) +; X64-NEXT: vmovups %ymm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %ld = load <2 x float>, <2 x float>* %in, align 8 @@ -195,14 +195,14 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: legal_vzmovl_2f64_4f64: ; X64: # %bb.0: ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovaps %ymm0, (%rsi) +; X64-NEXT: vmovups %ymm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %ld = load <2 x double>, <2 x double>* %in, align 8 diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -129,7 +129,7 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) { ; X32_AVX-LABEL: fabs_v8f64: ; X32_AVX: # %bb.0: -; X32_AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X32_AVX-NEXT: vmovups {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] ; X32_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X32_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X32_AVX-NEXT: retl @@ -146,7 +146,7 @@ ; ; X64_AVX-LABEL: fabs_v8f64: ; X64_AVX: # %bb.0: -; X64_AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X64_AVX-NEXT: vmovups {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] ; X64_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X64_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64_AVX-NEXT: retq @@ -168,7 +168,7 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) { ; X32_AVX-LABEL: fabs_v16f32: ; X32_AVX: # %bb.0: -; X32_AVX-NEXT: vmovaps 
{{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X32_AVX-NEXT: vmovups {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X32_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X32_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X32_AVX-NEXT: retl @@ -185,7 +185,7 @@ ; ; X64_AVX-LABEL: fabs_v16f32: ; X64_AVX: # %bb.0: -; X64_AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64_AVX-NEXT: vmovups {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X64_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X64_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64_AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -708,12 +708,12 @@ ; ; AVX-LABEL: const_floor_v2f64: ; AVX: ## %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] ; AVX-NEXT: retq ; ; AVX512-LABEL: const_floor_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <2 x double> @llvm.floor.v2f64(<2 x double> ) ret <2 x double> %t @@ -727,12 +727,12 @@ ; ; AVX-LABEL: const_floor_v4f32: ; AVX: ## %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0] ; AVX-NEXT: retq ; ; AVX512-LABEL: const_floor_v4f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <4 x float> @llvm.floor.v4f32(<4 x float> ) ret <4 x float> %t @@ -746,12 +746,12 @@ ; ; AVX-LABEL: const_ceil_v2f64: ; AVX: ## %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [-1.0E+0,3.0E+0] ; AVX-NEXT: retq ; ; AVX512-LABEL: const_ceil_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm0 = [-1.0E+0,3.0E+0] ; AVX512-NEXT: retq %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> ) ret <2 x double> %t @@ -765,12 +765,12 @@ ; ; AVX-LABEL: const_ceil_v4f32: ; AVX: ## %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0] ; AVX-NEXT: retq ; ; AVX512-LABEL: const_ceil_v4f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0] ; AVX512-NEXT: retq %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> ) ret <4 x float> %t @@ -784,12 +784,12 @@ ; ; AVX-LABEL: const_trunc_v2f64: ; AVX: ## %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [-1.0E+0,2.0E+0] ; AVX-NEXT: retq ; ; AVX512-LABEL: const_trunc_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm0 = [-1.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> ) ret <2 x double> %t @@ -803,12 +803,12 @@ ; ; AVX-LABEL: const_trunc_v4f32: ; AVX: ## %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0] ; AVX-NEXT: retq ; ; AVX512-LABEL: const_trunc_v4f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} 
xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> ) ret <4 x float> %t diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -1214,7 +1214,7 @@ ; ; AVX1-LABEL: fptoui_2f32_to_2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vmovups {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 @@ -1282,7 +1282,7 @@ ; ; AVX1-LABEL: fptoui_4f32_to_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vmovups {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 @@ -1533,7 +1533,7 @@ ; ; AVX1-LABEL: fptoui_8f32_to_8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vmovups {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX1-NEXT: vsubps %ymm1, %ymm0, %ymm1 ; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 @@ -1950,7 +1950,7 @@ ; ; AVX-LABEL: fptosi_2f64_to_2i64_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,18446744073709551615] ; AVX-NEXT: retq %cvt = fptosi <2 x double> to <2 x i64> ret <2 x i64> %cvt @@ -1964,7 +1964,7 @@ ; ; AVX-LABEL: fptosi_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX-NEXT: vmovups {{.*#+}} xmm0 = <4294967295,1,u,u> ; AVX-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> @@ -1980,7 +1980,7 @@ ; ; AVX-LABEL: fptosi_4f64_to_4i64_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] ; AVX-NEXT: retq %cvt = fptosi <4 x double> to <4 x i64> ret <4 x i64> %cvt @@ -1994,7 +1994,7 @@ ; ; AVX-LABEL: fptosi_4f64_to_4i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967295,1,4294967294,3] ; AVX-NEXT: retq %cvt = fptosi <4 x double> to <4 x i32> ret <4 x i32> %cvt @@ -2008,7 +2008,7 @@ ; ; AVX-LABEL: fptoui_2f64_to_2i64_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [2,4] ; AVX-NEXT: retq %cvt = fptoui <2 x double> to <2 x i64> ret <2 x i64> %cvt @@ -2022,7 +2022,7 @@ ; ; AVX-LABEL: fptoui_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> +; AVX-NEXT: vmovups {{.*#+}} xmm0 = <2,4,u,u> ; AVX-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> @@ -2038,7 +2038,7 @@ ; ; AVX-LABEL: fptoui_4f64_to_4i64_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [2,4,6,8] ; AVX-NEXT: retq %cvt = fptoui <4 x double> to <4 
x i64> ret <4 x i64> %cvt @@ -2052,7 +2052,7 @@ ; ; AVX-LABEL: fptoui_4f64_to_4i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [2,4,6,8] ; AVX-NEXT: retq %cvt = fptoui <4 x double> to <4 x i32> ret <4 x i32> %cvt @@ -2066,7 +2066,7 @@ ; ; AVX-LABEL: fptosi_4f32_to_4i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,4294967295,2,3] ; AVX-NEXT: retq %cvt = fptosi <4 x float> to <4 x i32> ret <4 x i32> %cvt @@ -2081,7 +2081,7 @@ ; ; AVX-LABEL: fptosi_4f32_to_4i64_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [1,18446744073709551615,2,3] ; AVX-NEXT: retq %cvt = fptosi <4 x float> to <4 x i64> ret <4 x i64> %cvt @@ -2096,7 +2096,7 @@ ; ; AVX-LABEL: fptosi_8f32_to_8i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] ; AVX-NEXT: retq %cvt = fptosi <8 x float> to <8 x i32> ret <8 x i32> %cvt @@ -2110,7 +2110,7 @@ ; ; AVX-LABEL: fptoui_4f32_to_4i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,2,4,6] ; AVX-NEXT: retq %cvt = fptoui <4 x float> to <4 x i32> ret <4 x i32> %cvt @@ -2125,7 +2125,7 @@ ; ; AVX-LABEL: fptoui_4f32_to_4i64_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [1,2,4,8] ; AVX-NEXT: retq %cvt = fptoui <4 x float> to <4 x i64> ret <4 x i64> %cvt @@ -2140,7 +2140,7 @@ ; ; AVX-LABEL: fptoui_8f32_to_8i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] ; AVX-NEXT: retq %cvt = fptoui <8 x float> to <8 x i32> ret <8 x i32> %cvt @@ -2276,10 +2276,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx ; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __fixtfsi ; AVX-NEXT: movl %eax, %ebx -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __fixtfsi ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: vmovd %ebx, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll --- a/llvm/test/CodeGen/X86/vec_fpext.ll +++ b/llvm/test/CodeGen/X86/vec_fpext.ll @@ -260,15 +260,15 @@ ; ; X32-AVX-LABEL: fpext_fromconst: ; X32-AVX: # %bb.0: # %entry -; X32-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0] -; X32-AVX-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X32-AVX-NEXT: vmovups {{.*#+}} xmm0 = [1.0E+0,-2.0E+0] +; X32-AVX-NEXT: # encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X32-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X32-AVX-NEXT: retl # encoding: [0xc3] ; ; X32-AVX512VL-LABEL: fpext_fromconst: ; X32-AVX512VL: # %bb.0: # %entry -; X32-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] -; X32-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X32-AVX512VL-NEXT: vmovups {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] +; X32-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X32-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -281,15 +281,15 @@ ; ; X64-AVX-LABEL: fpext_fromconst: ; 
X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0] -; X64-AVX-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovups {{.*#+}} xmm0 = [1.0E+0,-2.0E+0] +; X64-AVX-NEXT: # encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: fpext_fromconst: ; X64-AVX512VL: # %bb.0: # %entry -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovups {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x10,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: @@ -311,14 +311,14 @@ ; X32-AVX-LABEL: PR42079: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX-NEXT: vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00] +; X32-AVX-NEXT: vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00] ; X32-AVX-NEXT: vcvtps2pd %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5a,0xc0] ; X32-AVX-NEXT: retl # encoding: [0xc3] ; ; X32-AVX512VL-LABEL: PR42079: ; X32-AVX512VL: # %bb.0: ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X32-AVX512VL-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] ; X32-AVX512VL-NEXT: vcvtps2pd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -330,13 +330,13 @@ ; ; X64-AVX-LABEL: PR42079: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX-NEXT: vcvtps2pd %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5a,0xc0] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: PR42079: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512VL-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] ; X64-AVX512VL-NEXT: vcvtps2pd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %a = load volatile <4 x float>, <4 x float>* %x diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2955,7 +2955,7 @@ ; ; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovups (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -2994,7 +2994,7 @@ ; ; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load volatile <4 x i32>, <4 x i32> *%a @@ -3028,7 +3028,7 @@ ; ; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; 
AVX-NEXT: retq %a = load volatile <4 x i32>, <4 x i32>* %x @@ -3189,7 +3189,7 @@ ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -3359,7 +3359,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovups (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -3470,7 +3470,7 @@ ; ; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -3483,7 +3483,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovups (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -3538,7 +3538,7 @@ ; ; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -3546,13 +3546,13 @@ ; ; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vmovups (%rdi), %xmm0 ; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovups (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -3560,7 +3560,7 @@ ; ; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 ; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %a = load volatile <4 x i32>, <4 x i32>* %x @@ -3733,7 +3733,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -3797,7 +3797,7 @@ ; ; AVX512F-LABEL: uitofp_load_4i32_to_4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -3809,7 +3809,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovups (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -3982,7 +3982,7 @@ ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ 
-4144,7 +4144,7 @@ ; ; VEX-LABEL: sitofp_load_8i64_to_8f32: ; VEX: # %bb.0: -; VEX-NEXT: vmovaps (%rdi), %xmm0 +; VEX-NEXT: vmovups (%rdi), %xmm0 ; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 ; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 ; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4175,7 +4175,7 @@ ; ; AVX512F-LABEL: sitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4206,7 +4206,7 @@ ; ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vmovups (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4564,7 +4564,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -4628,7 +4628,7 @@ ; ; AVX512F-LABEL: uitofp_load_4i32_to_4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -4641,7 +4641,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovups (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -4981,7 +4981,7 @@ ; ; AVX2-LABEL: uitofp_load_8i64_to_8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovups (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 @@ -5029,7 +5029,7 @@ ; ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovups (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -5060,7 +5060,7 @@ ; ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vmovups (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -5149,7 +5149,7 @@ ; ; AVX1-LABEL: uitofp_load_8i32_to_8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovups (%rdi), %ymm0 ; AVX1-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 @@ -5177,7 +5177,7 @@ ; ; AVX512F-LABEL: uitofp_load_8i32_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vmovups (%rdi), %ymm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -5189,7 +5189,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -5330,7 +5330,7 @@ ; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps 
%ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rax) +; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -5339,7 +5339,7 @@ ; AVX2-NEXT: movq 24(%rdi), %rax ; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, (%rax) +; AVX2-NEXT: vmovups %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5348,7 +5348,7 @@ ; AVX512-NEXT: movq 24(%rdi), %rax ; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0 ; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: vmovaps %ymm0, (%rax) +; AVX512-NEXT: vmovups %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = load %Arguments, %Arguments* %a0, align 1 diff --git a/llvm/test/CodeGen/X86/vec_logical.ll b/llvm/test/CodeGen/X86/vec_logical.ll --- a/llvm/test/CodeGen/X86/vec_logical.ll +++ b/llvm/test/CodeGen/X86/vec_logical.ll @@ -12,7 +12,7 @@ ; AVX-LABEL: t: ; AVX: # %bb.0: ; AVX-NEXT: vxorps {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, 0 +; AVX-NEXT: vmovups %xmm0, 0 ; AVX-NEXT: retl %tmp1277 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %A store <4 x float> %tmp1277, <4 x float>* null @@ -71,7 +71,7 @@ ; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vorps (%ecx), %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%eax) +; AVX-NEXT: vmovups %xmm0, (%eax) ; AVX-NEXT: retl entry: %tmp3 = load <4 x float>, <4 x float>* %c diff --git a/llvm/test/CodeGen/X86/vec_minmax_sint.ll b/llvm/test/CodeGen/X86/vec_minmax_sint.ll --- a/llvm/test/CodeGen/X86/vec_minmax_sint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_sint.ll @@ -1555,7 +1555,7 @@ ; ; AVX-LABEL: max_gt_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551615,7] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -1573,7 +1573,7 @@ ; ; AVX-LABEL: max_gt_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 @@ -1590,7 +1590,7 @@ ; ; AVX-LABEL: max_gt_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -1608,7 +1608,7 @@ ; ; AVX-LABEL: max_gt_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -1625,7 +1625,7 @@ ; ; AVX-LABEL: max_gt_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 @@ -1643,7 +1643,7 @@ ; ; AVX-LABEL: max_gt_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] ; 
AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 @@ -1660,7 +1660,7 @@ ; ; AVX-LABEL: max_gt_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 @@ -1677,7 +1677,7 @@ ; ; AVX-LABEL: max_ge_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551615,7] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -1695,7 +1695,7 @@ ; ; AVX-LABEL: max_ge_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 @@ -1712,7 +1712,7 @@ ; ; AVX-LABEL: max_ge_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -1730,7 +1730,7 @@ ; ; AVX-LABEL: max_ge_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -1747,7 +1747,7 @@ ; ; AVX-LABEL: max_ge_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 @@ -1765,7 +1765,7 @@ ; ; AVX-LABEL: max_ge_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] ; AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 @@ -1782,7 +1782,7 @@ ; ; AVX-LABEL: max_ge_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 @@ -1799,7 +1799,7 @@ ; ; AVX-LABEL: min_lt_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551609,1] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -1817,7 +1817,7 @@ ; ; AVX-LABEL: min_lt_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 @@ -1834,7 +1834,7 @@ ; ; AVX-LABEL: min_lt_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = 
[4294967289,4294967289,1,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -1852,7 +1852,7 @@ ; ; AVX-LABEL: min_lt_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -1869,7 +1869,7 @@ ; ; AVX-LABEL: min_lt_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 @@ -1887,7 +1887,7 @@ ; ; AVX-LABEL: min_lt_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 @@ -1904,7 +1904,7 @@ ; ; AVX-LABEL: min_lt_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 @@ -1921,7 +1921,7 @@ ; ; AVX-LABEL: min_le_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551609,1] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -1939,7 +1939,7 @@ ; ; AVX-LABEL: min_le_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 @@ -1956,7 +1956,7 @@ ; ; AVX-LABEL: min_le_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -1974,7 +1974,7 @@ ; ; AVX-LABEL: min_le_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -1991,7 +1991,7 @@ ; ; AVX-LABEL: min_le_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 @@ -2009,7 +2009,7 @@ ; ; AVX-LABEL: min_le_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = 
insertelement <16 x i16> , i16 -1, i32 0 @@ -2026,7 +2026,7 @@ ; ; AVX-LABEL: min_le_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -1675,7 +1675,7 @@ ; ; AVX-LABEL: max_gt_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551615,7] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -1693,7 +1693,7 @@ ; ; AVX-LABEL: max_gt_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 @@ -1710,7 +1710,7 @@ ; ; AVX-LABEL: max_gt_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -1728,7 +1728,7 @@ ; ; AVX-LABEL: max_gt_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -1745,7 +1745,7 @@ ; ; AVX-LABEL: max_gt_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 @@ -1763,7 +1763,7 @@ ; ; AVX-LABEL: max_gt_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] ; AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 @@ -1780,7 +1780,7 @@ ; ; AVX-LABEL: max_gt_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 @@ -1797,7 +1797,7 @@ ; ; AVX-LABEL: max_ge_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551615,7] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -1815,7 +1815,7 @@ ; ; AVX-LABEL: max_ge_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, 
i32 0 @@ -1832,7 +1832,7 @@ ; ; AVX-LABEL: max_ge_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -1850,7 +1850,7 @@ ; ; AVX-LABEL: max_ge_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -1867,7 +1867,7 @@ ; ; AVX-LABEL: max_ge_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 @@ -1885,7 +1885,7 @@ ; ; AVX-LABEL: max_ge_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] ; AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 @@ -1902,7 +1902,7 @@ ; ; AVX-LABEL: max_ge_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 @@ -1919,7 +1919,7 @@ ; ; AVX-LABEL: min_lt_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551609,1] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -1937,7 +1937,7 @@ ; ; AVX-LABEL: min_lt_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 @@ -1954,7 +1954,7 @@ ; ; AVX-LABEL: min_lt_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -1972,7 +1972,7 @@ ; ; AVX-LABEL: min_lt_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -1989,7 +1989,7 @@ ; ; AVX-LABEL: min_lt_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 1, i32 0 @@ -2007,7 +2007,7 @@ ; ; AVX-LABEL: min_lt_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = 
[1,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 1, i32 0 @@ -2024,7 +2024,7 @@ ; ; AVX-LABEL: min_lt_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 1, i32 0 @@ -2041,7 +2041,7 @@ ; ; AVX-LABEL: min_le_v2i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [18446744073709551609,1] ; AVX-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 @@ -2059,7 +2059,7 @@ ; ; AVX-LABEL: min_le_v4i64c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 @@ -2076,7 +2076,7 @@ ; ; AVX-LABEL: min_le_v4i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; AVX-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 @@ -2094,7 +2094,7 @@ ; ; AVX-LABEL: min_le_v8i32c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 @@ -2111,7 +2111,7 @@ ; ; AVX-LABEL: min_le_v8i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] ; AVX-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 @@ -2129,7 +2129,7 @@ ; ; AVX-LABEL: min_le_v16i16c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 @@ -2146,7 +2146,7 @@ ; ; AVX-LABEL: min_le_v16i8c: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; AVX-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll --- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll +++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll @@ -221,7 +221,7 @@ ; X32_AVX-NEXT: subl $28, %esp ; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32_AVX-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill +; X32_AVX-NEXT: vmovups %xmm0, (%esp) ## 16-byte Spill ; X32_AVX-NEXT: calll _f ; X32_AVX-NEXT: vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload ; X32_AVX-NEXT: addl $28, %esp @@ -231,7 +231,7 @@ ; X64_AVX: ## %bb.0: ; X64_AVX-NEXT: subq $24, %rsp ; X64_AVX-NEXT: vmovss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; X64_AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill +; X64_AVX-NEXT: vmovups %xmm0, (%rsp) ## 16-byte Spill ; X64_AVX-NEXT: callq _f ; X64_AVX-NEXT: vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload ; X64_AVX-NEXT: addq $24, %rsp @@ -257,12 +257,12 @@ ; ; X32_AVX-LABEL: test5: ; X32_AVX: ## %bb.0: ## %entry -; X32_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] +; X32_AVX-NEXT: vmovups {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] ; X32_AVX-NEXT: retl ; ; X64_AVX-LABEL: test5: ; X64_AVX: ## %bb.0: ## %entry -; X64_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] +; X64_AVX-NEXT: vmovups {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] ; X64_AVX-NEXT: retq entry: %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> , i32 128) nounwind readnone diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2376,12 +2376,12 @@ ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm0 ; AVX1-NEXT: vmovdqa %xmm8, 48(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm4, 32(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm4, 16(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm4, (%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vmovups %xmm4, 32(%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vmovups %xmm4, 16(%rsi) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vmovups %xmm4, (%rsi) ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 ; AVX1-NEXT: vmovdqa %xmm4, 192(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -2270,12 +2270,12 @@ ; ; AVX-LABEL: fold_bitreverse_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] ; AVX-NEXT: retq ; ; XOP-LABEL: fold_bitreverse_v16i8: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; XOP-NEXT: vmovups {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] ; XOP-NEXT: retq %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> ) ret <16 x i8> %b @@ -2290,12 +2290,12 @@ ; ; AVX-LABEL: fold_bitreverse_v16i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] ; AVX-NEXT: retq ; ; XOP-LABEL: fold_bitreverse_v16i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; XOP-NEXT: vmovups {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] ; XOP-NEXT: retq %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> ) ret <16 x i16> %b @@ -2312,25 +2312,25 @@ ; ; AVX1-LABEL: fold_bitreverse_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = 
[0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; AVX1-NEXT: vmovups {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; AVX1-NEXT: vmovups {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; AVX1-NEXT: retq ; ; AVX2-LABEL: fold_bitreverse_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; AVX2-NEXT: vmovups {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; AVX2-NEXT: vmovups {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; AVX2-NEXT: retq ; ; AVX512-LABEL: fold_bitreverse_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; AVX512-NEXT: vmovups {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; AVX512-NEXT: retq ; ; XOP-LABEL: fold_bitreverse_v16i32: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] -; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; XOP-NEXT: vmovups {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; XOP-NEXT: vmovups {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; XOP-NEXT: retq %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> ) ret <16 x i32> %b diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll @@ -101,8 +101,8 @@ define <4 x float> @constrained_vector_fma_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fma_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1] -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0] ; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem ; CHECK-NEXT: retq entry: @@ -118,8 +118,8 @@ define <8 x float> @constrained_vector_fma_v8f32() #0 { ; CHECK-LABEL: constrained_vector_fma_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1,7.5E+0,6.5E+0,5.5E+0,4.5E+0] -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0,1.15E+1,1.05E+1,9.5E+0,8.5E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1,7.5E+0,6.5E+0,5.5E+0,4.5E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0,1.15E+1,1.05E+1,9.5E+0,8.5E+0] ; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem ; 
CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -201,7 +201,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod @@ -251,15 +251,15 @@ ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq fmodf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq fmodf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq fmodf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -309,7 +309,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod @@ -371,17 +371,17 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod @@ -991,7 +991,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow @@ -1041,15 +1041,15 @@ ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq powf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq powf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq powf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -1099,7 +1099,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow @@ -1161,17 +1161,17 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow @@ -1248,7 +1248,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 @@ -1298,15 +1298,15 @@ ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powisf2 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powisf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powisf2 -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -1356,7 +1356,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 @@ -1418,17 
+1418,17 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2 @@ -1498,7 +1498,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 32 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -1542,13 +1542,13 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq sinf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq sinf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq sinf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -1593,7 +1593,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 64 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -1647,15 +1647,15 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -1723,7 +1723,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 32 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq cos -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} 
xmm0 = mem[0],zero
 ; AVX-NEXT: callq cos
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -1767,13 +1767,13 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 48
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq cosf
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq cosf
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq cosf
-; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1818,7 +1818,7 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 64
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq cos
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq cos
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -1872,15 +1872,15 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 48
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq cos
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq cos
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq cos
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq cos
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -1948,7 +1948,7 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 32
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq exp
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq exp
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -1992,13 +1992,13 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 48
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq expf
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq expf
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq expf
-; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -2043,7 +2043,7 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 64
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq exp
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 
16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2097,15 +2097,15 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2173,7 +2173,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 32 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2217,13 +2217,13 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq exp2f -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq exp2f -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq exp2f -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -2268,7 +2268,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 64 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2322,15 +2322,15 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2398,7 +2398,7 @@ ; AVX-NEXT: 
.cfi_def_cfa_offset 32 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2442,13 +2442,13 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq logf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq logf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq logf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -2493,7 +2493,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 64 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2547,15 +2547,15 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2623,7 +2623,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 32 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log10 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log10 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2667,13 +2667,13 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq log10f -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq log10f -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq log10f -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 
16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -2718,7 +2718,7 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 64
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log10
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log10
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -2772,15 +2772,15 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 48
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log10
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log10
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log10
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log10
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -2848,7 +2848,7 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 32
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log2
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log2
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -2892,13 +2892,13 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 48
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq log2f
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq log2f
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT: callq log2f
-; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -2943,7 +2943,7 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 64
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log2
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log2
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
@@ -2997,15 +2997,15 @@
 ; AVX-NEXT: .cfi_def_cfa_offset 48
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log2
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log2
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: callq log2
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log2 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -3419,7 +3419,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax @@ -3468,15 +3468,15 @@ ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq fmaxf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq fmaxf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq fmaxf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -3525,7 +3525,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax @@ -3586,17 +3586,17 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax @@ -3670,7 +3670,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin @@ -3719,15 +3719,15 @@ ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq fminf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; 
AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq fminf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: callq fminf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -3776,7 +3776,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin @@ -3837,17 +3837,17 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin @@ -4093,7 +4093,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_fptosi_v4i64_v4f32: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] +; AVX512DQ-NEXT: vmovups {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] ; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -4237,7 +4237,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1] +; AVX512DQ-NEXT: vmovups {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1] ; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -4335,7 +4335,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_fptosi_v4i64_v4f64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] +; AVX512DQ-NEXT: vmovups {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] ; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -4391,7 +4391,7 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0] ; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$zmm0 ; AVX512-NEXT: vzeroupper @@ -4460,11 +4460,11 @@ ; ; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] +; AVX1-NEXT: vmovups {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vmovups {{.*#+}} xmm1 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] ; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vsubps %xmm0, %xmm1, %xmm0 @@ -4474,7 +4474,7 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v4i32_v4f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] +; AVX512-NEXT: vmovups {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] ; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper @@ -4884,7 +4884,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] +; AVX512DQ-NEXT: vmovups {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] ; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -4940,7 +4940,7 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,0.0E+0,0.0E+0] +; AVX512-NEXT: vmovups {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,0.0E+0,0.0E+0] ; AVX512-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper @@ -5016,7 +5016,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2] ; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vmovups {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3 ; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 @@ -5028,7 +5028,7 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v4i32_v4f64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] +; AVX512-NEXT: vmovups {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] ; AVX512-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper @@ -5169,7 +5169,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1] +; AVX512DQ-NEXT: vmovups {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1] ; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5439,7 +5439,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: 
vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] +; AVX512DQ-NEXT: vmovups {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] ; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -5938,7 +5938,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 32 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq round -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq round ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -5981,13 +5981,13 @@ ; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq roundf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq roundf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: callq roundf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovups (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] @@ -6032,7 +6032,7 @@ ; AVX-NEXT: .cfi_def_cfa_offset 64 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq round -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq round ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll --- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll +++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll @@ -69,14 +69,14 @@ ; X32-AVX-NEXT: movl 40(%ebp), %ecx ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovaps %ymm1, (%esp) -; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X32-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X32-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X32-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X32-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X32-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X32-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X32-AVX-NEXT: vmovups %ymm1, (%esp) +; X32-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; X32-AVX-NEXT: leal (%ecx,%ecx), %eax ; X32-AVX-NEXT: andl $31, %eax ; X32-AVX-NEXT: movl 128(%esp,%eax,4), %eax @@ -98,9 +98,9 @@ ; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,3,3,3] ; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovaps %ymm1, (%rsp) +; X64-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; 
X64-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vmovups %ymm1, (%rsp) ; X64-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $15, %edi ; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -22,7 +22,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vmovups {{.*#+}} ymm3 = [63,63,63,63] ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -125,7 +125,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm3 = [63,63,63,63] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -163,7 +163,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31] +; AVX1-NEXT: vmovups {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31] ; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -282,7 +282,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -320,7 +320,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vmovups {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vpsllw $12, %xmm5, %xmm6 @@ -485,7 +485,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -541,7 +541,7 @@ ; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm7 -; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vmovups {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm8 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3 ; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3 @@ -776,7 +776,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm8 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vandnps %ymm8, %ymm2, %ymm6 ; XOPAVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 @@ -1539,7 +1539,7 @@ ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vandps %ymm4, %ymm2, %ymm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -22,7 +22,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vmovups {{.*#+}} ymm3 = [63,63,63,63] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -126,7 +126,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm3 = [63,63,63,63] ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -164,7 +164,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31] +; AVX1-NEXT: vmovups {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31] ; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -284,7 +284,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v8i32: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -322,7 +322,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vmovups {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vpsllw $12, %xmm5, %xmm6 @@ -488,7 +488,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -541,7 +541,7 @@ ; AVX1-NEXT: vpsrlw $4, %xmm4, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm6 -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vmovups {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm7 ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 ; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3 @@ -764,7 +764,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -1483,7 +1483,7 @@ ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vmovups {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX1-NEXT: vandnps %ymm4, %ymm2, %ymm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -185,24 +185,24 @@ ; CHECK-NEXT: vmovdqa %xmm6, 176(%eax) ; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) ; CHECK-NEXT: vmovdqa %xmm4, 144(%eax) -; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 128(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 112(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 96(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 80(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 64(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 48(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 32(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 16(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, (%eax) +; CHECK-NEXT: vmovups (%esp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 128(%eax) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 112(%eax) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 96(%eax) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 80(%eax) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 64(%eax) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 48(%eax) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 32(%eax) +; CHECK-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, 16(%eax) +; CHECK-NEXT: vmovups 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovups %xmm0, (%eax) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -578,7 +578,7 @@ ; ALL-LABEL: store_cvt_4f32_to_8i16_undef: ; ALL: # %bb.0: ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rdi) +; ALL-NEXT: vmovups %xmm0, (%rdi) ; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -591,7 +591,7 @@ ; ALL-LABEL: store_cvt_4f32_to_8i16_zero: ; ALL: # %bb.0: ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rdi) +; ALL-NEXT: vmovups %xmm0, (%rdi) ; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -655,14 +655,14 @@ ; ALL-LABEL: cvt_2f64_to_2i16: ; ALL: # %bb.0: ; ALL-NEXT: subq $40, %rsp -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 +; ALL-NEXT: vmovups (%rsp), %xmm0 ; ALL-NEXT: addq $40, %rsp ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> @@ -676,7 +676,7 @@ ; ALL-NEXT: subq $88, %rsp ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -693,7 +693,7 @@ ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 +; ALL-NEXT: vmovups (%rsp), %xmm0 ; ALL-NEXT: addq $88, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -707,7 +707,7 @@ ; ALL-NEXT: subq $88, %rsp ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -724,7 +724,7 @@ ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 +; ALL-NEXT: vmovups (%rsp), %xmm0 ; ALL-NEXT: addq $88, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -739,7 +739,7 @@ ; ALL-NEXT: subq $88, %rsp ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -793,7 +793,7 @@ ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r14d ; AVX1-NEXT: orl %ebx, %r14d @@ -818,7 +818,7 @@ ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %eax ; AVX1-NEXT: orl %ebx, %eax @@ -860,7 +860,7 @@ ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %r14d ; AVX2-NEXT: orl %ebx, %r14d @@ -885,7 +885,7 @@ ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %eax ; AVX2-NEXT: orl %ebx, %eax @@ -926,7 +926,7 @@ ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %r14d ; AVX512-NEXT: orl %ebx, %r14d @@ -954,7 +954,7 @@ ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %eax ; AVX512-NEXT: orl %ebx, %eax @@ -1003,7 +1003,7 @@ ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movl %eax, %ebp -; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; ALL-NEXT: vmovups (%rsp), %xmm0 # 16-byte Reload ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, (%rbx) ; ALL-NEXT: movw %bp, 2(%rbx) @@ -1043,7 +1043,7 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movw %ax, 4(%rbx) ; AVX1-NEXT: movw %bp, (%rbx) @@ -1081,7 +1081,7 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movw %ax, 4(%rbx) ; AVX2-NEXT: movw %bp, (%rbx) @@ -1119,7 +1119,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movw %ax, 4(%rbx) ; AVX512-NEXT: movw %bp, (%rbx) @@ -1145,7 +1145,7 @@ ; ALL-NEXT: movq %rdi, %rbx ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -1162,8 +1162,8 @@ ; ALL-NEXT: # xmm0 = mem[1,0] ; 
ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rbx) +; ALL-NEXT: vmovups (%rsp), %xmm0 +; ALL-NEXT: vmovups %xmm0, (%rbx) ; ALL-NEXT: addq $80, %rsp ; ALL-NEXT: popq %rbx ; ALL-NEXT: retq @@ -1182,7 +1182,7 @@ ; ALL-NEXT: movq %rdi, %rbx ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) @@ -1200,7 +1200,7 @@ ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovaps %xmm0, (%rbx) +; ALL-NEXT: vmovups %xmm0, (%rbx) ; ALL-NEXT: addq $80, %rsp ; ALL-NEXT: popq %rbx ; ALL-NEXT: retq @@ -1251,7 +1251,7 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r14d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1259,7 +1259,7 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movw %ax, 12(%rbx) ; AVX1-NEXT: movw %r15w, 8(%rbx) @@ -1319,7 +1319,7 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r14d ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1327,7 +1327,7 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movw %ax, 12(%rbx) ; AVX2-NEXT: movw %r15w, 8(%rbx) @@ -1389,7 +1389,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r14d ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1397,7 +1397,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movw %ax, 12(%rbx) ; AVX512-NEXT: movw %r15w, 8(%rbx) diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1671,12 +1671,12 @@ ; ; NOBW-LABEL: foldv2i64: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; 
AVX512VLBWDQ-LABEL: foldv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: @@ -1695,12 +1695,12 @@ ; ; NOBW-LABEL: foldv2i64u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: @@ -1719,12 +1719,12 @@ ; ; NOBW-LABEL: foldv4i32: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [23,0,32,24] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: @@ -1743,12 +1743,12 @@ ; ; NOBW-LABEL: foldv4i32u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [23,0,32,24] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: @@ -1767,12 +1767,12 @@ ; ; NOBW-LABEL: foldv8i16: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv8i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: @@ -1791,12 +1791,12 @@ ; ; NOBW-LABEL: foldv8i16u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv8i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: @@ -1815,12 +1815,12 @@ ; ; NOBW-LABEL: foldv16i8: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv16i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: @@ -1839,12 +1839,12 @@ ; ; NOBW-LABEL: foldv16i8u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv16i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll --- 
a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -1102,12 +1102,12 @@ define <4 x i64> @foldv4i64() nounwind { ; X64-LABEL: foldv4i64: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [55,0,64,56] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> , i1 0) ret <4 x i64> %out @@ -1116,12 +1116,12 @@ define <4 x i64> @foldv4i64u() nounwind { ; X64-LABEL: foldv4i64u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [55,0,64,56] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> , i1 -1) ret <4 x i64> %out @@ -1130,12 +1130,12 @@ define <8 x i32> @foldv8i32() nounwind { ; X64-LABEL: foldv8i32: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X32-AVX-NEXT: retl %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out @@ -1144,12 +1144,12 @@ define <8 x i32> @foldv8i32u() nounwind { ; X64-LABEL: foldv8i32u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X32-AVX-NEXT: retl %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out @@ -1158,12 +1158,12 @@ define <16 x i16> @foldv16i16() nounwind { ; X64-LABEL: foldv16i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X32-AVX-NEXT: retl %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> , i1 0) ret <16 x i16> %out @@ -1172,12 +1172,12 @@ define <16 x i16> @foldv16i16u() nounwind { ; X64-LABEL: foldv16i16u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X32-AVX-NEXT: retl %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> , i1 -1) ret <16 x i16> %out @@ -1186,12 +1186,12 @@ define <32 x i8> @foldv32i8() nounwind { ; X64-LABEL: foldv32i8: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = 
[8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 0) ret <32 x i8> %out @@ -1200,12 +1200,12 @@ define <32 x i8> @foldv32i8u() nounwind { ; X64-LABEL: foldv32i8u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X64-NEXT: vmovups {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 -1) ret <32 x i8> %out diff --git a/llvm/test/CodeGen/X86/vector-partial-undef.ll b/llvm/test/CodeGen/X86/vector-partial-undef.ll --- a/llvm/test/CodeGen/X86/vector-partial-undef.ll +++ b/llvm/test/CodeGen/X86/vector-partial-undef.ll @@ -154,7 +154,7 @@ ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [6,1,5,4,3,2,0,7] +; AVX-NEXT: vmovups {{.*#+}} ymm1 = [6,1,5,4,3,2,0,7] ; AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %extend = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -593,17 +593,17 @@ ; ; AVX-LABEL: foldv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,64] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [1,64] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [1,64] ; BITALG-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> ) ret <2 x i64> %out @@ -617,17 +617,17 @@ ; ; AVX-LABEL: foldv4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,32,0,8] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [1,32,0,8] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [1,32,0,8] ; BITALG-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> ) ret <4 x i32> %out @@ -641,17 +641,17 @@ ; ; AVX-LABEL: foldv8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} 
xmm0 = [1,16,0,8,0,3,2,3] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; BITALG-NEXT: retq %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> ) ret <8 x i16> %out @@ -665,17 +665,17 @@ ; ; AVX-LABEL: foldv16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; BITALG-NEXT: retq %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> ) ret <16 x i8> %out diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -308,7 +308,7 @@ define <4 x i64> @foldv4i64() nounwind { ; ALL-LABEL: foldv4i64: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [1,64,0,8] ; ALL-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> ) ret <4 x i64> %out @@ -317,7 +317,7 @@ define <8 x i32> @foldv8i32() nounwind { ; ALL-LABEL: foldv8i32: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] ; ALL-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> ) ret <8 x i32> %out @@ -326,7 +326,7 @@ define <16 x i16> @foldv16i16() nounwind { ; ALL-LABEL: foldv16i16: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] ; ALL-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> ) ret <16 x i16> %out @@ -335,7 +335,7 @@ define <32 x i8> @foldv32i8() nounwind { ; ALL-LABEL: foldv32i8: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] ; ALL-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> ) ret <32 x i8> %out diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -536,7 +536,7 @@ ; ; AVX1-LABEL: trunc_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -636,7 +636,7 @@ ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -521,7 +521,7 @@ ; ; AVX1-LABEL: trunc_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -619,7 +619,7 @@ ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -548,7 +548,7 @@ ; ; AVX1-LABEL: trunc_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -648,7 +648,7 @@ ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1666,8 +1666,8 @@ ; AVX-LABEL: constant_gets_selected: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rdi) -; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovups %xmm0, (%rsi) ; AVX-NEXT: retq entry: %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2462,7 +2462,7 @@ ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-NEXT: vmovups %xmm1, (%rsi) ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32: @@ -2471,7 +2471,7 @@ ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi) +; AVX2OR512VL-NEXT: vmovups %xmm1, (%rsi) ; AVX2OR512VL-NEXT: retq %1 = load <2 x float>, <2 x float>* %p0 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> @@ -2574,8 +2574,8 @@ ; ; AVX512VL-LABEL: shuffle_mem_v4f32_0624: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm2 -; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,2,4] +; AVX512VL-NEXT: vmovups (%rdi), %xmm2 +; AVX512VL-NEXT: vmovups {{.*#+}} xmm1 = [0,6,2,4] ; 
AVX512VL-NEXT: vpermi2ps %xmm0, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 ; AVX512VL-NEXT: retq @@ -2600,7 +2600,7 @@ ; ; AVX512VL-LABEL: shuffle_mem_v4f32_4760: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,3,2,4] +; AVX512VL-NEXT: vmovups {{.*#+}} xmm1 = [0,3,2,4] ; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0 ; AVX512VL-NEXT: retq %1 = load <4 x float>, <4 x float>* %a1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1420,7 +1420,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1455,7 +1455,7 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1490,7 +1490,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1526,7 +1526,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2082,7 +2082,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = 
[255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -2118,7 +2118,7 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -36,7 +36,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_00000010: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -48,7 +48,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00000010: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -71,7 +71,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_00000200: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -83,7 +83,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00000200: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -106,7 +106,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_00003000: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -118,7 +118,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00003000: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -136,7 +136,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00040000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, 
<8 x float> %b, <8 x i32> @@ -153,7 +153,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00500000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -170,7 +170,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -187,7 +187,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_70000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} xmm1 = [7,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -213,7 +213,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00112233: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -236,7 +236,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_00001111: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -248,7 +248,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00001111: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -296,7 +296,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [0,8,0,8,4,12,4,12] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -358,7 +358,7 @@ ; ; AVX512VL-LABEL: shuffle_v8f32_08192a3b: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] +; AVX512VL-NEXT: vmovups {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -377,7 +377,7 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_08991abb: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] @@ -386,16 +386,16 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_08991abb: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_08991abb: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] +; AVX512VL-NEXT: vmovups {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -414,14 +414,14 @@ ; ; AVX2-LABEL: shuffle_v8f32_091b2d3f: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> +; AVX2-NEXT: vmovups {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_091b2d3f: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] +; AVX512VL-NEXT: vmovups {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -445,14 +445,14 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_09ab1def: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_09ab1def: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] +; AVX512VL-NEXT: vmovups {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -771,7 +771,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] @@ -781,7 +781,7 @@ ; ; AVX512VL-LABEL: shuffle_v8f32_c348cda0: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8] +; AVX512VL-NEXT: vmovups {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -811,16 +811,16 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_f511235a: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_f511235a: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10] +; AVX512VL-NEXT: vmovups {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10] ; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -914,7 +914,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8f32_76543210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -926,7 +926,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210: ; AVX512VL-FAST: # %bb.0: -; 
AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -948,7 +948,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -970,7 +970,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -992,7 +992,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1014,7 +1014,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -1044,7 +1044,7 @@ ; ; AVX512VL-FAST-LABEL: PR21138: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> @@ -1066,7 +1066,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [3,2,1,0,15,14,13,12] ; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -1089,7 +1089,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [3,2,1,0,11,10,9,8] ; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-FAST-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq @@ -1143,7 +1143,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_084c195d: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13] ; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1179,14 +1179,14 @@ ; ; AVX2-LABEL: shuffle_v8f32_089abcde: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_089abcde: ; 
AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] +; AVX512VL-NEXT: vmovups {{.*#+}} ymm2 = [8,0,1,2,3,4,5,6] ; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -1319,7 +1319,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_mem_v8f32_8BA0CFE4: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,2,8,4,7,6,12] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,3,2,8,4,7,6,12] ; AVX512VL-FAST-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %1 = load <8 x float>, <8 x float>* %a1 @@ -1358,7 +1358,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_00000010: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -1370,7 +1370,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00000010: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1393,7 +1393,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_00000200: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -1405,7 +1405,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00000200: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1428,7 +1428,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_00003000: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -1440,7 +1440,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00003000: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1458,7 +1458,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00040000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1475,7 +1475,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00500000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1492,7 +1492,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq 
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1509,7 +1509,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_70000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} xmm1 = [7,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1546,7 +1546,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_00112233: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -1558,7 +1558,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00112233: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1581,7 +1581,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_00001111: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -1593,7 +1593,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00001111: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1737,9 +1737,9 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_08991abb: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq @@ -1801,7 +1801,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_09ab1def: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq @@ -1951,7 +1951,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00015444: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1966,7 +1966,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00204644: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1981,7 +1981,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_03004474: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: 
retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1996,7 +1996,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_10004444: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2011,7 +2011,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_22006446: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2026,7 +2026,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_33307474: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2041,7 +2041,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_32104567: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2056,7 +2056,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00236744: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2071,7 +2071,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00226644: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2086,7 +2086,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_10324567: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2101,7 +2101,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_11334567: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2116,7 +2116,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_01235467: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2131,7 +2131,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_01235466: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq 
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2146,7 +2146,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2161,7 +2161,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2176,7 +2176,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2191,7 +2191,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2206,7 +2206,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2221,7 +2221,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2OR512VL-NEXT: vmovups {{.*#+}} ymm1 = ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2248,7 +2248,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] @@ -2351,7 +2351,7 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_76543210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; @@ -2363,7 +2363,7 @@ ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] ; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2517,7 +2517,7 @@ ; ; AVX2-LABEL: shuffle_v8i32_089abcde: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: retq @@ -2693,7 +2693,7 @@ ; ; AVX2-LABEL: shuffle_v8i32_0dcd3f14: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps 
{{.*#+}} ymm2 = <0,u,u,u,3,u,1,4> +; AVX2-NEXT: vmovups {{.*#+}} ymm2 = <0,u,u,u,3,u,1,4> ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] @@ -3019,7 +3019,7 @@ ; ; AVX2-LABEL: shuffle_v8i32_12345670: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0] +; AVX2-NEXT: vmovups {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3153,7 +3153,7 @@ ; AVX512VL-FAST: # %bb.0: # %entry ; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3] +; AVX512VL-FAST-NEXT: vmovups {{.*#+}} xmm1 = [1,4,3,3] ; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 ; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -223,7 +223,7 @@ define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> +; ALL-NEXT: vmovups {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> @@ -233,7 +233,7 @@ define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a) { ; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> +; ALL-NEXT: vmovups {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> @@ -253,7 +253,7 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] +; ALL-NEXT: vmovups {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] ; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> @@ -284,7 +284,7 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float>* %b) { ; ALL-LABEL: shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] +; ALL-NEXT: vmovups {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24] ; ALL-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 ; ALL-NEXT: retq %c = load <16 x float>, <16 x float>* %b @@ -354,7 +354,7 @@ define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) { ; ALL-LABEL: test_v16i32_0_4_8_12: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,4,8,12] +; ALL-NEXT: vmovups {{.*#+}} xmm1 = [0,4,8,12] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll 
b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -205,7 +205,7 @@ ; ; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; SKX: ## %bb.0: -; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0] +; SKX-NEXT: vmovups {{.*#+}} xmm1 = [65535,0,0,0] ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -121,7 +121,7 @@ ; ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0] +; AVX512DQ-NEXT: vmovups {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -46,13 +46,13 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00000010: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00000010: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -62,13 +62,13 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00000200: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00000200: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -78,13 +78,13 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00003000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00003000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -94,13 +94,13 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00040000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = 
[0,0,0,4,0,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00040000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -110,13 +110,13 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00500000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00500000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -126,13 +126,13 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_06000000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_06000000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -142,7 +142,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vmovups {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -161,13 +161,13 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00112233: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00112233: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -177,13 +177,13 @@ define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00001111: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00001111: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} 
zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -507,13 +507,13 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00015444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00015444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -523,13 +523,13 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00204644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00204644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -539,13 +539,13 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_03004474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_03004474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -555,13 +555,13 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_10004444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_10004444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -571,13 +571,13 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_22006446: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_22006446: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} 
zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -587,13 +587,13 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_33307474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_33307474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -603,13 +603,13 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_32104567: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_32104567: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -619,13 +619,13 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00236744: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00236744: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -635,13 +635,13 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00226644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00226644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -687,13 +687,13 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_002u6u44: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_002u6u44: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> +; AVX512F-32-NEXT: vmovups {{.*#+}} 
zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -703,13 +703,13 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -864,13 +864,13 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00000010: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00000010: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -880,13 +880,13 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00000200: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00000200: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -896,13 +896,13 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00003000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00003000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -912,13 +912,13 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00040000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00040000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] ; 
AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -928,13 +928,13 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00500000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00500000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -944,13 +944,13 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_06000000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_06000000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -960,7 +960,7 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vmovups {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -997,13 +997,13 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00112233: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00112233: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1013,13 +1013,13 @@ define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00001111: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00001111: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1345,13 +1345,13 @@ define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00015444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = 
[0,0,0,1,5,4,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00015444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1361,13 +1361,13 @@ define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00204644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00204644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1377,13 +1377,13 @@ define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_03004474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_03004474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1393,13 +1393,13 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_10004444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_10004444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1409,13 +1409,13 @@ define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_22006446: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_22006446: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1425,13 +1425,13 @@ define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_33307474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; 
AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_33307474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1441,13 +1441,13 @@ define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_32104567: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_32104567: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1457,13 +1457,13 @@ define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00236744: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00236744: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1473,13 +1473,13 @@ define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00226644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00226644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1489,13 +1489,13 @@ define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_10324567: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_10324567: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1505,13 +1505,13 @@ define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_11334567: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_11334567: 
; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1521,13 +1521,13 @@ define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_01235467: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_01235467: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1537,13 +1537,13 @@ define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_01235466: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_01235466: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0] +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1553,13 +1553,13 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_002u6u44: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_002u6u44: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1569,13 +1569,13 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00uu66uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00uu66uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1585,13 +1585,13 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_103245uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u> +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_103245uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 
= <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u> +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1601,13 +1601,13 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_1133uu67: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7> +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_1133uu67: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0> +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1617,13 +1617,13 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_0uu354uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u> +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u> ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_0uu354uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u> +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u> ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1633,13 +1633,13 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_uuu3uu66: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovups {{.*#+}} zmm1 = ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_uuu3uu66: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = +; AVX512F-32-NEXT: vmovups {{.*#+}} zmm1 = ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -36,7 +36,7 @@ ; KNL-LABEL: expand1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovaps {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3] +; KNL-NEXT: vmovups {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3] ; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -105,7 +105,7 @@ ; SKX: # %bb.0: ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [8,0,10,0,12,0,14,0] +; SKX-NEXT: vmovups {{.*#+}} ymm2 = [8,0,10,0,12,0,14,0] ; SKX-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 ; SKX-NEXT: ret{{[l|q]}} ; @@ -236,7 +236,7 @@ ; CHECK-LABEL: expand12: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16] ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 @@ -269,7 +269,7 @@ ; 
KNL-LABEL: expand14: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23] +; KNL-NEXT: vmovups {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23] ; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -284,7 +284,7 @@ ; SKX-LABEL: expand15: ; SKX: # %bb.0: ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; SKX-NEXT: vmovaps {{.*#+}} ymm1 = +; SKX-NEXT: vmovups {{.*#+}} ymm1 = ; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; SKX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; SKX-NEXT: ret{{[l|q]}} @@ -469,8 +469,8 @@ define <8 x float> @test_masked_permps_v8f32(<8 x float>* %vp, <8 x float> %vec2) { ; SKX64-LABEL: test_masked_permps_v8f32: ; SKX64: # %bb.0: -; SKX64-NEXT: vmovaps (%rdi), %ymm2 -; SKX64-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15] +; SKX64-NEXT: vmovups (%rdi), %ymm2 +; SKX64-NEXT: vmovups {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15] ; SKX64-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1 ; SKX64-NEXT: vmovaps %ymm1, %ymm0 ; SKX64-NEXT: retq @@ -478,8 +478,8 @@ ; KNL64-LABEL: test_masked_permps_v8f32: ; KNL64: # %bb.0: ; KNL64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL64-NEXT: vmovaps (%rdi), %ymm1 -; KNL64-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23] +; KNL64-NEXT: vmovups (%rdi), %ymm1 +; KNL64-NEXT: vmovups {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23] ; KNL64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; KNL64-NEXT: vmovaps %ymm1, %ymm0 ; KNL64-NEXT: retq @@ -487,8 +487,8 @@ ; SKX32-LABEL: test_masked_permps_v8f32: ; SKX32: # %bb.0: ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX32-NEXT: vmovaps (%eax), %ymm2 -; SKX32-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15] +; SKX32-NEXT: vmovups (%eax), %ymm2 +; SKX32-NEXT: vmovups {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15] ; SKX32-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1 ; SKX32-NEXT: vmovaps %ymm1, %ymm0 ; SKX32-NEXT: retl @@ -497,8 +497,8 @@ ; KNL32: # %bb.0: ; KNL32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL32-NEXT: vmovaps (%eax), %ymm1 -; KNL32-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23] +; KNL32-NEXT: vmovups (%eax), %ymm1 +; KNL32-NEXT: vmovups {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23] ; KNL32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 ; KNL32-NEXT: vmovaps %ymm1, %ymm0 ; KNL32-NEXT: retl @@ -511,8 +511,8 @@ define <16 x float> @test_masked_permps_v16f32(<16 x float>* %vp, <16 x float> %vec2) { ; X64-LABEL: test_masked_permps_v16f32: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %zmm2 -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31] +; X64-NEXT: vmovups (%rdi), %zmm2 +; X64-NEXT: vmovups {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31] ; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 ; X64-NEXT: vmovaps %zmm1, %zmm0 ; X64-NEXT: retq @@ -520,8 +520,8 @@ ; X86-LABEL: test_masked_permps_v16f32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps (%eax), %zmm2 -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31] +; X86-NEXT: vmovups (%eax), %zmm2 +; X86-NEXT: vmovups {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31] ; X86-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 ; X86-NEXT: vmovaps %zmm1, %zmm0 ; X86-NEXT: retl @@ -617,30 +617,30 @@ define void @PR43170(<16 x float>* %a0) { ; SKX64-LABEL: PR43170: ; SKX64: # %bb.0: # %entry -; SKX64-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; SKX64-NEXT: vmovaps %zmm0, (%rdi) +; SKX64-NEXT: vmovups 
{{.*}}(%rip), %ymm0 +; SKX64-NEXT: vmovups %zmm0, (%rdi) ; SKX64-NEXT: vzeroupper ; SKX64-NEXT: retq ; ; KNL64-LABEL: PR43170: ; KNL64: # %bb.0: # %entry -; KNL64-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; KNL64-NEXT: vmovaps %zmm0, (%rdi) +; KNL64-NEXT: vmovups {{.*}}(%rip), %ymm0 +; KNL64-NEXT: vmovups %zmm0, (%rdi) ; KNL64-NEXT: retq ; ; SKX32-LABEL: PR43170: ; SKX32: # %bb.0: # %entry ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX32-NEXT: vmovaps src1, %ymm0 -; SKX32-NEXT: vmovaps %zmm0, (%eax) +; SKX32-NEXT: vmovups src1, %ymm0 +; SKX32-NEXT: vmovups %zmm0, (%eax) ; SKX32-NEXT: vzeroupper ; SKX32-NEXT: retl ; ; KNL32-LABEL: PR43170: ; KNL32: # %bb.0: # %entry ; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL32-NEXT: vmovaps src1, %ymm0 -; KNL32-NEXT: vmovaps %zmm0, (%eax) +; KNL32-NEXT: vmovups src1, %ymm0 +; KNL32-NEXT: vmovups %zmm0, (%eax) ; KNL32-NEXT: retl entry: %0 = load <8 x float>, <8 x float>* bitcast (%union1* @src1 to <8 x float>*), align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -140,7 +140,7 @@ ; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] +; AVX512-NEXT: vmovups {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] @@ -308,7 +308,7 @@ define <2 x double> @constant_fold_vpermilvar_pd() { ; CHECK-LABEL: constant_fold_vpermilvar_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [2.0E+0,1.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> , <2 x i64> ) ret <2 x double> %1 @@ -317,7 +317,7 @@ define <4 x double> @constant_fold_vpermilvar_pd_256() { ; CHECK-LABEL: constant_fold_vpermilvar_pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> , <4 x i64> ) ret <4 x double> %1 @@ -326,7 +326,7 @@ define <4 x float> @constant_fold_vpermilvar_ps() { ; CHECK-LABEL: constant_fold_vpermilvar_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> , <4 x i32> ) ret <4 x float> %1 @@ -335,7 +335,7 @@ define <8 x float> @constant_fold_vpermilvar_ps_256() { ; CHECK-LABEL: constant_fold_vpermilvar_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> , <8 x i32> ) ret <8 x float> %1 @@ -364,7 +364,7 @@ ; X86-AVX2: # %bb.0: # %entry ; X86-AVX2-NEXT: vmovups 32, %ymm0 ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; X86-AVX2-NEXT: vmovups {{.*#+}} ymm1 = 
<2,5,0,3,6,u,u,u> ; X86-AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7] ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -378,7 +378,7 @@ ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: vmovups 0, %zmm0 ; X86-AVX512-NEXT: vmovups 64, %ymm1 -; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] +; X86-AVX512-NEXT: vmovups {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] ; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 ; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 @@ -407,7 +407,7 @@ ; X64-AVX2: # %bb.0: # %entry ; X64-AVX2-NEXT: vmovups 32, %ymm0 ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; X64-AVX2-NEXT: vmovups {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; X64-AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -421,7 +421,7 @@ ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: vmovups 0, %zmm0 ; X64-AVX512-NEXT: vmovups 64, %ymm1 -; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] +; X64-AVX512-NEXT: vmovups {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23] ; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 ; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -116,7 +116,7 @@ define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { ; CHECK-LABEL: combine_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> @@ -128,7 +128,7 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) { ; CHECK-LABEL: combine_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> +; CHECK-NEXT: vmovups {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> @@ -754,7 +754,7 @@ define <8 x i32> @constant_fold_permd() { ; CHECK-LABEL: constant_fold_permd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> , <8 x i32> ) ret <8 x i32> %1 @@ -763,7 +763,7 @@ define <8 x float> @constant_fold_permps() { ; CHECK-LABEL: constant_fold_permps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> , <8 x i32> ) ret <8 x float> %1 @@ -772,7 +772,7 @@ define <32 x i8> @constant_fold_pshufb_256() { ; CHECK-LABEL: constant_fold_pshufb_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> 
; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> , <32 x i8> ) ret <32 x i8> %1 @@ -822,7 +822,7 @@ ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovups {{.*#+}} ymm2 = ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: ret{{[l|q]}} @@ -833,7 +833,7 @@ ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2> +; AVX512-NEXT: vmovups {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2> ; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -306,28 +306,28 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { ; X86-LABEL: combine_vpermt2var_16f32_identity_mask: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vmovups {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z} -; X86-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X86-NEXT: vmovups {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X86-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512F-NEXT: vmovups {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512F-NEXT: kmovw %edi, %k1 ; X64-AVX512F-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512F-NEXT: vmovups {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X64-AVX512F-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-AVX512BW-NEXT: vmovups {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-AVX512BW-NEXT: kmovd %edi, %k1 ; X64-AVX512BW-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z} -; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] +; X64-AVX512BW-NEXT: vmovups {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] ; X64-AVX512BW-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z} ; X64-AVX512BW-NEXT: retq %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m) @@ -806,13 +806,13 @@ define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) { ; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vmovups {{.*#+}} zmm1 
= [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vmovups {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1) @@ -823,13 +823,13 @@ define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) { ; X86-LABEL: combine_vpermt2var_8i64_as_vpermq: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vmovups {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_8i64_as_vpermq: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vmovups {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1) @@ -840,7 +840,7 @@ define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x1, i16 -1) @@ -851,7 +851,7 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vmovups {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -951,7 +951,7 @@ ; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8] +; CHECK-NEXT: vmovups {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8] ; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -750,7 +750,7 @@ ; ; AVX-LABEL: constant_fold_pshufb: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> +; AVX-NEXT: vmovups {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> ; AVX-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -277,20 +277,20 @@ ; X86: # %bb.0: ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: vmovaps %xmm0, (%eax) +; X86-NEXT: vmovups %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-AVX-LABEL: buildvector_v4f32_0404: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX-NEXT: retq ; ; X64-AVX2-LABEL: buildvector_v4f32_0404: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; X64-AVX2-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX2-NEXT: vmovups %xmm0, (%rdi) ; X64-AVX2-NEXT: retq %v0 = insertelement <4 x float> undef, float %a, i32 0 %v1 = insertelement <4 x float> %v0, float %b, i32 1 @@ -306,13 +306,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm1[0],xmm0[3],zero,xmm0[2] -; X86-NEXT: vmovaps %xmm0, (%eax) +; X86-NEXT: vmovups %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: buildvector_v4f32_07z6: ; X64: # %bb.0: ; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[3],zero,xmm1[2] -; X64-NEXT: vmovaps %xmm0, (%rdi) +; X64-NEXT: vmovups %xmm0, (%rdi) ; X64-NEXT: retq %b2 = extractelement <4 x float> %b, i32 2 %b3 = extractelement <4 x float> %b, i32 3 @@ -327,7 +327,7 @@ define <2 x double> @constant_fold_vpermil2pd() { ; CHECK-LABEL: constant_fold_vpermil2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> , <2 x double> , <2 x i64> , i8 2) ret <2 x double> %1 @@ -336,7 +336,7 @@ define <4 x double> @constant_fold_vpermil2pd_256() { ; CHECK-LABEL: constant_fold_vpermil2pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> , <4 x double> , <4 x i64> , i8 2) ret <4 x double> %1 @@ -345,7 +345,7 @@ define <4 x float> @constant_fold_vpermil2ps() { ; CHECK-LABEL: constant_fold_vpermil2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> , <4 x float> , <4 x i32> , i8 2) ret <4 x float> %1 @@ -354,7 +354,7 @@ define <8 x float> @constant_fold_vpermil2ps_256() { ; CHECK-LABEL: constant_fold_vpermil2ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] +; CHECK-NEXT: vmovups {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> , <8 x float> , <8 x i32> , i8 2) ret <8 x float> %1 @@ -363,7 +363,7 @@ define <16 x i8> @constant_fold_vpperm() { ; CHECK-LABEL: constant_fold_vpperm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vmovups {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> , <16 x i8> , <16 x i8> ) ret <16 x i8> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll 
b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1570,7 +1570,7 @@ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: vmovaps %xmm2, (%rdi) +; AVX-NEXT: vmovups %xmm2, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> @@ -2568,7 +2568,7 @@ ; AVX-LABEL: combine_scalar_load_with_blend_with_zero: ; AVX: # %bb.0: ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vmovups %xmm0, (%rsi) ; AVX-NEXT: retq %1 = load double, double* %a0, align 8 %2 = insertelement <2 x double> undef, double %1, i32 0 @@ -2772,7 +2772,7 @@ ; ; AVX-LABEL: PR30264: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = +; AVX-NEXT: vmovups {{.*#+}} xmm1 = ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x float> %x, <4 x float> , <4 x i32> @@ -3100,8 +3100,8 @@ ; ; AVX-LABEL: PR43024: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; AVX-NEXT: vmovaps %xmm0, (%rax) +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; AVX-NEXT: vmovups %xmm0, (%rax) ; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -23,7 +23,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: andl $1, %esi ; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq @@ -52,7 +52,7 @@ ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $1, %esi ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero @@ -132,7 +132,7 @@ ; AVX-NEXT: andl $3, %edi ; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] @@ -217,7 +217,7 @@ ; AVX-NEXT: andl $3, %edi ; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 @@ -371,7 +371,7 @@ ; AVX-NEXT: andl $7, %edx ; AVX-NEXT: andl $7, %ecx ; AVX-NEXT: andl $7, %r8d -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $7, %r9d ; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; AVX-NEXT: vmovd %edi, %xmm0 @@ -634,7 +634,7 @@ ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups 
%xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzbl -24(%rsp,%rdi), %eax ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: andl $15, %esi @@ -785,7 +785,7 @@ ; AVX-NEXT: andl $3, %edx ; AVX-NEXT: movl 12(%rdi), %esi ; AVX-NEXT: andl $3, %esi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 @@ -1101,7 +1101,7 @@ ; AVX-NEXT: movzbl 13(%rdi), %ebp ; AVX-NEXT: movzbl 14(%rdi), %eax ; AVX-NEXT: movzbl 15(%rdi), %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzbl -24(%rsp,%r9), %r9d ; AVX-NEXT: vmovd %r9d, %xmm0 ; AVX-NEXT: andl $15, %ebx @@ -1270,9 +1270,9 @@ ; AVX-NEXT: # kill: def $edx killed $edx def $rdx ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -1399,13 +1399,13 @@ ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzwl -40(%rsp,%rdi,2), %eax ; AVX-NEXT: andl $7, %esi ; AVX-NEXT: andl $7, %edx ; AVX-NEXT: andl $7, %ecx ; AVX-NEXT: andl $7, %r8d -; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $7, %r9d ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -17,7 +17,7 @@ ; ALL-NEXT: andl $3, %edi ; ALL-NEXT: andl $3, %ecx ; ALL-NEXT: andl $3, %edx -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovups %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero @@ -46,7 +46,7 @@ ; ALL-NEXT: subq $64, %rsp ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovups %ymm0, (%rsp) ; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -71,7 +71,7 @@ ; ALL-NEXT: andl $1, %edi ; ALL-NEXT: andl $1, %ecx ; ALL-NEXT: andl $1, %edx -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero @@ -100,7 +100,7 @@ ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovups %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -131,7 +131,7 @@ ; ALL-NEXT: subq $64, %rsp ; ALL-NEXT: andl $3, %edi ; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovups 
%ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -156,7 +156,7 @@ ; ALL-NEXT: andl $1, %esi ; ALL-NEXT: andl $1, %edx ; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -198,7 +198,7 @@ ; ALL-NEXT: andl $7, %edx ; ALL-NEXT: andl $7, %ecx ; ALL-NEXT: andl $7, %r8d -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovups %ymm0, (%rsp) ; ALL-NEXT: andl $7, %r9d ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] @@ -249,7 +249,7 @@ ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: andl $3, %ecx ; ALL-NEXT: andl $3, %r8d -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: andl $3, %r9d ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] @@ -294,7 +294,7 @@ ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1-NEXT: andl $15, %edi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: vmovups %ymm0, (%rsp) ; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: andl $15, %esi @@ -356,7 +356,7 @@ ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: andl $15, %edi -; AVX2-NEXT: vmovaps %ymm0, (%rsp) +; AVX2-NEXT: vmovups %ymm0, (%rsp) ; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: andl $15, %esi @@ -449,7 +449,7 @@ ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: andl $7, %esi @@ -505,7 +505,7 @@ ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: andl $7, %edi -; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: andl $7, %esi @@ -605,7 +605,7 @@ ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: movq 24(%rdi), %rsi ; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovups %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -646,7 +646,7 @@ ; ALL-NEXT: andl $1, %edx ; ALL-NEXT: movq 24(%rdi), %rsi ; ALL-NEXT: andl $1, %esi -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -553,7 +553,7 @@ ; ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; 
AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -665,7 +665,7 @@ ; ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -757,7 +757,7 @@ ; ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1365,7 +1365,7 @@ ; ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1477,7 +1477,7 @@ ; ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -1569,7 +1569,7 @@ ; ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1803,7 +1803,7 @@ ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -2375,7 +2375,7 @@ ; ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2793,7 +2793,7 @@ ; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper @@ -2833,7 +2833,7 @@ ; ; AVX1-LABEL: trunc_and_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 @@ -2959,7 +2959,7 @@ ; ; AVX1-LABEL: trunc_and_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm8 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 @@ -3064,7 +3064,7 @@ ; ; AVX1-LABEL: 
trunc_and_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 @@ -3190,7 +3190,7 @@ ; ; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper @@ -3227,7 +3227,7 @@ ; ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3339,7 +3339,7 @@ ; ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3431,7 +3431,7 @@ ; ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3556,7 +3556,7 @@ ; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper @@ -3598,7 +3598,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3726,7 +3726,7 @@ ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3829,7 +3829,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3953,7 +3953,7 @@ ; ; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper @@ -3990,7 
+3990,7 @@ ; ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4102,7 +4102,7 @@ ; ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -4194,7 +4194,7 @@ ; ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4319,7 +4319,7 @@ ; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper @@ -4361,7 +4361,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4489,7 +4489,7 @@ ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -4592,7 +4592,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4716,7 +4716,7 @@ ; ; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper @@ -4753,7 +4753,7 @@ ; ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4865,7 +4865,7 @@ ; ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -4957,7 +4957,7 @@ ; ; 
AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -34,7 +34,7 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -73,7 +73,7 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -116,7 +116,7 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-NEXT: vmovups {{.*#+}} xmm2 = [1,3,5,7] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -178,7 +178,7 @@ ; ; AVX1-LABEL: trunc8i64_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -267,7 +267,7 @@ ; ; AVX1-LABEL: trunc8i64_8i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -651,7 +651,7 @@ ; ; AVX1-LABEL: trunc16i32_16i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 @@ -854,7 +854,7 @@ ; ; AVX1-LABEL: trunc16i32_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1230,7 +1230,7 @@ ; ; AVX1-LABEL: trunc32i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vmovups {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -1315,7 +1315,7 @@ ; ; AVX2-FAST-LABEL: trunc2x4i64_8i32: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vmovups {{.*#+}} ymm2 = 
[0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -1581,27 +1581,27 @@ ; ; AVX-LABEL: foldv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: @@ -1620,27 +1620,27 @@ ; ; AVX-LABEL: foldv2i64u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: @@ -1659,27 +1659,27 @@ ; ; AVX-LABEL: foldv4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: @@ -1698,27 +1698,27 @@ ; ; AVX-LABEL: foldv4i32u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = 
[8,0,32,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: @@ -1737,27 +1737,27 @@ ; ; AVX-LABEL: foldv8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv8i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv8i16: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: @@ -1776,27 +1776,27 @@ ; ; AVX-LABEL: foldv8i16u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv8i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv8i16u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: @@ -1815,27 +1815,27 @@ ; ; AVX-LABEL: foldv16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv16i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 
; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv16i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: @@ -1854,27 +1854,27 @@ ; ; AVX-LABEL: foldv16i8u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv16i8u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv16i8u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: vmovups {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -1114,22 +1114,22 @@ define <4 x i64> @foldv4i64() nounwind { ; AVX-LABEL: foldv4i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,64,0] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovups {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> , i1 0) ret <4 x i64> %out @@ -1138,22 +1138,22 @@ define <4 x i64> @foldv4i64u() nounwind { ; AVX-LABEL: foldv4i64u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,64,0] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64u: ; 
BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovups {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] +; X32-AVX-NEXT: vmovups {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> , i1 -1) ret <4 x i64> %out @@ -1162,7 +1162,7 @@ define <8 x i32> @foldv8i32() nounwind { ; ALL-LABEL: foldv8i32: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] ; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out @@ -1171,7 +1171,7 @@ define <8 x i32> @foldv8i32u() nounwind { ; ALL-LABEL: foldv8i32u: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] ; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out @@ -1180,7 +1180,7 @@ define <16 x i16> @foldv16i16() nounwind { ; ALL-LABEL: foldv16i16: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] ; ALL-NEXT: ret{{[l|q]}} %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 0) ret <16 x i16> %out @@ -1189,7 +1189,7 @@ define <16 x i16> @foldv16i16u() nounwind { ; ALL-LABEL: foldv16i16u: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] ; ALL-NEXT: ret{{[l|q]}} %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 -1) ret <16 x i16> %out @@ -1198,7 +1198,7 @@ define <32 x i8> @foldv32i8() nounwind { ; ALL-LABEL: foldv32i8: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] ; ALL-NEXT: ret{{[l|q]}} %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 0) ret <32 x i8> %out @@ -1207,7 +1207,7 @@ define <32 x i8> @foldv32i8u() nounwind { ; ALL-LABEL: foldv32i8u: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; ALL-NEXT: vmovups {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] ; ALL-NEXT: ret{{[l|q]}} %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 -1) ret <32 x i8> %out diff --git a/llvm/test/CodeGen/X86/vector-zmov.ll b/llvm/test/CodeGen/X86/vector-zmov.ll --- a/llvm/test/CodeGen/X86/vector-zmov.ll +++ b/llvm/test/CodeGen/X86/vector-zmov.ll @@ -61,7 +61,7 @@ ; ; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll --- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -14,9 +14,9 @@ ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl 456(%ebp), %esi -; X86-NEXT: vmovaps 328(%ebp), %zmm3 -; X86-NEXT: vmovaps 200(%ebp), %zmm4 -; X86-NEXT: vmovaps 72(%ebp), %zmm5 +; X86-NEXT: vmovups 328(%ebp), %zmm3 +; X86-NEXT: vmovups 
200(%ebp), %zmm4 +; X86-NEXT: vmovups 72(%ebp), %zmm5 ; X86-NEXT: vp2intersectd %zmm1, %zmm0, %k0 ; X86-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; X86-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill @@ -71,7 +71,7 @@ ; X64-NEXT: andq $-64, %rsp ; X64-NEXT: subq $64, %rsp ; X64-NEXT: movq %rdi, %r14 -; X64-NEXT: vmovaps 16(%rbp), %zmm8 +; X64-NEXT: vmovups 16(%rbp), %zmm8 ; X64-NEXT: vp2intersectd %zmm1, %zmm0, %k0 ; X64-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; X64-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -8,9 +8,9 @@ ; Each minimal select test is repeated with a more typical pattern that includes a compare to ; generate the condition value. -; TODO: If we don't have blendv, this can definitely be improved. There's also a selection of +; TODO: If we don't have blendv, this can definitely be improved. There's also a selection of ; chips where it makes sense to transform the general case blendv to 2 bit-ops. That should be -; a uarch-specfic transform. At some point (Ryzen?), the implementation should catch up to the +; a uarch-specific transform. At some point (Ryzen?), the implementation should catch up to the ; architecture, so blendv is as fast as a single bit-op. define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) { @@ -27,7 +27,7 @@ ; AVX-LABEL: sel_C1_or_C2_vec: ; AVX: # %bb.0: ; AVX-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [42,0,4294967294,4294967295] +; AVX-NEXT: vmovups {{.*#+}} xmm1 = [42,0,4294967294,4294967295] ; AVX-NEXT: vblendvps %xmm0, {{.*}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: retq %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> @@ -47,7 +47,7 @@ ; AVX-LABEL: cmp_sel_C1_or_C2_vec: ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [42,0,4294967294,4294967295] +; AVX-NEXT: vmovups {{.*#+}} xmm1 = [42,0,4294967294,4294967295] ; AVX-NEXT: vblendvps %xmm0, {{.*}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: retq %cond = icmp eq <4 x i32> %x, %y diff --git a/llvm/test/CodeGen/X86/widen_load-1.ll b/llvm/test/CodeGen/X86/widen_load-1.ll --- a/llvm/test/CodeGen/X86/widen_load-1.ll +++ b/llvm/test/CodeGen/X86/widen_load-1.ll @@ -10,7 +10,7 @@ ; SSE: callq killcommon ; AVX: vmovsd compl+128(%rip), %xmm0 -; AVX: vmovaps %xmm0, (%rsp) +; AVX: vmovups %xmm0, (%rsp) ; AVX: callq killcommon @compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1] diff --git a/llvm/test/CodeGen/X86/widen_load-3.ll b/llvm/test/CodeGen/X86/widen_load-3.ll --- a/llvm/test/CodeGen/X86/widen_load-3.ll +++ b/llvm/test/CodeGen/X86/widen_load-3.ll @@ -29,13 +29,13 @@ ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %ymm0 -; X86-AVX-NEXT: vmovaps 48(%ecx), %xmm1 +; X86-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-AVX-NEXT: vmovups 48(%ecx), %xmm1 ; X86-AVX-NEXT: vextractps $1, %xmm1, 52(%eax) ; X86-AVX-NEXT: vmovss %xmm1, 48(%eax) -; X86-AVX-NEXT: vmovaps 32(%ecx), %xmm1 -; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax) -; X86-AVX-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX-NEXT: vmovups 32(%ecx), %xmm1 +; X86-AVX-NEXT: vmovups %xmm1, 32(%eax) +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl $4 ; @@ -55,12 +55,12 @@ ; X64-AVX-LABEL: load7_aligned: ; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movq %rdi, %rax -; X64-AVX-NEXT: vmovaps (%rsi), %ymm0 +; X64-AVX-NEXT: vmovups (%rsi), %ymm0 ; X64-AVX-NEXT: movq 48(%rsi), %rcx ; X64-AVX-NEXT: movq %rcx, 48(%rdi) -; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm1 -; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi) -; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX-NEXT: vmovups 32(%rsi), %xmm1 +; X64-AVX-NEXT: vmovups %xmm1, 32(%rdi) +; X64-AVX-NEXT: vmovups %ymm0, (%rdi) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq %x1 = load <7 x i64>, <7 x i64>* %x @@ -94,8 +94,8 @@ ; X86-AVX-NEXT: movl 52(%ecx), %ecx ; X86-AVX-NEXT: movl %ecx, 52(%eax) ; X86-AVX-NEXT: movl %edx, 48(%eax) -; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax) -; X86-AVX-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX-NEXT: vmovups %xmm1, 32(%eax) +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl $4 ; @@ -119,8 +119,8 @@ ; X64-AVX-NEXT: vmovups 32(%rsi), %xmm1 ; X64-AVX-NEXT: movq 48(%rsi), %rcx ; X64-AVX-NEXT: movq %rcx, 48(%rdi) -; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi) -; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX-NEXT: vmovups %xmm1, 32(%rdi) +; X64-AVX-NEXT: vmovups %ymm0, (%rdi) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq %x1 = load <7 x i64>, <7 x i64>* %x, align 1 diff --git a/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll b/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll --- a/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll +++ b/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll @@ -85,17 +85,17 @@ ; CHECK: subq $88, %rsp ; CHECK: .seh_stackalloc 88 ; CHECK: leaq 112(%rdx), %rbp -; CHECK: vmovaps %xmm8, 32(%rsp) +; CHECK: vmovups %xmm8, 32(%rsp) ; CHECK: .seh_savexmm %xmm8, 32 -; CHECK: vmovaps %xmm7, 48(%rsp) +; CHECK: vmovups %xmm7, 48(%rsp) ; CHECK: .seh_savexmm %xmm7, 48 -; CHECK: vmovaps %xmm6, 64(%rsp) +; CHECK: vmovups %xmm6, 64(%rsp) ; CHECK: .seh_savexmm %xmm6, 64 ; CHECK: .seh_endprologue ; CHECK: movl -{{[0-9]+}}(%rbp), %ecx -; CHECK: vmovaps 64(%rsp), %xmm6 -; CHECK: vmovaps 48(%rsp), %xmm7 -; CHECK: vmovaps 32(%rsp), %xmm8 +; CHECK: vmovups 64(%rsp), %xmm6 +; CHECK: vmovups 48(%rsp), %xmm7 +; CHECK: vmovups 32(%rsp), %xmm8 ; CHECK: leaq .LBB0_1(%rip), %rax ; CHECK: addq $88, %rsp ; CHECK: popq %rbx diff --git a/llvm/test/CodeGen/X86/win_cst_pool.ll b/llvm/test/CodeGen/X86/win_cst_pool.ll --- a/llvm/test/CodeGen/X86/win_cst_pool.ll +++ b/llvm/test/CodeGen/X86/win_cst_pool.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: .long 1 ; CHECK-NEXT: .long 0 ; CHECK: vec1: -; CHECK: movaps __xmm@00000000000000010000000200000003(%rip), %xmm0 +; CHECK: movups __xmm@00000000000000010000000200000003(%rip), %xmm0 ; CHECK-NEXT: ret define <8 x i16> @vec2() { @@ -58,7 +58,7 @@ ; CHECK-NEXT: .short 1 ; CHECK-NEXT: .short 0 ; CHECK: vec2: -; CHECK: movaps __xmm@00000001000200030004000500060007(%rip), %xmm0 +; CHECK: movups __xmm@00000001000200030004000500060007(%rip), %xmm0 ; CHECK-NEXT: ret @@ -74,7 +74,7 @@ ; CHECK-NEXT: .zero 4 ; CHECK-NEXT: .zero 4 ; CHECK: undef1: -; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0 +; CHECK: movups __xmm@00000000000000003f8000003f800000(%rip), %xmm0 ; CHECK-NEXT: ret } diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -271,10 +271,10 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) -; AVX1-NEXT: vmovaps 
%ymm3, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm1, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: vmovups %ymm0, 96(%rdi) +; AVX1-NEXT: vmovups %ymm3, 64(%rdi) +; AVX1-NEXT: vmovups %ymm1, 32(%rdi) +; AVX1-NEXT: vmovups %ymm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1563,14 +1563,14 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm14, %ymm3 -; AVX1-NEXT: vmovaps %ymm3, 224(%rdi) -; AVX1-NEXT: vmovaps %ymm0, 192(%rdi) -; AVX1-NEXT: vmovaps %ymm7, 160(%rdi) -; AVX1-NEXT: vmovaps %ymm6, 128(%rdi) -; AVX1-NEXT: vmovaps %ymm1, 96(%rdi) -; AVX1-NEXT: vmovaps %ymm2, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm8, (%rdi) +; AVX1-NEXT: vmovups %ymm3, 224(%rdi) +; AVX1-NEXT: vmovups %ymm0, 192(%rdi) +; AVX1-NEXT: vmovups %ymm7, 160(%rdi) +; AVX1-NEXT: vmovups %ymm6, 128(%rdi) +; AVX1-NEXT: vmovups %ymm1, 96(%rdi) +; AVX1-NEXT: vmovups %ymm2, 64(%rdi) +; AVX1-NEXT: vmovups %ymm4, 32(%rdi) +; AVX1-NEXT: vmovups %ymm8, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1676,7 +1676,7 @@ ; AVX512-LABEL: splat2_v4f64_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vmovups {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovups %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper @@ -1714,7 +1714,7 @@ ; AVX512-LABEL: splat2_v4i64_load_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vmovups {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovups %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll --- a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: broadcast128: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %1 = alloca <2 x i64>, align 16