diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -460,10 +460,13 @@
   std::pair<uint16_t, uint16_t>
   getExecutionDomain(const MachineInstr &MI) const override;
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomainImpl(const MachineInstr &MI) const;
   uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
   void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+  void setExecutionDomainImpl(MachineInstr &MI, unsigned Domain) const;
   bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7622,7 +7622,7 @@
 }

 std::pair<uint16_t, uint16_t>
-X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+X86InstrInfo::getExecutionDomainImpl(const MachineInstr &MI) const {
   uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   unsigned opcode = MI.getOpcode();
   uint16_t validDomains = 0;
@@ -7662,7 +7662,23 @@
   return std::make_pair(domain, validDomains);
 }

-void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+  std::pair<uint16_t, uint16_t> Res = getExecutionDomainImpl(MI);
+
+  if (Res.first && Subtarget.hasAVX2()) {
+    // Rotate the domain encodings 1-3 so that 3 becomes 1, making the integer
+    // domain preferred.
+    Res.first = (((Res.first - 1) + 1) % 3) + 1;
+    Res.second = ((Res.second & 0x6) << 1) | ((Res.second & 0x8) >> 2) |
+                 (Res.second & 0x1);
+  }
+
+  return Res;
+}
+
+void X86InstrInfo::setExecutionDomainImpl(MachineInstr &MI,
+                                          unsigned Domain) const {
   assert(Domain>0 && Domain<4 && "Invalid execution domain");
   uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
@@ -7712,6 +7728,17 @@
   MI.setDesc(get(table[Domain - 1]));
 }

+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+  if (Subtarget.hasAVX2()) {
+    // Rotate the domain encoding so that domain 1 becomes 3, since we prefer
+    // integer instructions.
+    Domain = ((Domain - 1) + 2) % 3 + 1;
+  }
+
+  setExecutionDomainImpl(MI, Domain);
+}
+
 /// Return the noop instruction to use for a noop.
void X86InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); diff --git a/llvm/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/llvm/test/CodeGen/X86/2012-07-15-broadcastfold.ll --- a/llvm/test/CodeGen/X86/2012-07-15-broadcastfold.ll +++ b/llvm/test/CodeGen/X86/2012-07-15-broadcastfold.ll @@ -3,9 +3,9 @@ declare x86_fastcallcc i64 @barrier() ;CHECK-LABEL: bcast_fold: -;CHECK: vmov{{[au]}}ps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]] +;CHECK: vmovdq{{[au]}} %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]] ;CHECK: barrier -;CHECK: vbroadcastss [[SPILLED]], %ymm0 +;CHECK: vpbroadcastd [[SPILLED]], %ymm0 ;CHECK: ret define <8 x float> @bcast_fold( float* %A) { BB: diff --git a/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll b/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll --- a/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -5,7 +5,7 @@ define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { ; SKX-LABEL: test_load_v4i32_noalign: ; SKX: # %bb.0: -; SKX-NEXT: vmovups (%rdi), %xmm0 +; SKX-NEXT: vmovdqu (%rdi), %xmm0 ; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 1 ret <4 x i32> %r @@ -14,7 +14,7 @@ define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { ; SKX-LABEL: test_load_v4i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps (%rdi), %xmm0 +; SKX-NEXT: vmovdqa (%rdi), %xmm0 ; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 16 ret <4 x i32> %r @@ -23,7 +23,7 @@ define <8 x i32> @test_load_v8i32_noalign(<8 x i32> * %p1) { ; SKX-LABEL: test_load_v8i32_noalign: ; SKX: # %bb.0: -; SKX-NEXT: vmovups (%rdi), %ymm0 +; SKX-NEXT: vmovdqu (%rdi), %ymm0 ; SKX-NEXT: retq %r = load <8 x i32>, <8 x i32>* %p1, align 1 ret <8 x i32> %r @@ -32,7 +32,7 @@ define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) { ; SKX-LABEL: test_load_v8i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps (%rdi), %ymm0 +; SKX-NEXT: vmovdqa (%rdi), %ymm0 ; SKX-NEXT: retq %r = load <8 x i32>, <8 x i32>* %p1, align 32 ret <8 x i32> %r @@ -41,7 +41,7 @@ define <16 x i32> @test_load_v16i32_noalign(<16 x i32> * %p1) { ; SKX-LABEL: test_load_v16i32_noalign: ; SKX: # %bb.0: -; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 ; SKX-NEXT: retq %r = load <16 x i32>, <16 x i32>* %p1, align 1 ret <16 x i32> %r @@ -50,7 +50,7 @@ define <16 x i32> @test_load_v16i32_align(<16 x i32> * %p1) { ; SKX-LABEL: test_load_v16i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 ; SKX-NEXT: retq %r = load <16 x i32>, <16 x i32>* %p1, align 32 ret <16 x i32> %r @@ -59,7 +59,7 @@ define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { ; SKX-LABEL: test_store_v4i32_noalign: ; SKX: # %bb.0: -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 1 ret void @@ -68,7 +68,7 @@ define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { ; SKX-LABEL: test_store_v4i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 16 ret void @@ -77,7 +77,7 @@ define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { ; SKX-LABEL: test_store_v8i32_noalign: ; SKX: # %bb.0: -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <8 x i32> %val, <8 x i32>* %p1, align 1 @@ -87,7 +87,7 @@ define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { ; SKX-LABEL: 
test_store_v8i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovdqa %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <8 x i32> %val, <8 x i32>* %p1, align 32 @@ -97,7 +97,7 @@ define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { ; SKX-LABEL: test_store_v16i32_noalign: ; SKX: # %bb.0: -; SKX-NEXT: vmovups %zmm0, (%rdi) +; SKX-NEXT: vmovdqu64 %zmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <16 x i32> %val, <16 x i32>* %p1, align 1 @@ -107,7 +107,7 @@ define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { ; SKX-LABEL: test_store_v16i32_align: ; SKX: # %bb.0: -; SKX-NEXT: vmovaps %zmm0, (%rdi) +; SKX-NEXT: vmovdqa64 %zmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <16 x i32> %val, <16 x i32>* %p1, align 64 diff --git a/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll b/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll --- a/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide-win64.ll @@ -9,31 +9,31 @@ ; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: subq $368, %rsp # imm = 0x170 ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rbp -; CHECK-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: andq $-128, %rsp ; CHECK-NEXT: movq 288(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm0 +; CHECK-NEXT: vmovdqa (%rax), %ymm0 ; CHECK-NEXT: movq 296(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm1 +; CHECK-NEXT: vmovdqa (%rax), %ymm1 ; CHECK-NEXT: movq 304(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm2 +; CHECK-NEXT: vmovdqa (%rax), %ymm2 ; CHECK-NEXT: movq 312(%rbp), %rax -; CHECK-NEXT: vmovaps (%rax), %ymm3 -; CHECK-NEXT: vmovaps (%rcx), %ymm4 -; CHECK-NEXT: vmovaps (%rdx), %ymm5 -; CHECK-NEXT: vmovaps (%r8), %ymm6 -; CHECK-NEXT: vmovaps (%r9), %ymm7 -; CHECK-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %ymm0, (%rsp) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; CHECK-NEXT: vmovdqa (%rax), %ymm3 +; CHECK-NEXT: vmovdqa (%rcx), %ymm4 +; CHECK-NEXT: vmovdqa (%rdx), %ymm5 +; CHECK-NEXT: vmovdqa (%r8), %ymm6 +; CHECK-NEXT: vmovdqa (%r9), %ymm7 +; CHECK-NEXT: vmovdqa %ymm7, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %ymm6, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %ymm5, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %ymm4, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; CHECK-NEXT: leaq 240(%rbp), %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -2,11 +2,11 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs 
| FileCheck %s --check-prefix=X86-NOSSE ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse -verify-machineinstrs | FileCheck %s --check-prefix=X86-SSE1 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse2 -verify-machineinstrs | FileCheck %s --check-prefix=X86-SSE2 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx -verify-machineinstrs | FileCheck %s --check-prefix=X86-AVX -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f -verify-machineinstrs | FileCheck %s --check-prefix=X86-AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx -verify-machineinstrs | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=X86-AVX,X86-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=X64-SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -verify-machineinstrs | FileCheck %s --check-prefix=X64-AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f -verify-machineinstrs | FileCheck %s --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -verify-machineinstrs | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512 ; ----- FADD ----- @@ -140,21 +140,37 @@ ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: fadd_64r: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %ebp -; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $8, %esp -; X86-AVX-NEXT: movl 8(%ebp), %eax -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) -; X86-AVX-NEXT: movl %ebp, %esp -; X86-AVX-NEXT: popl %ebp -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: fadd_64r: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-8, %esp +; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: movl 8(%ebp), %eax +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: fadd_64r: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %ebp +; X86-AVX512-NEXT: movl %esp, %ebp +; X86-AVX512-NEXT: andl $-8, %esp +; X86-AVX512-NEXT: subl $8, %esp +; X86-AVX512-NEXT: movl 8(%ebp), %eax +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) +; X86-AVX512-NEXT: movl %ebp, %esp +; X86-AVX512-NEXT: popl %ebp +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fadd_64r: ; X64-SSE: # %bb.0: @@ -304,20 +320,35 @@ ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: fadd_64g: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %ebp -; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $8, %esp -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; 
X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, glob64 -; X86-AVX-NEXT: movl %ebp, %esp -; X86-AVX-NEXT: popl %ebp -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: fadd_64g: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-8, %esp +; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, glob64 +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: fadd_64g: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %ebp +; X86-AVX512-NEXT: movl %esp, %ebp +; X86-AVX512-NEXT: andl $-8, %esp +; X86-AVX512-NEXT: subl $8, %esp +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, glob64 +; X86-AVX512-NEXT: movl %ebp, %esp +; X86-AVX512-NEXT: popl %ebp +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fadd_64g: ; X64-SSE: # %bb.0: @@ -467,20 +498,35 @@ ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: fadd_64imm: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %ebp -; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $8, %esp -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, -559038737 -; X86-AVX-NEXT: movl %ebp, %esp -; X86-AVX-NEXT: popl %ebp -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: fadd_64imm: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-8, %esp +; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, -559038737 +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: fadd_64imm: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %ebp +; X86-AVX512-NEXT: movl %esp, %ebp +; X86-AVX512-NEXT: andl $-8, %esp +; X86-AVX512-NEXT: subl $8, %esp +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, -559038737 +; X86-AVX512-NEXT: movl %ebp, %esp +; X86-AVX512-NEXT: popl %ebp +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fadd_64imm: ; X64-SSE: # %bb.0: @@ -636,20 +682,35 @@ ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: fadd_64stack: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %ebp -; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $16, %esp -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %ebp, %esp -; X86-AVX-NEXT: popl %ebp -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: fadd_64stack: +; X86-AVX1: # %bb.0: +; 
X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-8, %esp +; X86-AVX1-NEXT: subl $16, %esp +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: fadd_64stack: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %ebp +; X86-AVX512-NEXT: movl %esp, %ebp +; X86-AVX512-NEXT: andl $-8, %esp +; X86-AVX512-NEXT: subl $16, %esp +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vaddsd {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %ebp, %esp +; X86-AVX512-NEXT: popl %ebp +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fadd_64stack: ; X64-SSE: # %bb.0: @@ -744,22 +805,39 @@ ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: fadd_array: -; X86-AVX: # %bb.0: # %bb -; X86-AVX-NEXT: pushl %ebp -; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $8, %esp -; X86-AVX-NEXT: movl 20(%ebp), %eax -; X86-AVX-NEXT: movl 8(%ebp), %ecx -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%ecx,%eax,8) -; X86-AVX-NEXT: movl %ebp, %esp -; X86-AVX-NEXT: popl %ebp -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: fadd_array: +; X86-AVX1: # %bb.0: # %bb +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-8, %esp +; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: movl 20(%ebp), %eax +; X86-AVX1-NEXT: movl 8(%ebp), %ecx +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, (%ecx,%eax,8) +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: fadd_array: +; X86-AVX512: # %bb.0: # %bb +; X86-AVX512-NEXT: pushl %ebp +; X86-AVX512-NEXT: movl %esp, %ebp +; X86-AVX512-NEXT: andl $-8, %esp +; X86-AVX512-NEXT: subl $8, %esp +; X86-AVX512-NEXT: movl 20(%ebp), %eax +; X86-AVX512-NEXT: movl 8(%ebp), %ecx +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vaddsd 12(%ebp), %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, (%ecx,%eax,8) +; X86-AVX512-NEXT: movl %ebp, %esp +; X86-AVX512-NEXT: popl %ebp +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fadd_array: ; X64-SSE: # %bb.0: # %bb diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=sse | FileCheck %s --check-prefixes=X86,X86-SSE,X86-SSE1 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X86,X86-SSE,X86-SSE2 -; RUN: llc < %s 
-mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X86,X86-AVX -; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X86,X86-AVX +; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1 +; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512 ; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOSSE ; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X64,X64-SSE -; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 ; Note: This test is testing that the lowering for atomics matches what we ; currently emit for non-atomics + the atomic restriction. The presence of @@ -44,10 +44,15 @@ ; X64-SSE-NEXT: movss %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: store_float: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovss %xmm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: store_float: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovss %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: store_float: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovd %xmm0, (%rdi) +; X64-AVX512-NEXT: retq store atomic float %v, float* %fptr unordered, align 4 ret void } @@ -68,12 +73,19 @@ ; X86-SSE2-NEXT: movlps %xmm0, (%eax) ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: store_double: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: store_double: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: store_double: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) +; X86-AVX512-NEXT: retl ; ; X86-NOSSE-LABEL: store_double: ; X86-NOSSE: # %bb.0: @@ -95,10 +107,15 @@ ; X64-SSE-NEXT: movsd %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: store_double: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd %xmm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: store_double: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovsd %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: store_double: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovq %xmm0, (%rdi) +; X64-AVX512-NEXT: retq store atomic double %v, double* %fptr unordered, align 8 ret void } @@ -127,20 +144,35 @@ ; X86-SSE-NEXT: .cfi_adjust_cfa_offset -56 ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: store_fp128: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $44, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 48 -; X86-AVX-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl %eax, 
{{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl %eax, (%esp) -; X86-AVX-NEXT: calll __sync_lock_test_and_set_16 -; X86-AVX-NEXT: addl $40, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: store_fp128: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: subl $44, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 48 +; X86-AVX1-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl %eax, (%esp) +; X86-AVX1-NEXT: calll __sync_lock_test_and_set_16 +; X86-AVX1-NEXT: addl $40, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: store_fp128: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: subl $44, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 48 +; X86-AVX512-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl %eax, (%esp) +; X86-AVX512-NEXT: calll __sync_lock_test_and_set_16 +; X86-AVX512-NEXT: addl $40, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX512-NEXT: retl ; ; X86-NOSSE-LABEL: store_fp128: ; X86-NOSSE: # %bb.0: @@ -172,22 +204,34 @@ ; X64-SSE-NEXT: movaps %xmm0, (%rsp) ; X64-SSE-NEXT: movq (%rsp), %rsi ; X64-SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; X64-SSE-NEXT: callq __sync_lock_test_and_set_16 +; X64-SSE-NEXT: callq __sync_lock_test_and_set_16@PLT ; X64-SSE-NEXT: addq $24, %rsp ; X64-SSE-NEXT: .cfi_def_cfa_offset 8 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: store_fp128: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: subq $24, %rsp -; X64-AVX-NEXT: .cfi_def_cfa_offset 32 -; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) -; X64-AVX-NEXT: movq (%rsp), %rsi -; X64-AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; X64-AVX-NEXT: callq __sync_lock_test_and_set_16 -; X64-AVX-NEXT: addq $24, %rsp -; X64-AVX-NEXT: .cfi_def_cfa_offset 8 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: store_fp128: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: subq $24, %rsp +; X64-AVX1-NEXT: .cfi_def_cfa_offset 32 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsp) +; X64-AVX1-NEXT: movq (%rsp), %rsi +; X64-AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; X64-AVX1-NEXT: callq __sync_lock_test_and_set_16@PLT +; X64-AVX1-NEXT: addq $24, %rsp +; X64-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: store_fp128: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: subq $24, %rsp +; X64-AVX512-NEXT: .cfi_def_cfa_offset 32 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; X64-AVX512-NEXT: movq (%rsp), %rsi +; X64-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; X64-AVX512-NEXT: callq __sync_lock_test_and_set_16@PLT +; X64-AVX512-NEXT: addq $24, %rsp +; X64-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X64-AVX512-NEXT: retq store atomic fp128 %v, fp128* %fptr unordered, align 16 ret void } @@ -232,17 +276,29 @@ ; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: load_float: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %eax -; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vmovss %xmm0, (%esp) -; X86-AVX-NEXT: flds (%esp) -; X86-AVX-NEXT: popl %eax -; X86-AVX-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX-NEXT: 
retl +; X86-AVX1-LABEL: load_float: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %eax +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss %xmm0, (%esp) +; X86-AVX1-NEXT: flds (%esp) +; X86-AVX1-NEXT: popl %eax +; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: load_float: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %eax +; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovd %xmm0, (%esp) +; X86-AVX512-NEXT: flds (%esp) +; X86-AVX512-NEXT: popl %eax +; X86-AVX512-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX512-NEXT: retl ; ; X86-NOSSE-LABEL: load_float: ; X86-NOSSE: # %bb.0: @@ -261,10 +317,15 @@ ; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load_float: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load_float: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: load_float: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX512-NEXT: retq %v = load atomic float, float* %fptr unordered, align 4 ret float %v } @@ -297,17 +358,29 @@ ; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: load_double: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $12, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 16 -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%esp) -; X86-AVX-NEXT: fldl (%esp) -; X86-AVX-NEXT: addl $12, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: load_double: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: subl $12, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, (%esp) +; X86-AVX1-NEXT: fldl (%esp) +; X86-AVX1-NEXT: addl $12, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: load_double: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: subl $12, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, (%esp) +; X86-AVX512-NEXT: fldl (%esp) +; X86-AVX512-NEXT: addl $12, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX512-NEXT: retl ; ; X86-NOSSE-LABEL: load_double: ; X86-NOSSE: # %bb.0: @@ -330,10 +403,15 @@ ; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load_double: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load_double: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: load_double: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: retq %v = load atomic double, double* %fptr unordered, align 8 ret double %v } @@ -394,31 +472,57 @@ ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE-NEXT: retl $4 ; -; X86-AVX-LABEL: load_fp128: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: 
.cfi_def_cfa_offset 8 -; X86-AVX-NEXT: subl $56, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 64 -; X86-AVX-NEXT: .cfi_offset %esi, -8 -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl %eax, (%esp) -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: calll __sync_val_compare_and_swap_16 -; X86-AVX-NEXT: subl $4, %esp -; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX-NEXT: vmovaps %xmm0, (%esi) -; X86-AVX-NEXT: movl %esi, %eax -; X86-AVX-NEXT: addl $56, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX-NEXT: retl $4 +; X86-AVX1-LABEL: load_fp128: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: subl $56, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 64 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl %eax, (%esp) +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: calll __sync_val_compare_and_swap_16 +; X86-AVX1-NEXT: subl $4, %esp +; X86-AVX1-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%esi) +; X86-AVX1-NEXT: movl %esi, %eax +; X86-AVX1-NEXT: addl $56, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX1-NEXT: retl $4 +; +; X86-AVX512-LABEL: load_fp128: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %esi +; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX512-NEXT: subl $56, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 64 +; X86-AVX512-NEXT: .cfi_offset %esi, -8 +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovdqu %ymm0, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl %eax, (%esp) +; X86-AVX512-NEXT: vzeroupper +; X86-AVX512-NEXT: calll __sync_val_compare_and_swap_16 +; X86-AVX512-NEXT: subl $4, %esp +; X86-AVX512-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%esi) +; X86-AVX512-NEXT: movl %esi, %eax +; X86-AVX512-NEXT: addl $56, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX512-NEXT: popl %esi +; X86-AVX512-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX512-NEXT: retl $4 ; ; X86-NOSSE-LABEL: load_fp128: ; X86-NOSSE: # %bb.0: @@ -483,7 +587,7 @@ ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: xorl %ecx, %ecx ; X64-SSE-NEXT: xorl %r8d, %r8d -; X64-SSE-NEXT: callq __sync_val_compare_and_swap_16 +; X64-SSE-NEXT: callq __sync_val_compare_and_swap_16@PLT ; X64-SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; X64-SSE-NEXT: movq %rax, (%rsp) ; X64-SSE-NEXT: movaps (%rsp), %xmm0 @@ -491,21 +595,37 @@ ; X64-SSE-NEXT: .cfi_def_cfa_offset 8 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load_fp128: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: subq $24, %rsp -; X64-AVX-NEXT: .cfi_def_cfa_offset 32 -; X64-AVX-NEXT: xorl %esi, %esi -; X64-AVX-NEXT: xorl %edx, %edx -; X64-AVX-NEXT: xorl %ecx, %ecx -; X64-AVX-NEXT: xorl %r8d, %r8d -; X64-AVX-NEXT: 
callq __sync_val_compare_and_swap_16 -; X64-AVX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rax, (%rsp) -; X64-AVX-NEXT: vmovaps (%rsp), %xmm0 -; X64-AVX-NEXT: addq $24, %rsp -; X64-AVX-NEXT: .cfi_def_cfa_offset 8 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load_fp128: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: subq $24, %rsp +; X64-AVX1-NEXT: .cfi_def_cfa_offset 32 +; X64-AVX1-NEXT: xorl %esi, %esi +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: xorl %ecx, %ecx +; X64-AVX1-NEXT: xorl %r8d, %r8d +; X64-AVX1-NEXT: callq __sync_val_compare_and_swap_16@PLT +; X64-AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rax, (%rsp) +; X64-AVX1-NEXT: vmovaps (%rsp), %xmm0 +; X64-AVX1-NEXT: addq $24, %rsp +; X64-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: load_fp128: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: subq $24, %rsp +; X64-AVX512-NEXT: .cfi_def_cfa_offset 32 +; X64-AVX512-NEXT: xorl %esi, %esi +; X64-AVX512-NEXT: xorl %edx, %edx +; X64-AVX512-NEXT: xorl %ecx, %ecx +; X64-AVX512-NEXT: xorl %r8d, %r8d +; X64-AVX512-NEXT: callq __sync_val_compare_and_swap_16@PLT +; X64-AVX512-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rax, (%rsp) +; X64-AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; X64-AVX512-NEXT: addq $24, %rsp +; X64-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X64-AVX512-NEXT: retq %v = load atomic fp128, fp128* %fptr unordered, align 16 ret fp128 %v } @@ -555,13 +675,21 @@ ; X86-SSE2-NEXT: lock orl $0, (%esp) ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: store_double_seq_cst: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%eax) -; X86-AVX-NEXT: lock orl $0, (%esp) -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: store_double_seq_cst: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX1-NEXT: lock orl $0, (%esp) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: store_double_seq_cst: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) +; X86-AVX512-NEXT: lock orl $0, (%esp) +; X86-AVX512-NEXT: retl ; ; X86-NOSSE-LABEL: store_double_seq_cst: ; X86-NOSSE: # %bb.0: @@ -619,17 +747,29 @@ ; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: load_float_seq_cst: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %eax -; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vmovss %xmm0, (%esp) -; X86-AVX-NEXT: flds (%esp) -; X86-AVX-NEXT: popl %eax -; X86-AVX-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: load_float_seq_cst: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %eax +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss %xmm0, (%esp) +; X86-AVX1-NEXT: flds (%esp) +; X86-AVX1-NEXT: popl %eax +; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: load_float_seq_cst: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %eax +; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovd %xmm0, (%esp) +; 
X86-AVX512-NEXT: flds (%esp) +; X86-AVX512-NEXT: popl %eax +; X86-AVX512-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX512-NEXT: retl ; ; X86-NOSSE-LABEL: load_float_seq_cst: ; X86-NOSSE: # %bb.0: @@ -648,10 +788,15 @@ ; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load_float_seq_cst: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load_float_seq_cst: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: load_float_seq_cst: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX512-NEXT: retq %v = load atomic float, float* %fptr seq_cst, align 4 ret float %v } @@ -684,17 +829,29 @@ ; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: load_double_seq_cst: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $12, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 16 -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovlps %xmm0, (%esp) -; X86-AVX-NEXT: fldl (%esp) -; X86-AVX-NEXT: addl $12, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 4 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: load_double_seq_cst: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: subl $12, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovlps %xmm0, (%esp) +; X86-AVX1-NEXT: fldl (%esp) +; X86-AVX1-NEXT: addl $12, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: load_double_seq_cst: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: subl $12, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, (%esp) +; X86-AVX512-NEXT: fldl (%esp) +; X86-AVX512-NEXT: addl $12, %esp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX512-NEXT: retl ; ; X86-NOSSE-LABEL: load_double_seq_cst: ; X86-NOSSE: # %bb.0: @@ -717,10 +874,15 @@ ; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load_double_seq_cst: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load_double_seq_cst: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: load_double_seq_cst: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: retq %v = load atomic double, double* %fptr seq_cst, align 8 ret double %v } diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -321,7 +321,7 @@ ; CHECK-O0-NEXT: movl $32, %edi ; CHECK-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; CHECK-O0-NEXT: xorl %ecx, %ecx -; CHECK-O0-NEXT: callq __atomic_load +; CHECK-O0-NEXT: callq __atomic_load@PLT ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx @@ -347,9 +347,9 @@ ; CHECK-O3-NEXT: movq %rsp, %rdx ; CHECK-O3-NEXT: movl $32, %edi ; CHECK-O3-NEXT: xorl %ecx, %ecx -; CHECK-O3-NEXT: callq __atomic_load -; CHECK-O3-NEXT: vmovups (%rsp), %ymm0 -; 
CHECK-O3-NEXT: vmovups %ymm0, (%rbx) +; CHECK-O3-NEXT: callq __atomic_load@PLT +; CHECK-O3-NEXT: vmovdqu (%rsp), %ymm0 +; CHECK-O3-NEXT: vmovdqu %ymm0, (%rbx) ; CHECK-O3-NEXT: movq %rbx, %rax ; CHECK-O3-NEXT: addq $32, %rsp ; CHECK-O3-NEXT: .cfi_def_cfa_offset 16 @@ -378,7 +378,7 @@ ; CHECK-O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: movl $32, %edi -; CHECK-O0-NEXT: callq __atomic_store +; CHECK-O0-NEXT: callq __atomic_store@PLT ; CHECK-O0-NEXT: addq $40, %rsp ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O0-NEXT: retq @@ -396,7 +396,7 @@ ; CHECK-O3-NEXT: movl $32, %edi ; CHECK-O3-NEXT: movq %rax, %rsi ; CHECK-O3-NEXT: xorl %ecx, %ecx -; CHECK-O3-NEXT: callq __atomic_store +; CHECK-O3-NEXT: callq __atomic_store@PLT ; CHECK-O3-NEXT: addq $40, %rsp ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: retq @@ -430,8 +430,8 @@ ; ; CHECK-O3-EX-LABEL: vec_store: ; CHECK-O3-EX: # %bb.0: -; CHECK-O3-EX-NEXT: vmovss %xmm0, (%rdi) -; CHECK-O3-EX-NEXT: vextractps $1, %xmm0, 4(%rdi) +; CHECK-O3-EX-NEXT: vmovd %xmm0, (%rdi) +; CHECK-O3-EX-NEXT: vpextrd $1, %xmm0, 4(%rdi) ; CHECK-O3-EX-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 @@ -467,8 +467,8 @@ ; ; CHECK-O3-EX-LABEL: vec_store_unaligned: ; CHECK-O3-EX: # %bb.0: -; CHECK-O3-EX-NEXT: vmovss %xmm0, (%rdi) -; CHECK-O3-EX-NEXT: vextractps $1, %xmm0, 4(%rdi) +; CHECK-O3-EX-NEXT: vmovd %xmm0, (%rdi) +; CHECK-O3-EX-NEXT: vpextrd $1, %xmm0, 4(%rdi) ; CHECK-O3-EX-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 @@ -505,8 +505,8 @@ ; ; CHECK-O3-EX-LABEL: widen_broadcast2: ; CHECK-O3-EX: # %bb.0: -; CHECK-O3-EX-NEXT: vmovss %xmm0, (%rdi) -; CHECK-O3-EX-NEXT: vmovss %xmm0, 4(%rdi) +; CHECK-O3-EX-NEXT: vmovd %xmm0, (%rdi) +; CHECK-O3-EX-NEXT: vmovd %xmm0, 4(%rdi) ; CHECK-O3-EX-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %p1 = getelementptr i32, i32* %p0, i64 1 @@ -539,8 +539,8 @@ ; ; CHECK-O3-EX-LABEL: widen_broadcast2_unaligned: ; CHECK-O3-EX: # %bb.0: -; CHECK-O3-EX-NEXT: vmovss %xmm0, (%rdi) -; CHECK-O3-EX-NEXT: vmovss %xmm0, 4(%rdi) +; CHECK-O3-EX-NEXT: vmovd %xmm0, (%rdi) +; CHECK-O3-EX-NEXT: vmovd %xmm0, 4(%rdi) ; CHECK-O3-EX-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %p1 = getelementptr i32, i32* %p0, i64 1 diff --git a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll --- a/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb-overlaps.ll @@ -362,8 +362,8 @@ ; ; CHECK-AVX2-LABEL: test_overlap_4: ; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: vmovups -16(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi) +; CHECK-AVX2-NEXT: vmovdqu -16(%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rdi) ; CHECK-AVX2-NEXT: movslq %esi, %rax ; CHECK-AVX2-NEXT: movq %rax, -8(%rdi) ; CHECK-AVX2-NEXT: movl %eax, -16(%rdi) @@ -384,8 +384,8 @@ ; ; CHECK-AVX512-LABEL: test_overlap_4: ; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: vmovups -16(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi) +; CHECK-AVX512-NEXT: vmovdqu -16(%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rdi) ; CHECK-AVX512-NEXT: movslq %esi, %rax ; CHECK-AVX512-NEXT: movq %rax, -8(%rdi) ; CHECK-AVX512-NEXT: movl %eax, -16(%rdi) @@ -458,8 +458,8 @@ ; ; CHECK-AVX2-LABEL: test_overlap_5: ; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: vmovups -16(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi) +; CHECK-AVX2-NEXT: vmovdqu 
-16(%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rdi) ; CHECK-AVX2-NEXT: movslq %esi, %rax ; CHECK-AVX2-NEXT: movq %rax, -16(%rdi) ; CHECK-AVX2-NEXT: movb %al, -14(%rdi) @@ -480,8 +480,8 @@ ; ; CHECK-AVX512-LABEL: test_overlap_5: ; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: vmovups -16(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi) +; CHECK-AVX512-NEXT: vmovdqu -16(%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rdi) ; CHECK-AVX512-NEXT: movslq %esi, %rax ; CHECK-AVX512-NEXT: movq %rax, -16(%rdi) ; CHECK-AVX512-NEXT: movb %al, -14(%rdi) diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll --- a/llvm/test/CodeGen/X86/avoid-sfb.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb.ll @@ -48,8 +48,8 @@ ; CHECK-AVX2-NEXT: # %bb.1: # %if.then ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi) ; CHECK-AVX2-NEXT: .LBB0_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX2-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX2-NEXT: movl (%rdi), %eax ; CHECK-AVX2-NEXT: movl %eax, (%rsi) ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax @@ -65,8 +65,8 @@ ; CHECK-AVX512-NEXT: # %bb.1: # %if.then ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi) ; CHECK-AVX512-NEXT: .LBB0_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX512-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX512-NEXT: movl (%rdi), %eax ; CHECK-AVX512-NEXT: movl %eax, (%rsi) ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax @@ -203,8 +203,8 @@ ; CHECK-AVX2-NEXT: # %bb.3: # %if.then2 ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) ; CHECK-AVX2-NEXT: .LBB2_4: # %if.end3 -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX2-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX2-NEXT: movq (%rdi), %rax ; CHECK-AVX2-NEXT: movq %rax, (%rsi) ; CHECK-AVX2-NEXT: movl 8(%rdi), %eax @@ -225,8 +225,8 @@ ; CHECK-AVX512-NEXT: # %bb.3: # %if.then2 ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) ; CHECK-AVX512-NEXT: .LBB2_4: # %if.end3 -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX512-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX512-NEXT: movq (%rdi), %rax ; CHECK-AVX512-NEXT: movq %rax, (%rsi) ; CHECK-AVX512-NEXT: movl 8(%rdi), %eax @@ -306,8 +306,8 @@ ; CHECK-AVX2-NEXT: # %bb.1: # %if.then ; CHECK-AVX2-NEXT: movl %edx, 4(%rdi) ; CHECK-AVX2-NEXT: .LBB3_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX2-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX2-NEXT: movl (%rdi), %eax ; CHECK-AVX2-NEXT: movl %eax, (%rsi) ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax @@ -326,8 +326,8 @@ ; CHECK-AVX512-NEXT: # %bb.1: # %if.then ; CHECK-AVX512-NEXT: movl %edx, 4(%rdi) ; CHECK-AVX512-NEXT: .LBB3_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX512-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX512-NEXT: movl (%rdi), %eax ; CHECK-AVX512-NEXT: movl %eax, (%rsi) ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax @@ -399,8 +399,8 @@ ; CHECK-AVX2-NEXT: movslq %edx, %rax ; CHECK-AVX2-NEXT: movq %rax, 8(%rdi) ; CHECK-AVX2-NEXT: .LBB4_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX2-NEXT: vmovdqu (%r8), %xmm0 +; 
CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX2-NEXT: movq (%rdi), %rax ; CHECK-AVX2-NEXT: movq %rax, (%rsi) ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax @@ -415,8 +415,8 @@ ; CHECK-AVX512-NEXT: movslq %edx, %rax ; CHECK-AVX512-NEXT: movq %rax, 8(%rdi) ; CHECK-AVX512-NEXT: .LBB4_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX512-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX512-NEXT: movq (%rdi), %rax ; CHECK-AVX512-NEXT: movq %rax, (%rsi) ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax @@ -579,8 +579,8 @@ ; CHECK-AVX2: # %bb.0: # %entry ; CHECK-AVX2-NEXT: movl $0, 4(%rdi) ; CHECK-AVX2-NEXT: movl $0, 36(%rdi) -; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi) +; CHECK-AVX2-NEXT: vmovdqu 16(%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, 16(%rsi) ; CHECK-AVX2-NEXT: movl 32(%rdi), %eax ; CHECK-AVX2-NEXT: movl %eax, 32(%rsi) ; CHECK-AVX2-NEXT: movl 36(%rdi), %eax @@ -591,8 +591,8 @@ ; CHECK-AVX2-NEXT: movl %eax, (%rsi) ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi) +; CHECK-AVX2-NEXT: vmovdqu 8(%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, 8(%rsi) ; CHECK-AVX2-NEXT: movq 24(%rdi), %rax ; CHECK-AVX2-NEXT: movq %rax, 24(%rsi) ; CHECK-AVX2-NEXT: retq @@ -601,8 +601,8 @@ ; CHECK-AVX512: # %bb.0: # %entry ; CHECK-AVX512-NEXT: movl $0, 4(%rdi) ; CHECK-AVX512-NEXT: movl $0, 36(%rdi) -; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi) +; CHECK-AVX512-NEXT: vmovdqu 16(%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, 16(%rsi) ; CHECK-AVX512-NEXT: movl 32(%rdi), %eax ; CHECK-AVX512-NEXT: movl %eax, 32(%rsi) ; CHECK-AVX512-NEXT: movl 36(%rdi), %eax @@ -613,8 +613,8 @@ ; CHECK-AVX512-NEXT: movl %eax, (%rsi) ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi) +; CHECK-AVX512-NEXT: vmovdqu 8(%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, 8(%rsi) ; CHECK-AVX512-NEXT: movq 24(%rdi), %rax ; CHECK-AVX512-NEXT: movq %rax, 24(%rsi) ; CHECK-AVX512-NEXT: retq @@ -671,8 +671,8 @@ ; CHECK-AVX2-NEXT: # %bb.1: # %if.then ; CHECK-AVX2-NEXT: movw %dx, 2(%rdi) ; CHECK-AVX2-NEXT: .LBB7_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX2-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX2-NEXT: movzwl (%rdi), %eax ; CHECK-AVX2-NEXT: movw %ax, (%rsi) ; CHECK-AVX2-NEXT: movzwl 2(%rdi), %eax @@ -690,8 +690,8 @@ ; CHECK-AVX512-NEXT: # %bb.1: # %if.then ; CHECK-AVX512-NEXT: movw %dx, 2(%rdi) ; CHECK-AVX512-NEXT: .LBB7_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) +; CHECK-AVX512-NEXT: vmovdqu (%r8), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rcx) ; CHECK-AVX512-NEXT: movzwl (%rdi), %eax ; CHECK-AVX512-NEXT: movw %ax, (%rsi) ; CHECK-AVX512-NEXT: movzwl 2(%rdi), %eax @@ -765,16 +765,16 @@ ; CHECK-AVX2: # %bb.0: # %entry ; CHECK-AVX2-NEXT: movq %rdi, %rax ; CHECK-AVX2-NEXT: movl %esi, {{[0-9]+}}(%rsp) -; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi) +; CHECK-AVX2-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%rdi) ; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-AVX2-NEXT: movq %rcx, 16(%rdi) ; CHECK-AVX2-NEXT: 
movl {{[0-9]+}}(%rsp), %ecx ; CHECK-AVX2-NEXT: movl %ecx, 24(%rdi) ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-AVX2-NEXT: movl %ecx, 28(%rdi) -; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-AVX2-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%rsp) ; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -787,16 +787,16 @@ ; CHECK-AVX512: # %bb.0: # %entry ; CHECK-AVX512-NEXT: movq %rdi, %rax ; CHECK-AVX512-NEXT: movl %esi, {{[0-9]+}}(%rsp) -; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi) +; CHECK-AVX512-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%rdi) ; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-AVX512-NEXT: movq %rcx, 16(%rdi) ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-AVX512-NEXT: movl %ecx, 24(%rdi) ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-AVX512-NEXT: movl %ecx, 28(%rdi) -; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-AVX512-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%rsp) ; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %ecx @@ -841,13 +841,13 @@ ; CHECK-NEXT: movq %rsi, %r12 ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movl %r9d, 12(%rdi) -; CHECK-NEXT: callq bar +; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: cmpl $18, %ebp ; CHECK-NEXT: jl .LBB9_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: movl %ebp, 4(%rbx) ; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: callq bar +; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: .LBB9_2: # %if.end ; CHECK-NEXT: movups (%r15), %xmm0 ; CHECK-NEXT: movups %xmm0, (%r14) @@ -888,13 +888,13 @@ ; DISABLED-NEXT: movq %rsi, %r12 ; DISABLED-NEXT: movq %rdi, %rbx ; DISABLED-NEXT: movl %r9d, 12(%rdi) -; DISABLED-NEXT: callq bar +; DISABLED-NEXT: callq bar@PLT ; DISABLED-NEXT: cmpl $18, %ebp ; DISABLED-NEXT: jl .LBB9_2 ; DISABLED-NEXT: # %bb.1: # %if.then ; DISABLED-NEXT: movl %ebp, 4(%rbx) ; DISABLED-NEXT: movq %rbx, %rdi -; DISABLED-NEXT: callq bar +; DISABLED-NEXT: callq bar@PLT ; DISABLED-NEXT: .LBB9_2: # %if.end ; DISABLED-NEXT: movups (%r15), %xmm0 ; DISABLED-NEXT: movups %xmm0, (%r14) @@ -935,18 +935,18 @@ ; CHECK-AVX2-NEXT: movq %rsi, %r12 ; CHECK-AVX2-NEXT: movq %rdi, %rbx ; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX2-NEXT: callq bar +; CHECK-AVX2-NEXT: callq bar@PLT ; CHECK-AVX2-NEXT: cmpl $18, %ebp ; CHECK-AVX2-NEXT: jl .LBB9_2 ; CHECK-AVX2-NEXT: # %bb.1: # %if.then ; CHECK-AVX2-NEXT: movl %ebp, 4(%rbx) ; CHECK-AVX2-NEXT: movq %rbx, %rdi -; CHECK-AVX2-NEXT: callq bar +; CHECK-AVX2-NEXT: callq bar@PLT ; CHECK-AVX2-NEXT: .LBB9_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r15), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%r14) -; CHECK-AVX2-NEXT: vmovups (%rbx), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%r12) +; CHECK-AVX2-NEXT: vmovdqu (%r15), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%r14) +; CHECK-AVX2-NEXT: vmovdqu (%rbx), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%r12) ; CHECK-AVX2-NEXT: popq %rbx ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40 ; CHECK-AVX2-NEXT: popq %r12 @@ -982,18 +982,18 @@ ; CHECK-AVX512-NEXT: movq %rsi, %r12 ; CHECK-AVX512-NEXT: movq %rdi, %rbx ; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX512-NEXT: callq bar +; 
CHECK-AVX512-NEXT: callq bar@PLT ; CHECK-AVX512-NEXT: cmpl $18, %ebp ; CHECK-AVX512-NEXT: jl .LBB9_2 ; CHECK-AVX512-NEXT: # %bb.1: # %if.then ; CHECK-AVX512-NEXT: movl %ebp, 4(%rbx) ; CHECK-AVX512-NEXT: movq %rbx, %rdi -; CHECK-AVX512-NEXT: callq bar +; CHECK-AVX512-NEXT: callq bar@PLT ; CHECK-AVX512-NEXT: .LBB9_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r15), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%r14) -; CHECK-AVX512-NEXT: vmovups (%rbx), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%r12) +; CHECK-AVX512-NEXT: vmovdqu (%r15), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%r14) +; CHECK-AVX512-NEXT: vmovdqu (%rbx), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%r12) ; CHECK-AVX512-NEXT: popq %rbx ; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40 ; CHECK-AVX512-NEXT: popq %r12 @@ -1056,7 +1056,7 @@ ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: movl %edx, 4(%rbx) ; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: callq bar +; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: .LBB10_2: # %if.end ; CHECK-NEXT: movups (%r12), %xmm0 ; CHECK-NEXT: movups %xmm0, (%r15) @@ -1104,7 +1104,7 @@ ; DISABLED-NEXT: # %bb.1: # %if.then ; DISABLED-NEXT: movl %edx, 4(%rbx) ; DISABLED-NEXT: movq %rbx, %rdi -; DISABLED-NEXT: callq bar +; DISABLED-NEXT: callq bar@PLT ; DISABLED-NEXT: .LBB10_2: # %if.end ; DISABLED-NEXT: movups (%r15), %xmm0 ; DISABLED-NEXT: movups %xmm0, (%r14) @@ -1148,10 +1148,10 @@ ; CHECK-AVX2-NEXT: # %bb.1: # %if.then ; CHECK-AVX2-NEXT: movl %edx, 4(%rbx) ; CHECK-AVX2-NEXT: movq %rbx, %rdi -; CHECK-AVX2-NEXT: callq bar +; CHECK-AVX2-NEXT: callq bar@PLT ; CHECK-AVX2-NEXT: .LBB10_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r12), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%r15) +; CHECK-AVX2-NEXT: vmovdqu (%r12), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, (%r15) ; CHECK-AVX2-NEXT: movq (%rbx), %rax ; CHECK-AVX2-NEXT: movq %rax, (%r14) ; CHECK-AVX2-NEXT: movl 8(%rbx), %eax @@ -1196,10 +1196,10 @@ ; CHECK-AVX512-NEXT: # %bb.1: # %if.then ; CHECK-AVX512-NEXT: movl %edx, 4(%rbx) ; CHECK-AVX512-NEXT: movq %rbx, %rdi -; CHECK-AVX512-NEXT: callq bar +; CHECK-AVX512-NEXT: callq bar@PLT ; CHECK-AVX512-NEXT: .LBB10_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r12), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%r15) +; CHECK-AVX512-NEXT: vmovdqu (%r12), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, (%r15) ; CHECK-AVX512-NEXT: movq (%rbx), %rax ; CHECK-AVX512-NEXT: movq %rax, (%r14) ; CHECK-AVX512-NEXT: movl 8(%rbx), %eax @@ -1292,14 +1292,14 @@ ; CHECK-AVX2-NEXT: # %bb.1: # %if.then ; CHECK-AVX2-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 ; CHECK-AVX2-NEXT: .LBB11_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx) +; CHECK-AVX2-NEXT: vmovdqu (%r8), %ymm0 +; CHECK-AVX2-NEXT: vmovdqu %ymm0, (%rcx) ; CHECK-AVX2-NEXT: movl (%rdi), %eax ; CHECK-AVX2-NEXT: movl %eax, (%rsi) ; CHECK-AVX2-NEXT: movl 4(%rdi), %eax ; CHECK-AVX2-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi) +; CHECK-AVX2-NEXT: vmovdqu 8(%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, 8(%rsi) ; CHECK-AVX2-NEXT: movq 24(%rdi), %rax ; CHECK-AVX2-NEXT: movq %rax, 24(%rsi) ; CHECK-AVX2-NEXT: vzeroupper @@ -1312,14 +1312,14 @@ ; CHECK-AVX512-NEXT: # %bb.1: # %if.then ; CHECK-AVX512-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 ; CHECK-AVX512-NEXT: .LBB11_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx) +; CHECK-AVX512-NEXT: vmovdqu (%r8), %ymm0 +; CHECK-AVX512-NEXT: vmovdqu %ymm0, (%rcx) ; 
CHECK-AVX512-NEXT: movl (%rdi), %eax ; CHECK-AVX512-NEXT: movl %eax, (%rsi) ; CHECK-AVX512-NEXT: movl 4(%rdi), %eax ; CHECK-AVX512-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi) +; CHECK-AVX512-NEXT: vmovdqu 8(%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, 8(%rsi) ; CHECK-AVX512-NEXT: movq 24(%rdi), %rax ; CHECK-AVX512-NEXT: movq %rax, 24(%rsi) ; CHECK-AVX512-NEXT: vzeroupper @@ -1390,14 +1390,14 @@ ; CHECK-AVX2-NEXT: # %bb.1: # %if.then ; CHECK-AVX2-NEXT: movq $1, 8(%rdi) ; CHECK-AVX2-NEXT: .LBB12_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx) +; CHECK-AVX2-NEXT: vmovdqu (%r8), %ymm0 +; CHECK-AVX2-NEXT: vmovdqu %ymm0, (%rcx) ; CHECK-AVX2-NEXT: movq (%rdi), %rax ; CHECK-AVX2-NEXT: movq %rax, (%rsi) ; CHECK-AVX2-NEXT: movq 8(%rdi), %rax ; CHECK-AVX2-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi) +; CHECK-AVX2-NEXT: vmovdqu 16(%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, 16(%rsi) ; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: retq ; @@ -1408,14 +1408,14 @@ ; CHECK-AVX512-NEXT: # %bb.1: # %if.then ; CHECK-AVX512-NEXT: movq $1, 8(%rdi) ; CHECK-AVX512-NEXT: .LBB12_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx) +; CHECK-AVX512-NEXT: vmovdqu (%r8), %ymm0 +; CHECK-AVX512-NEXT: vmovdqu %ymm0, (%rcx) ; CHECK-AVX512-NEXT: movq (%rdi), %rax ; CHECK-AVX512-NEXT: movq %rax, (%rsi) ; CHECK-AVX512-NEXT: movq 8(%rdi), %rax ; CHECK-AVX512-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi) +; CHECK-AVX512-NEXT: vmovdqu 16(%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, 16(%rsi) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq entry: @@ -1455,15 +1455,15 @@ ; CHECK-AVX2-LABEL: test_alias: ; CHECK-AVX2: # %bb.0: # %entry ; CHECK-AVX2-NEXT: movl %esi, (%rdi) -; CHECK-AVX2-NEXT: vmovups (%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 4(%rdi) +; CHECK-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovdqu %xmm0, 4(%rdi) ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512-LABEL: test_alias: ; CHECK-AVX512: # %bb.0: # %entry ; CHECK-AVX512-NEXT: movl %esi, (%rdi) -; CHECK-AVX512-NEXT: vmovups (%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 4(%rdi) +; CHECK-AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovdqu %xmm0, 4(%rdi) ; CHECK-AVX512-NEXT: retq entry: %a = bitcast i8* %A to i32* diff --git a/llvm/test/CodeGen/X86/avx-cast.ll b/llvm/test/CodeGen/X86/avx-cast.ll --- a/llvm/test/CodeGen/X86/avx-cast.ll +++ b/llvm/test/CodeGen/X86/avx-cast.ll @@ -1,25 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; Prefer a blend instruction to a vinsert128 instruction because blends ; are simpler (no lane changes) and therefore will have equal or better ; performance. 
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp { -; AVX-LABEL: castA: -; AVX: ## %bb.0: -; AVX-NEXT: vmovaps %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: castA: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: castA: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> ret <8 x float> %shuffle.i } define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp { -; AVX-LABEL: castB: -; AVX: ## %bb.0: -; AVX-NEXT: vmovaps %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: castB: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: castB: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> ret <4 x double> %shuffle.i } @@ -27,10 +37,15 @@ ; AVX2 is needed for integer types. define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp { -; AVX-LABEL: castC: -; AVX: ## %bb.0: -; AVX-NEXT: vmovaps %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: castC: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: castC: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> ret <4 x i64> %shuffle.i } diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll --- a/llvm/test/CodeGen/X86/avx-insertelt.ll +++ b/llvm/test/CodeGen/X86/avx-insertelt.ll @@ -3,21 +3,33 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 define <8 x float> @insert_f32(<8 x float> %y, float %f, <8 x float> %x) { -; ALL-LABEL: insert_f32: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; ALL-NEXT: retq +; AVX-LABEL: insert_f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %y, float %f, i32 0 ret <8 x float> %i0 } define <4 x double> @insert_f64(<4 x double> %y, double %f, <4 x double> %x) { -; ALL-LABEL: insert_f64: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; ALL-NEXT: retq +; AVX-LABEL: insert_f64: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <4 x double> %y, double %f, i32 0 ret <4 x double> %i0 } diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -44,7 +44,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vinsertf128_pd_256_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc1,0x01] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1) ret <4 x double> %res @@ -59,7 +59,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vinsertf128_ps_256_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc1,0x01] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1) ret <8 x float> %res @@ -74,7 +74,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vinsertf128_si_256_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc1,0x01] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1) ret <8 x i32> %res @@ -84,12 +84,19 @@ ; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's ; not a vinsertf128 $1. define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vblendps $240, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x0c,0xc0,0xf0] -; CHECK-NEXT: # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_vinsertf128_si_256_2: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX-NEXT: vblendps $240, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x0c,0xc0,0xf0] +; AVX-NEXT: # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_vinsertf128_si_256_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: vpblendd $240, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x02,0xc0,0xf0] +; AVX512VL-NEXT: # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2) ret <8 x i32> %res } @@ -106,7 +113,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vextractf128_pd_256_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x39,0xc0,0x01] ; AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1) @@ -123,7 +130,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vextractf128_ps_256_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x39,0xc0,0x01] ; AVX512VL-NEXT: vzeroupper # encoding: 
[0xc5,0xf8,0x77] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1) @@ -140,7 +147,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vextractf128_si_256_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x39,0xc0,0x01] ; AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1) @@ -227,11 +234,17 @@ define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: test_x86_avx_blend_pd_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $192, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x0c,0xc0,0xc0] -; CHECK-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_blend_pd_256: +; AVX: # %bb.0: +; AVX-NEXT: vblendps $192, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x0c,0xc0,0xc0] +; AVX-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_blend_pd_256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd $192, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x02,0xc0,0xc0] +; AVX512VL-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -239,11 +252,17 @@ define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: test_x86_avx_blend_ps_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $7, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07] -; CHECK-NEXT: # ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_blend_ps_256: +; AVX: # %bb.0: +; AVX-NEXT: vblendps $7, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07] +; AVX-NEXT: # ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_blend_ps_256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd $7, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0x07] +; AVX512VL-NEXT: # ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -298,11 +317,17 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_sse41_blendpd: -; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03] -; CHECK-NEXT: # xmm0 = xmm0[0,1],xmm1[2,3] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_sse41_blendpd: +; AVX: # %bb.0: +; AVX-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03] +; AVX-NEXT: # xmm0 = xmm0[0,1],xmm1[2,3] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_sse41_blendpd: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x03] +; AVX512VL-NEXT: # xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x 
double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -310,11 +335,17 @@ define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse41_blendps: -; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] -; CHECK-NEXT: # xmm0 = xmm1[0,1,2],xmm0[3] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_sse41_blendps: +; AVX: # %bb.0: +; AVX-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] +; AVX-NEXT: # xmm0 = xmm1[0,1,2],xmm0[3] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_sse41_blendps: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd $8, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x08] +; AVX512VL-NEXT: # xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -699,7 +730,7 @@ ; X86-AVX512VL-LABEL: test_x86_sse_storeu_ps: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512VL-NEXT: vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_sse_storeu_ps: @@ -709,7 +740,7 @@ ; ; X64-AVX512VL-LABEL: test_x86_sse_storeu_ps: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512VL-NEXT: vmovdqu %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1) ret void @@ -819,7 +850,7 @@ ; X86-AVX512VL-LABEL: test_x86_avx_storeu_ps_256: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00] +; X86-AVX512VL-NEXT: vmovdqu %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x00] ; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -831,7 +862,7 @@ ; ; X64-AVX512VL-LABEL: test_x86_avx_storeu_ps_256: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovups %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] +; X64-AVX512VL-NEXT: vmovdqu %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07] ; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1) @@ -885,7 +916,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vpermil_ps: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps $7, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x07] +; AVX512VL-NEXT: vpshufd $7, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0x07] ; AVX512VL-NEXT: # xmm0 = xmm0[3,1,0,0] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1] @@ -903,7 +934,7 @@ ; ; AVX512VL-LABEL: test_x86_avx_vpermil_ps_256: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps $7, 
%ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x07] +; AVX512VL-NEXT: vpshufd $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xc0,0x07] ; AVX512VL-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -958,7 +958,7 @@ ; X86-AVX512VL-LABEL: movnt_ps: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vmovntps %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00] +; X86-AVX512VL-NEXT: vmovntdq %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00] ; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -970,7 +970,7 @@ ; ; X64-AVX512VL-LABEL: movnt_ps: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovntps %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x07] +; X64-AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x07] ; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll --- a/llvm/test/CodeGen/X86/avx-logic.ll +++ b/llvm/test/CodeGen/X86/avx-logic.ll @@ -37,10 +37,15 @@ } define <8 x float> @andps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; ANY-LABEL: andps256: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vandps %ymm0, %ymm1, %ymm0 -; ANY-NEXT: retq +; AVX1-LABEL: andps256: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: andps256: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpand %ymm0, %ymm1, %ymm0 +; INT256-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %1 = bitcast <8 x float> %y to <8 x i32> @@ -50,10 +55,15 @@ } define <8 x float> @andps256fold(<8 x float> %y) nounwind uwtable readnone ssp { -; ANY-LABEL: andps256fold: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; ANY-NEXT: retq +; AVX1-LABEL: andps256fold: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: andps256fold: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: retq entry: %0 = bitcast <8 x float> %y to <8 x i32> %and.i = and <8 x i32> %0, @@ -95,10 +105,15 @@ } define <8 x float> @xorps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; ANY-LABEL: xorps256: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vxorps %ymm0, %ymm1, %ymm0 -; ANY-NEXT: retq +; AVX1-LABEL: xorps256: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: xorps256: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpxor %ymm0, %ymm1, %ymm0 +; INT256-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %1 = bitcast <8 x float> %y to <8 x i32> @@ -108,10 +123,15 @@ } define <8 x float> @xorps256fold(<8 x float> %y) nounwind uwtable readnone ssp { -; ANY-LABEL: xorps256fold: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; 
ANY-NEXT: retq +; AVX1-LABEL: xorps256fold: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: xorps256fold: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: retq entry: %0 = bitcast <8 x float> %y to <8 x i32> %xor.i = xor <8 x i32> %0, @@ -153,10 +173,15 @@ } define <8 x float> @orps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; ANY-LABEL: orps256: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; ANY-NEXT: retq +; AVX1-LABEL: orps256: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: orps256: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpor %ymm0, %ymm1, %ymm0 +; INT256-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %1 = bitcast <8 x float> %y to <8 x i32> @@ -166,10 +191,15 @@ } define <8 x float> @orps256fold(<8 x float> %y) nounwind uwtable readnone ssp { -; ANY-LABEL: orps256fold: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; ANY-NEXT: retq +; AVX1-LABEL: orps256fold: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: orps256fold: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: retq entry: %0 = bitcast <8 x float> %y to <8 x i32> %or.i = or <8 x i32> %0, @@ -215,10 +245,15 @@ } define <8 x float> @andnotps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; ANY-LABEL: andnotps256: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; ANY-NEXT: retq +; AVX1-LABEL: andnotps256: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: andnotps256: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; INT256-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %neg.i = xor <8 x i32> %0, @@ -229,10 +264,15 @@ } define <8 x float> @andnotps256fold(<8 x float> %y, <8 x float>* nocapture %x) nounwind uwtable readonly ssp { -; ANY-LABEL: andnotps256fold: -; ANY: # %bb.0: # %entry -; ANY-NEXT: vandnps (%rdi), %ymm0, %ymm0 -; ANY-NEXT: retq +; AVX1-LABEL: andnotps256fold: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vandnps (%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; INT256-LABEL: andnotps256fold: +; INT256: # %bb.0: # %entry +; INT256-NEXT: vpandn (%rdi), %ymm0, %ymm0 +; INT256-NEXT: retq entry: %tmp2 = load <8 x float>, <8 x float>* %x, align 32 %0 = bitcast <8 x float> %y to <8 x i32> @@ -280,8 +320,8 @@ ; ; INT256-LABEL: and_xor_splat1_v4i32: ; INT256: # %bb.0: -; INT256-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; INT256-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; INT256-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; INT256-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; INT256-NEXT: retq %xor = xor <4 x i32> %x, %and = and <4 x i32> %xor, @@ -296,8 +336,8 @@ ; ; INT256-LABEL: and_xor_splat1_v4i64: ; INT256: # %bb.0: -; INT256-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] -; INT256-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; INT256-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; INT256-NEXT: retq %xor = xor <4 x i64> %x, %and = and <4 x i64> %xor, diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll --- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -10,7 +10,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_45670123: ; AVX2: # %bb.0: # %entry 
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -25,7 +25,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_45670123_mem: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[2,3,0,1] ; AVX2-NEXT: retq entry: %a = load <8 x float>, <8 x float>* %pa @@ -35,10 +35,15 @@ } define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_0123cdef: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_0123cdef: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_0123cdef: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -52,7 +57,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_01230123: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -67,7 +72,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_01230123_mem: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: retq entry: %a = load <8 x float>, <8 x float>* %pa @@ -84,7 +89,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_45674567: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -99,7 +104,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_45674567_mem: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[2,3,2,3] ; AVX2-NEXT: retq entry: %a = load <8 x float>, <8 x float>* %pa @@ -116,7 +121,7 @@ ; ; AVX2-LABEL: shuffle_v32i8_2323: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -147,10 +152,15 @@ } define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v4i64_6701: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_6701: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_6701: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -248,10 +258,15 @@ ;;;; Cases with undef indicies mixed in the mask define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu67u9ub: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu67u9ub: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm0[2,3],ymm1[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu67u9ub: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -265,7 +280,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_uu67uu67: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -273,20 +288,30 @@ } define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu67uuab: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu67uuab: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu67uuab: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu67uuef: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu67uuef: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu67uuef: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -300,7 +325,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_uu674567: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -308,10 +333,15 @@ } define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu6789ab: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu6789ab: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu6789ab: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -325,7 +355,7 @@ ; ; AVX2-LABEL: shuffle_v8f32_4567uu67: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -333,10 +363,15 @@ } define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_4567uuef: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_4567uuef: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v8f32_4567uuef: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -345,11 +380,17 @@ ;;;; Cases we must not select vperm2f128 define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -; ALL-LABEL: shuffle_v8f32_uu67ucuf: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uu67ucuf: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_uu67ucuf: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -361,155 +402,247 @@ ;; unless building for optsize where we should still use vperm2f128. define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_zz01: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz01: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz01: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize { -; ALL-LABEL: shuffle_v4f64_zz01_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz01_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz01_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_zz23: -; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz23: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz23: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize { -; ALL-LABEL: shuffle_v4f64_zz23_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz23_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz23_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x 
double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz23_pgso(<4 x double> %a) !prof !14 { -; ALL-LABEL: shuffle_v4f64_zz23_pgso: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz23_pgso: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz23_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_zz45: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz45: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz45: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize { -; ALL-LABEL: shuffle_v4f64_zz45_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz45_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz45_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_zz67: -; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz67: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz67: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize { -; ALL-LABEL: shuffle_v4f64_zz67_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz67_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz67_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_zz67_pgso(<4 x double> %a) !prof !14 { -; ALL-LABEL: shuffle_v4f64_zz67_pgso: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_zz67_pgso: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_zz67_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3] +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x 
double> %s } define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_01zz: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps %xmm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_01zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_01zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize { -; ALL-LABEL: shuffle_v4f64_01zz_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps %xmm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_01zz_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_01zz_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_23zz: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_23zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_23zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize { -; ALL-LABEL: shuffle_v4f64_23zz_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_23zz_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_23zz_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_45zz: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps %xmm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_45zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_45zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize { -; ALL-LABEL: shuffle_v4f64_45zz_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps %xmm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_45zz_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_45zz_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) { -; ALL-LABEL: shuffle_v4f64_67zz: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_67zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_67zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize { -; ALL-LABEL: 
shuffle_v4f64_67zz_optsize: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_67zz_optsize: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_67zz_optsize: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: retq %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32> ret <4 x double> %s } diff --git a/llvm/test/CodeGen/X86/avx-vzeroupper.ll b/llvm/test/CodeGen/X86/avx-vzeroupper.ll --- a/llvm/test/CodeGen/X86/avx-vzeroupper.ll +++ b/llvm/test/CodeGen/X86/avx-vzeroupper.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,VZ -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,VZ +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,VZ,VZ-AVX1 +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,VZ,VZ-AVX512 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,-vzeroupper | FileCheck %s --check-prefixes=ALL,DISABLE-VZ ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=ALL,BDVER2 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefixes=ALL,BTVER2 @@ -30,19 +30,33 @@ ;; Check parameter 256-bit parameter passing define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind { -; VZ-LABEL: test01: -; VZ: # %bb.0: -; VZ-NEXT: subq $56, %rsp -; VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; VZ-NEXT: vzeroupper -; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; VZ-NEXT: addq $56, %rsp -; VZ-NEXT: retq +; VZ-AVX1-LABEL: test01: +; VZ-AVX1: # %bb.0: +; VZ-AVX1-NEXT: subq $56, %rsp +; VZ-AVX1-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; VZ-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; VZ-AVX1-NEXT: vzeroupper +; VZ-AVX1-NEXT: callq do_sse +; VZ-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-AVX1-NEXT: callq do_sse +; VZ-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; VZ-AVX1-NEXT: addq $56, %rsp +; VZ-AVX1-NEXT: retq +; +; VZ-AVX512-LABEL: test01: +; VZ-AVX512: # %bb.0: +; VZ-AVX512-NEXT: subq $56, %rsp +; VZ-AVX512-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; VZ-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; VZ-AVX512-NEXT: vzeroupper +; VZ-AVX512-NEXT: callq do_sse +; VZ-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; VZ-AVX512-NEXT: callq do_sse +; VZ-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; VZ-AVX512-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; VZ-AVX512-NEXT: addq $56, %rsp +; VZ-AVX512-NEXT: retq ; ; DISABLE-VZ-LABEL: test01: ; DISABLE-VZ: # %bb.0: @@ -125,34 +139,63 @@ ;; for this function it should be only once define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind { -; VZ-LABEL: test03: -; VZ: # %bb.0: # %entry -; VZ-NEXT: pushq %rbx -; VZ-NEXT: subq $16, %rsp -; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; VZ-NEXT: .p2align 4, 0x90 -; VZ-NEXT: .LBB3_1: 
# %while.cond -; VZ-NEXT: # =>This Inner Loop Header: Depth=1 -; VZ-NEXT: callq foo -; VZ-NEXT: testl %eax, %eax -; VZ-NEXT: jne .LBB3_1 -; VZ-NEXT: # %bb.2: # %for.body.preheader -; VZ-NEXT: movl $4, %ebx -; VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; VZ-NEXT: .p2align 4, 0x90 -; VZ-NEXT: .LBB3_3: # %for.body -; VZ-NEXT: # =>This Inner Loop Header: Depth=1 -; VZ-NEXT: callq do_sse -; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 -; VZ-NEXT: callq do_sse -; VZ-NEXT: decl %ebx -; VZ-NEXT: jne .LBB3_3 -; VZ-NEXT: # %bb.4: # %for.end -; VZ-NEXT: addq $16, %rsp -; VZ-NEXT: popq %rbx -; VZ-NEXT: retq +; VZ-AVX1-LABEL: test03: +; VZ-AVX1: # %bb.0: # %entry +; VZ-AVX1-NEXT: pushq %rbx +; VZ-AVX1-NEXT: subq $16, %rsp +; VZ-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; VZ-AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; VZ-AVX1-NEXT: .p2align 4, 0x90 +; VZ-AVX1-NEXT: .LBB3_1: # %while.cond +; VZ-AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-AVX1-NEXT: callq foo +; VZ-AVX1-NEXT: testl %eax, %eax +; VZ-AVX1-NEXT: jne .LBB3_1 +; VZ-AVX1-NEXT: # %bb.2: # %for.body.preheader +; VZ-AVX1-NEXT: movl $4, %ebx +; VZ-AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; VZ-AVX1-NEXT: .p2align 4, 0x90 +; VZ-AVX1-NEXT: .LBB3_3: # %for.body +; VZ-AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-AVX1-NEXT: callq do_sse +; VZ-AVX1-NEXT: callq do_sse +; VZ-AVX1-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; VZ-AVX1-NEXT: callq do_sse +; VZ-AVX1-NEXT: decl %ebx +; VZ-AVX1-NEXT: jne .LBB3_3 +; VZ-AVX1-NEXT: # %bb.4: # %for.end +; VZ-AVX1-NEXT: addq $16, %rsp +; VZ-AVX1-NEXT: popq %rbx +; VZ-AVX1-NEXT: retq +; +; VZ-AVX512-LABEL: test03: +; VZ-AVX512: # %bb.0: # %entry +; VZ-AVX512-NEXT: pushq %rbx +; VZ-AVX512-NEXT: subq $16, %rsp +; VZ-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; VZ-AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; VZ-AVX512-NEXT: .p2align 4, 0x90 +; VZ-AVX512-NEXT: .LBB3_1: # %while.cond +; VZ-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-AVX512-NEXT: callq foo +; VZ-AVX512-NEXT: testl %eax, %eax +; VZ-AVX512-NEXT: jne .LBB3_1 +; VZ-AVX512-NEXT: # %bb.2: # %for.body.preheader +; VZ-AVX512-NEXT: movl $4, %ebx +; VZ-AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; VZ-AVX512-NEXT: .p2align 4, 0x90 +; VZ-AVX512-NEXT: .LBB3_3: # %for.body +; VZ-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-AVX512-NEXT: callq do_sse +; VZ-AVX512-NEXT: callq do_sse +; VZ-AVX512-NEXT: vmovdqa g+{{.*}}(%rip), %xmm0 +; VZ-AVX512-NEXT: callq do_sse +; VZ-AVX512-NEXT: decl %ebx +; VZ-AVX512-NEXT: jne .LBB3_3 +; VZ-AVX512-NEXT: # %bb.4: # %for.end +; VZ-AVX512-NEXT: addq $16, %rsp +; VZ-AVX512-NEXT: popq %rbx +; VZ-AVX512-NEXT: retq ; ; DISABLE-VZ-LABEL: test03: ; DISABLE-VZ: # %bb.0: # %entry @@ -268,16 +311,27 @@ ;; Check that we also perform vzeroupper when we return from a function. 
define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind { -; VZ-LABEL: test04: -; VZ: # %bb.0: -; VZ-NEXT: pushq %rax -; VZ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VZ-NEXT: callq do_avx -; VZ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; VZ-NEXT: popq %rax -; VZ-NEXT: vzeroupper -; VZ-NEXT: retq +; VZ-AVX1-LABEL: test04: +; VZ-AVX1: # %bb.0: +; VZ-AVX1-NEXT: pushq %rax +; VZ-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; VZ-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VZ-AVX1-NEXT: callq do_avx +; VZ-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; VZ-AVX1-NEXT: popq %rax +; VZ-AVX1-NEXT: vzeroupper +; VZ-AVX1-NEXT: retq +; +; VZ-AVX512-LABEL: test04: +; VZ-AVX512: # %bb.0: +; VZ-AVX512-NEXT: pushq %rax +; VZ-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; VZ-AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; VZ-AVX512-NEXT: callq do_avx +; VZ-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; VZ-AVX512-NEXT: popq %rax +; VZ-AVX512-NEXT: vzeroupper +; VZ-AVX512-NEXT: retq ; ; DISABLE-VZ-LABEL: test04: ; DISABLE-VZ: # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -289,12 +289,12 @@ define <8 x i32> @mul_const5(<8 x i32> %x) { ; X32-LABEL: mul_const5: ; X32: # %bb.0: -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const5: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-NEXT: retq %y = mul <8 x i32> %x, ret <8 x i32> %y diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -14,8 +14,8 @@ ; ; X32-FAST-LABEL: trunc4: ; X32-FAST: # %bb.0: -; X32-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; X32-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X32-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X32-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; X32-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X32-FAST-NEXT: vzeroupper ; X32-FAST-NEXT: retl @@ -29,8 +29,8 @@ ; ; X64-FAST-LABEL: trunc4: ; X64-FAST: # %bb.0: -; X64-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; X64-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X64-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X64-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; X64-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X64-FAST-NEXT: vzeroupper ; X64-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -164,7 +164,7 @@ define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_and_si256: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = and <4 x i64> %a0, %a1 ret <4 x i64> %res @@ -223,7 +223,7 @@ define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: test_mm_blend_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = 
bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> @@ -235,7 +235,7 @@ define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_blend_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> @@ -283,7 +283,7 @@ define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { ; CHECK-LABEL: test_mm_broadcastd_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer @@ -294,7 +294,7 @@ define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_broadcastd_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer @@ -305,7 +305,7 @@ define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { ; CHECK-LABEL: test_mm_broadcastq_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %res @@ -314,7 +314,7 @@ define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_broadcastq_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer ret <4 x i64> %res @@ -323,7 +323,7 @@ define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) { ; CHECK-LABEL: test_mm_broadcastsd_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %res @@ -332,7 +332,7 @@ define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) { ; CHECK-LABEL: test_mm256_broadcastsd_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer ret <4 x double> %res @@ -342,7 +342,7 @@ ; CHECK-LABEL: test_mm256_broadcastsi128_si256: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -352,12 +352,12 @@ ; X86-LABEL: test_mm256_broadcastsi128_si256_mem: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_broadcastsi128_si256_mem: ; X64: # %bb.0: -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-NEXT: retq %a0 = load <2 x i64>, <2 x 
i64>* %p0 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> @@ -367,7 +367,7 @@ define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { ; CHECK-LABEL: test_mm_broadcastss_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %res @@ -376,7 +376,7 @@ define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) { ; CHECK-LABEL: test_mm256_broadcastss_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer ret <8 x float> %res @@ -659,7 +659,7 @@ define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind { ; CHECK-LABEL: test_mm256_extracti128_si256: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> @@ -1434,7 +1434,7 @@ ; CHECK-LABEL: test0_mm256_inserti128_si256: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> @@ -1444,7 +1444,7 @@ define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { ; CHECK-LABEL: test1_mm256_inserti128_si256: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> @@ -1898,7 +1898,7 @@ define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_or_si256: ; CHECK: # %bb.0: -; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = or <4 x i64> %a0, %a1 ret <4 x i64> %res @@ -1959,7 +1959,7 @@ define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_permute2x128_si256: ; CHECK: # %bb.0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> ret <4 x i64> %res @@ -1969,7 +1969,7 @@ define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_permute4x64_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -1978,7 +1978,7 @@ define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) { ; CHECK-LABEL: test_mm256_permute4x64_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,1,0] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> ret <4 x double> %res @@ -1987,7 +1987,7 @@ define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: 
test_mm256_permutevar8x32_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> @@ -2000,7 +2000,7 @@ define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_permutevar8x32_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg1 = bitcast <4 x i64> %a1 to <8 x i32> %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1) @@ -2023,7 +2023,7 @@ define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_shuffle_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> @@ -2576,7 +2576,7 @@ define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_unpackhi_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> @@ -2588,7 +2588,7 @@ define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_unpackhi_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> ret <4 x i64> %res @@ -2621,7 +2621,7 @@ define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_unpacklo_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %arg1 = bitcast <4 x i64> %a1 to <8 x i32> @@ -2633,7 +2633,7 @@ define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_unpacklo_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> ret <4 x i64> %res @@ -2642,7 +2642,7 @@ define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_xor_si256: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = xor <4 x i64> %a0, %a1 ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -18,7 +18,7 @@ define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: 
test_x86_avx2_pblendd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; CHECK-NEXT: ret{{[l|q]}} %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res @@ -29,7 +29,7 @@ define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: test_x86_avx2_pblendd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res @@ -112,7 +112,7 @@ define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) { ; CHECK-LABEL: test_x86_avx2_vextracti128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7) @@ -124,7 +124,7 @@ define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: test_x86_avx2_vinserti128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7) ret <4 x i64> %res @@ -135,7 +135,7 @@ define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0) ret <4 x double> %res @@ -146,7 +146,7 @@ define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0) ret <4 x float> %res @@ -157,7 +157,7 @@ define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0) ret <8 x float> %res @@ -212,7 +212,7 @@ define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) { ; CHECK-LABEL: test_x86_avx2_pbroadcastd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ret <4 x i32> %res @@ -223,7 +223,7 @@ define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) { ; CHECK-LABEL: test_x86_avx2_pbroadcastd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ret <8 x i32> %res @@ -234,7 +234,7 @@ define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) { ; CHECK-LABEL: test_x86_avx2_pbroadcastq_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 
; CHECK-NEXT: ret{{[l|q]}} %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) ret <2 x i64> %res @@ -245,7 +245,7 @@ define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) { ; CHECK-LABEL: test_x86_avx2_pbroadcastq_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ret <4 x i64> %res @@ -564,7 +564,7 @@ define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_x86_avx2_vperm2i128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; CHECK-NEXT: ret{{[l|q]}} %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -23,29 +23,29 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() { ; X86-AVX-LABEL: test_x86_avx2_packssdw_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packssdw_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packssdw_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packssdw_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> ) @@ -72,29 +72,29 @@ define <32 x i8> 
@test_x86_avx2_packsswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> zeroinitializer) @@ -121,29 +121,29 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: 
[0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> zeroinitializer) @@ -815,29 +815,29 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() { ; X86-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] +; 
X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) @@ -999,7 +999,7 @@ define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_x86_avx2_pblendd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] +; CHECK-NEXT: vpblendd $8, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x08] ; CHECK-NEXT: # xmm0 = xmm1[0,1,2],xmm0[3] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1] @@ -1011,7 +1011,7 @@ define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: test_x86_avx2_pblendd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $7, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07] +; CHECK-NEXT: vpblendd $7, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0x07] ; CHECK-NEXT: # ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1] @@ -1026,12 +1026,12 @@ define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) { ; AVX2-LABEL: test_x86_avx2_permd: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x36,0xc0] ; AVX2-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx2_permd: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res @@ -1045,12 +1045,12 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) { ; AVX2-LABEL: test_x86_avx2_permps: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x36,0xc0] ; AVX2-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx2_permps: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll --- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll +++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll @@ -787,12 +787,12 @@ define <2 x double> @masked_gather_zeromask(<2 x double*>* %ptr, <2 x double> %dummy, <2 x double> %passthru) { ; X86-LABEL: masked_gather_zeromask: ; X86: # %bb.0: # %entry -; X86-NEXT: vmovaps %xmm1, %xmm0 +; X86-NEXT: 
vmovdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: masked_gather_zeromask: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps %xmm1, %xmm0 +; X64-NEXT: vmovdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; NOGATHER-LABEL: masked_gather_zeromask: diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -145,12 +145,12 @@ ; X32-LABEL: D32: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss (%eax), %xmm0 +; X32-NEXT: vpbroadcastd (%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: D32: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss (%rdi), %xmm0 +; X64-NEXT: vpbroadcastd (%rdi), %xmm0 ; X64-NEXT: retq entry: %q = load i32, i32* %ptr, align 4 @@ -165,12 +165,12 @@ ; X32-LABEL: DD32: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss (%eax), %ymm0 +; X32-NEXT: vpbroadcastd (%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: DD32: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss (%rdi), %ymm0 +; X64-NEXT: vpbroadcastd (%rdi), %ymm0 ; X64-NEXT: retq entry: %q = load i32, i32* %ptr, align 4 @@ -189,12 +189,12 @@ ; X32-LABEL: Q64: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vpbroadcastq (%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: Q64: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vpbroadcastq (%rdi), %xmm0 ; X64-NEXT: retq entry: %q = load i64, i64* %ptr, align 4 @@ -207,12 +207,12 @@ ; X32-LABEL: QQ64: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastsd (%eax), %ymm0 +; X32-NEXT: vpbroadcastq (%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: QQ64: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-NEXT: vpbroadcastq (%rdi), %ymm0 ; X64-NEXT: retq entry: %q = load i64, i64* %ptr, align 4 @@ -227,12 +227,12 @@ ; X32-LABEL: broadcast_mem_v4i16_v8i16: ; X32: ## %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vpbroadcastq (%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: broadcast_mem_v4i16_v8i16: ; X64: ## %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vpbroadcastq (%rdi), %xmm0 ; X64-NEXT: retq %load = load <4 x i16>, <4 x i16>* %ptr %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <8 x i32> @@ -243,12 +243,12 @@ ; X32-LABEL: broadcast_mem_v4i16_v16i16: ; X32: ## %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastsd (%eax), %ymm0 +; X32-NEXT: vpbroadcastq (%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: broadcast_mem_v4i16_v16i16: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-NEXT: vpbroadcastq (%rdi), %ymm0 ; X64-NEXT: retq %load = load <4 x i16>, <4 x i16>* %ptr %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <16 x i32> @@ -363,12 +363,12 @@ ; X32-LABEL: load_splat_4i32_4i32_1111: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss 4(%eax), %xmm0 +; X32-NEXT: vpbroadcastd 4(%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_4i32_4i32_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss 4(%rdi), %xmm0 +; X64-NEXT: vpbroadcastd 4(%rdi), %xmm0 ; X64-NEXT: retq entry: %ld = load <4 x i32>, <4 x i32>* %ptr @@ -380,12 +380,12 @@ ; X32-LABEL: load_splat_8i32_4i32_33333333: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X32-NEXT: vbroadcastss 12(%eax), %ymm0 +; X32-NEXT: vpbroadcastd 12(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_8i32_4i32_33333333: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss 12(%rdi), %ymm0 +; X64-NEXT: vpbroadcastd 12(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <4 x i32>, <4 x i32>* %ptr @@ -397,12 +397,12 @@ ; X32-LABEL: load_splat_8i32_8i32_55555555: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss 20(%eax), %ymm0 +; X32-NEXT: vpbroadcastd 20(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_8i32_8i32_55555555: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss 20(%rdi), %ymm0 +; X64-NEXT: vpbroadcastd 20(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <8 x i32>, <8 x i32>* %ptr @@ -414,12 +414,12 @@ ; X32-LABEL: load_splat_4f32_4f32_1111: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss 4(%eax), %xmm0 +; X32-NEXT: vpbroadcastd 4(%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_4f32_4f32_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss 4(%rdi), %xmm0 +; X64-NEXT: vpbroadcastd 4(%rdi), %xmm0 ; X64-NEXT: retq entry: %ld = load <4 x float>, <4 x float>* %ptr @@ -431,12 +431,12 @@ ; X32-LABEL: load_splat_8f32_4f32_33333333: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss 12(%eax), %ymm0 +; X32-NEXT: vpbroadcastd 12(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_8f32_4f32_33333333: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss 12(%rdi), %ymm0 +; X64-NEXT: vpbroadcastd 12(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <4 x float>, <4 x float>* %ptr @@ -448,12 +448,12 @@ ; X32-LABEL: load_splat_8f32_8f32_55555555: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss 20(%eax), %ymm0 +; X32-NEXT: vpbroadcastd 20(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_8f32_8f32_55555555: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastss 20(%rdi), %ymm0 +; X64-NEXT: vpbroadcastd 20(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <8 x float>, <8 x float>* %ptr @@ -465,12 +465,12 @@ ; X32-LABEL: load_splat_2i64_2i64_1111: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vpbroadcastq 8(%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_2i64_2i64_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vpbroadcastq 8(%rdi), %xmm0 ; X64-NEXT: retq entry: %ld = load <2 x i64>, <2 x i64>* %ptr @@ -482,12 +482,12 @@ ; X32-LABEL: load_splat_4i64_2i64_1111: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastsd 8(%eax), %ymm0 +; X32-NEXT: vpbroadcastq 8(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_4i64_2i64_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0 +; X64-NEXT: vpbroadcastq 8(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <2 x i64>, <2 x i64>* %ptr @@ -499,12 +499,12 @@ ; X32-LABEL: load_splat_4i64_4i64_2222: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastsd 16(%eax), %ymm0 +; X32-NEXT: vpbroadcastq 16(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_4i64_4i64_2222: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0 +; X64-NEXT: vpbroadcastq 16(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <4 x i64>, <4 x i64>* %ptr @@ -516,12 +516,12 @@ ; X32-LABEL: load_splat_2f64_2f64_1111: ; X32: ## %bb.0: ## 
%entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vpbroadcastq 8(%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_2f64_2f64_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vpbroadcastq 8(%rdi), %xmm0 ; X64-NEXT: retq entry: %ld = load <2 x double>, <2 x double>* %ptr @@ -533,12 +533,12 @@ ; X32-LABEL: load_splat_4f64_2f64_1111: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastsd 8(%eax), %ymm0 +; X32-NEXT: vpbroadcastq 8(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_4f64_2f64_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0 +; X64-NEXT: vpbroadcastq 8(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <2 x double>, <2 x double>* %ptr @@ -550,12 +550,12 @@ ; X32-LABEL: load_splat_4f64_4f64_2222: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastsd 16(%eax), %ymm0 +; X32-NEXT: vpbroadcastq 16(%eax), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_4f64_4f64_2222: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0 +; X64-NEXT: vpbroadcastq 16(%rdi), %ymm0 ; X64-NEXT: retq entry: %ld = load <4 x double>, <4 x double>* %ptr @@ -569,12 +569,12 @@ ; X32-LABEL: I: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vpbroadcastq (%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: I: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vpbroadcastq (%rdi), %xmm0 ; X64-NEXT: retq entry: %q = load double, double* %ptr, align 4 @@ -640,12 +640,12 @@ define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp { ; X32-LABEL: _e2: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X32-NEXT: vpbroadcastd {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X32-NEXT: retl ; ; X64-LABEL: _e2: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X64-NEXT: retq %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1 @@ -657,12 +657,12 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp { ; X32-LABEL: _e4: ; X32: ## %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X32-NEXT: vmovdqa {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> ; X32-NEXT: retl ; ; X64-LABEL: _e4: ; X64: ## %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X64-NEXT: vmovdqa {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> ; X64-NEXT: retq %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1 @@ -728,7 +728,7 @@ define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp { ; X32-LABEL: _inreg0: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 +; X32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 ; X32-NEXT: retl ; ; X64-AVX2-LABEL: _inreg0: @@ -749,12 +749,12 @@ define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp { ; X32-LABEL: _inreg1: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 +; X32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 ; X32-NEXT: retl ; ; 
X64-LABEL: _inreg1: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %ymm0 +; X64-NEXT: vpbroadcastd %xmm0, %ymm0 ; X64-NEXT: retq %in = insertelement <8 x float> undef, float %scalar, i32 0 %wide = shufflevector <8 x float> %in, <8 x float> undef, <8 x i32> zeroinitializer @@ -764,12 +764,12 @@ define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp { ; X32-LABEL: _inreg2: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg2: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd %xmm0, %xmm0 ; X64-NEXT: retq %in = insertelement <4 x float> undef, float %scalar, i32 0 %wide = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> zeroinitializer @@ -779,12 +779,12 @@ define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp { ; X32-LABEL: _inreg3: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; X32-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg3: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %ymm0 +; X64-NEXT: vpbroadcastq %xmm0, %ymm0 ; X64-NEXT: retq %in = insertelement <4 x double> undef, double %scalar, i32 0 %wide = shufflevector <4 x double> %in, <4 x double> undef, <4 x i32> zeroinitializer @@ -794,12 +794,12 @@ define <8 x float> @_inreg8xfloat(<8 x float> %a) { ; X32-LABEL: _inreg8xfloat: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss %xmm0, %ymm0 +; X32-NEXT: vpbroadcastd %xmm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg8xfloat: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %ymm0 +; X64-NEXT: vpbroadcastd %xmm0, %ymm0 ; X64-NEXT: retq %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer ret <8 x float> %b @@ -808,12 +808,12 @@ define <4 x float> @_inreg4xfloat(<4 x float> %a) { ; X32-LABEL: _inreg4xfloat: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss %xmm0, %xmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg4xfloat: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd %xmm0, %xmm0 ; X64-NEXT: retq %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %b @@ -850,12 +850,12 @@ define <4 x i64> @_inreg4xi64(<4 x i64> %a) { ; X32-LABEL: _inreg4xi64: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastsd %xmm0, %ymm0 +; X32-NEXT: vpbroadcastq %xmm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg4xi64: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %ymm0 +; X64-NEXT: vpbroadcastq %xmm0, %ymm0 ; X64-NEXT: retq %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer ret <4 x i64> %b @@ -864,12 +864,12 @@ define <2 x i64> @_inreg2xi64(<2 x i64> %a) { ; X32-LABEL: _inreg2xi64: ; X32: ## %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X32-NEXT: vpbroadcastq %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg2xi64: ; X64: ## %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X64-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-NEXT: retq %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %b @@ -878,12 +878,12 @@ define <4 x double> @_inreg4xdouble(<4 x double> %a) { ; X32-LABEL: _inreg4xdouble: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastsd %xmm0, %ymm0 +; X32-NEXT: vpbroadcastq %xmm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg4xdouble: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %ymm0 +; X64-NEXT: vpbroadcastq %xmm0, %ymm0 ; X64-NEXT: retq %b = shufflevector <4 x double> %a, <4 
x double> undef, <4 x i32> zeroinitializer ret <4 x double> %b @@ -892,12 +892,12 @@ define <2 x double> @_inreg2xdouble(<2 x double> %a) { ; X32-LABEL: _inreg2xdouble: ; X32: ## %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X32-NEXT: vpbroadcastq %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg2xdouble: ; X64: ## %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X64-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-NEXT: retq %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %b @@ -906,12 +906,12 @@ define <8 x i32> @_inreg8xi32(<8 x i32> %a) { ; X32-LABEL: _inreg8xi32: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss %xmm0, %ymm0 +; X32-NEXT: vpbroadcastd %xmm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg8xi32: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %ymm0 +; X64-NEXT: vpbroadcastd %xmm0, %ymm0 ; X64-NEXT: retq %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer ret <8 x i32> %b @@ -920,12 +920,12 @@ define <4 x i32> @_inreg4xi32(<4 x i32> %a) { ; X32-LABEL: _inreg4xi32: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss %xmm0, %xmm0 +; X32-NEXT: vpbroadcastd %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: _inreg4xi32: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd %xmm0, %xmm0 ; X64-NEXT: retq %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %b @@ -966,12 +966,12 @@ define <8 x float> @splat_concat1(float %f) { ; X32-LABEL: splat_concat1: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 +; X32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: splat_concat1: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %ymm0 +; X64-NEXT: vpbroadcastd %xmm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <4 x float> undef, float %f, i32 0 %2 = insertelement <4 x float> %1, float %f, i32 1 @@ -984,12 +984,12 @@ define <8 x float> @splat_concat2(float %f) { ; X32-LABEL: splat_concat2: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 +; X32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: splat_concat2: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %ymm0 +; X64-NEXT: vpbroadcastd %xmm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <4 x float> undef, float %f, i32 0 %2 = insertelement <4 x float> %1, float %f, i32 1 @@ -1006,12 +1006,12 @@ define <4 x double> @splat_concat3(double %d) { ; X32-LABEL: splat_concat3: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; X32-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: splat_concat3: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %ymm0 +; X64-NEXT: vpbroadcastq %xmm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <2 x double> undef, double %d, i32 0 %2 = insertelement <2 x double> %1, double %d, i32 1 @@ -1022,12 +1022,12 @@ define <4 x double> @splat_concat4(double %d) { ; X32-LABEL: splat_concat4: ; X32: ## %bb.0: -; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; X32-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: splat_concat4: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %ymm0 +; X64-NEXT: vpbroadcastq %xmm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <2 x double> undef, double %d, i32 0 %2 = insertelement <2 x double> %1, double %d, i32 1 @@ -1042,17 +1042,17 @@ ; X32-AVX2: ## %bb.0: ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX2-NEXT: vbroadcastss (%ecx), %ymm0 -; X32-AVX2-NEXT: vmovups 
%ymm0, 32(%eax) -; X32-AVX2-NEXT: vmovups %ymm0, (%eax) +; X32-AVX2-NEXT: vpbroadcastd (%ecx), %ymm0 +; X32-AVX2-NEXT: vmovdqu %ymm0, 32(%eax) +; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: broadcast_v16i32: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vbroadcastss (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovups %ymm0, 32(%rsi) -; X64-AVX2-NEXT: vmovups %ymm0, (%rsi) +; X64-AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rsi) +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1060,15 +1060,15 @@ ; X32-AVX512VL: ## %bb.0: ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512VL-NEXT: vbroadcastss (%ecx), %zmm0 -; X32-AVX512VL-NEXT: vmovups %zmm0, (%eax) +; X32-AVX512VL-NEXT: vpbroadcastd (%ecx), %zmm0 +; X32-AVX512VL-NEXT: vmovdqu64 %zmm0, (%eax) ; X32-AVX512VL-NEXT: vzeroupper ; X32-AVX512VL-NEXT: retl ; ; X64-AVX512VL-LABEL: broadcast_v16i32: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vbroadcastss (%rdi), %zmm0 -; X64-AVX512VL-NEXT: vmovups %zmm0, (%rsi) +; X64-AVX512VL-NEXT: vpbroadcastd (%rdi), %zmm0 +; X64-AVX512VL-NEXT: vmovdqu64 %zmm0, (%rsi) ; X64-AVX512VL-NEXT: vzeroupper ; X64-AVX512VL-NEXT: retq %1 = load i32, i32* %a, align 4 @@ -1093,20 +1093,20 @@ ; X32-NEXT: subl $60, %esp ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %xmm0, (%esp) ; X32-NEXT: vpbroadcastb (%eax), %xmm1 -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; ; X64-LABEL: isel_crash_16b: ; X64: ## %bb.0: ## %eintry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vpbroadcastb (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq eintry: @@ -1135,10 +1135,10 @@ ; X32-NEXT: andl $-32, %esp ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %ymm0, (%esp) ; X32-NEXT: vpbroadcastb (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -1154,10 +1154,10 @@ ; X64-NEXT: .cfi_def_cfa_register %rbp ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %ymm0, (%rsp) ; X64-NEXT: vpbroadcastb (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp @@ -1184,20 +1184,20 @@ ; X32-NEXT: subl $60, %esp ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %xmm0, (%esp) ; X32-NEXT: vpbroadcastw (%eax), %xmm1 -; 
X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; ; X64-LABEL: isel_crash_8w: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vpbroadcastw (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq entry: @@ -1226,10 +1226,10 @@ ; X32-NEXT: andl $-32, %esp ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %ymm0, (%esp) ; X32-NEXT: vpbroadcastw (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -1245,10 +1245,10 @@ ; X64-NEXT: .cfi_def_cfa_register %rbp ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %ymm0, (%rsp) ; X64-NEXT: vpbroadcastw (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp @@ -1275,21 +1275,21 @@ ; X32-NEXT: subl $60, %esp ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) -; X32-NEXT: vbroadcastss (%eax), %xmm1 -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %xmm0, (%esp) +; X32-NEXT: vpbroadcastd (%eax), %xmm1 +; X32-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; ; X64-LABEL: isel_crash_4d: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vbroadcastss (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vpbroadcastd (%rdi), %xmm1 +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 @@ -1317,11 +1317,11 @@ ; X32-NEXT: andl $-32, %esp ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) -; X32-NEXT: vbroadcastss (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %ymm0, (%esp) +; X32-NEXT: vpbroadcastd (%eax), %ymm1 +; X32-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: vzeroupper @@ -1336,11 +1336,11 @@ ; X64-NEXT: .cfi_def_cfa_register %rbp ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vbroadcastss (%rdi), %ymm1 -; X64-NEXT: 
vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %ymm0, (%rsp) +; X64-NEXT: vpbroadcastd (%rdi), %ymm1 +; X64-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp ; X64-NEXT: vzeroupper @@ -1366,21 +1366,21 @@ ; X32-NEXT: subl $60, %esp ; X32-NEXT: .cfi_def_cfa_offset 64 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %xmm0, (%esp) -; X32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %xmm0, (%esp) +; X32-NEXT: vpbroadcastq (%eax), %xmm1 +; X32-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; ; X64-LABEL: isel_crash_2q: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vpbroadcastq (%rdi), %xmm1 +; X64-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 @@ -1407,11 +1407,11 @@ ; X32-NEXT: andl $-32, %esp ; X32-NEXT: subl $128, %esp ; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-NEXT: vmovaps %ymm0, (%esp) -; X32-NEXT: vbroadcastsd (%eax), %ymm1 -; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) +; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovdqa %ymm0, (%esp) +; X32-NEXT: vpbroadcastq (%eax), %ymm1 +; X32-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: vzeroupper @@ -1426,11 +1426,11 @@ ; X64-NEXT: .cfi_def_cfa_register %rbp ; X64-NEXT: andq $-32, %rsp ; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqa %ymm0, (%rsp) +; X64-NEXT: vpbroadcastq (%rdi), %ymm1 +; X64-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp ; X64-NEXT: popq %rbp ; X64-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll --- a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll @@ -271,16 +271,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-NEXT: vmovaps %ymm1, (%eax) +; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vmovdqa %ymm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: vmovaps %ymm1, (%rsi) +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vbroadcasti128 {{.*#+}} 
ymm0 = mem[0,1,0,1] +; X64-NEXT: vmovdqa %ymm1, (%rsi) ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx2-vperm.ll b/llvm/test/CodeGen/X86/avx2-vperm.ll --- a/llvm/test/CodeGen/X86/avx2-vperm.ll +++ b/llvm/test/CodeGen/X86/avx2-vperm.ll @@ -5,14 +5,14 @@ define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone { ; X32-LABEL: perm_cl_int_8x32: ; X32: # %bb.0: # %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] -; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] +; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: perm_cl_int_8x32: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0] +; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq entry: %B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> @@ -23,14 +23,14 @@ define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone { ; X32-LABEL: perm_cl_fp_8x32: ; X32: # %bb.0: # %entry -; X32-NEXT: vmovaps {{.*#+}} ymm1 = -; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X32-NEXT: vmovdqa {{.*#+}} ymm1 = +; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: perm_cl_fp_8x32: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vmovdqa {{.*#+}} ymm1 = +; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq entry: %B = shufflevector <8 x float> %A, <8 x float> undef, <8 x i32> @@ -40,12 +40,12 @@ define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone { ; X32-LABEL: perm_cl_int_4x64: ; X32: # %bb.0: # %entry -; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1] ; X32-NEXT: retl ; ; X64-LABEL: perm_cl_int_4x64: ; X64: # %bb.0: # %entry -; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1] ; X64-NEXT: retq entry: %B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> @@ -55,12 +55,12 @@ define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone { ; X32-LABEL: perm_cl_fp_4x64: ; X32: # %bb.0: # %entry -; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] +; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1] ; X32-NEXT: retl ; ; X64-LABEL: perm_cl_fp_4x64: ; X64: # %bb.0: # %entry -; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] +; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1] ; X64-NEXT: retq entry: %B = shufflevector <4 x double> %A, <4 x double> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -664,59 +664,19 @@ } define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { -; AVX512F-LABEL: orq_broadcast: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: orq_broadcast: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: orq_broadcast: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: orq_broadcast: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; SKX-LABEL: orq_broadcast: -; SKX: # %bb.0: -; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, 
%zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: orq_broadcast: +; CHECK: # %bb.0: +; CHECK-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = or <8 x i64> %a, ret <8 x i64> %b } define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { -; AVX512F-LABEL: andd512fold: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpandd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: andd512fold: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpandd (%rdi), %zmm0, %zmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: andd512fold: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpandd (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: andd512fold: -; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vandps (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; SKX-LABEL: andd512fold: -; SKX: # %bb.0: # %entry -; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: andd512fold: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpandd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %a = load <16 x i32>, <16 x i32>* %x, align 4 %b = and <16 x i32> %y, %a @@ -724,30 +684,10 @@ } define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { -; AVX512F-LABEL: andqbrst: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: andqbrst: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: andqbrst: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: andqbrst: -; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; SKX-LABEL: andqbrst: -; SKX: # %bb.0: # %entry -; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: andqbrst: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %a = load i64, i64* %ap, align 8 %b = insertelement <8 x i64> undef, i64 %a, i32 0 @@ -1031,30 +971,10 @@ } define <16 x float> @test_fxor(<16 x float> %a) { -; AVX512F-LABEL: test_fxor: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: test_fxor: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: test_fxor: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: test_fxor: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; SKX-LABEL: test_fxor: -; SKX: # %bb.0: -; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: test_fxor: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq %res = fsub <16 x float> , %a ret <16 x float>%res @@ -1063,8 +983,8 @@ define <8 x float> @test_fxor_8f32(<8 x float> %a) { ; AVX512F-LABEL: test_fxor_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512F-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_fxor_8f32: @@ -1074,49 +994,29 @@ ; ; AVX512BW-LABEL: test_fxor_8f32: ; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512BW-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: test_fxor_8f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512DQ-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq ; ; SKX-LABEL: test_fxor_8f32: ; SKX: # %bb.0: -; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; SKX-NEXT: vpxord {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; SKX-NEXT: retq %res = fsub <8 x float> , %a ret <8 x float>%res } define <8 x double> @fabs_v8f64(<8 x double> %p) -; AVX512F-LABEL: fabs_v8f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fabs_v8f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: fabs_v8f64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: fabs_v8f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; SKX-LABEL: fabs_v8f64: -; SKX: # %bb.0: -; SKX-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: fabs_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq { %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p) ret <8 x double> %t @@ -1124,30 +1024,10 @@ declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p) define <16 x float> @fabs_v16f32(<16 x float> %p) -; AVX512F-LABEL: fabs_v16f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fabs_v16f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: fabs_v16f32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: fabs_v16f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; SKX-LABEL: fabs_v16f32: -; SKX: # %bb.0: -; SKX-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: fabs_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq { %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p) ret <16 x float> %t diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -449,12 +449,12 @@ ; CHECK-LABEL: bcast_unfold_or_v8i32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB13_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; 
CHECK-NEXT: vpor 4096(%rdi,%rax), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB13_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -483,12 +483,12 @@ ; CHECK-LABEL: bcast_unfold_or_v4i32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB14_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpor 4096(%rdi,%rax), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB14_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -550,12 +550,12 @@ ; CHECK-LABEL: bcast_unfold_or_v4i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [3,3,3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB16_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpor 8192(%rdi,%rax), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB16_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -584,12 +584,12 @@ ; CHECK-LABEL: bcast_unfold_or_v2i64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [3,3] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB17_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vorps 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpor 8192(%rdi,%rax), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB17_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -651,12 +651,12 @@ ; CHECK-LABEL: bcast_unfold_fneg_v8f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB19_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpxor 4096(%rdi,%rax), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB19_1 ; CHECK-NEXT: # %bb.2: # %bb9 @@ -685,12 +685,12 @@ ; CHECK-LABEL: bcast_unfold_fneg_v4f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB20_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpxor 4096(%rdi,%rax), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB20_1 ; CHECK-NEXT: # %bb.2: # %bb9 @@ -752,12 +752,12 @@ ; CHECK-LABEL: bcast_unfold_fneg_v4f64: ; CHECK: 
# %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB22_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpxor 8192(%rdi,%rax), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB22_1 ; CHECK-NEXT: # %bb.2: # %bb9 @@ -786,12 +786,12 @@ ; CHECK-LABEL: bcast_unfold_fneg_v2f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB23_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vxorps 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpxor 8192(%rdi,%rax), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB23_1 ; CHECK-NEXT: # %bb.2: # %bb9 @@ -856,12 +856,12 @@ ; CHECK-LABEL: bcast_unfold_fabs_v8f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB25_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpand 4096(%rdi,%rax), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB25_1 ; CHECK-NEXT: # %bb.2: # %bb9 @@ -893,12 +893,12 @@ ; CHECK-LABEL: bcast_unfold_fabs_v4f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB26_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpand 4096(%rdi,%rax), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB26_1 ; CHECK-NEXT: # %bb.2: # %bb9 @@ -966,12 +966,12 @@ ; CHECK-LABEL: bcast_unfold_fabs_v4f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB28_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 8192(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpand 8192(%rdi,%rax), %ymm0, %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB28_1 ; CHECK-NEXT: # %bb.2: # %bb9 @@ -1003,12 +1003,12 @@ ; CHECK-LABEL: bcast_unfold_fabs_v2f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [NaN,NaN] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB29_1: # %bb1 ; CHECK-NEXT: # =>This 
Inner Loop Header: Depth=1 -; CHECK-NEXT: vandps 8192(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpand 8192(%rdi,%rax), %xmm0, %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB29_1 ; CHECK-NEXT: # %bb.2: # %bb9 diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll --- a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll @@ -19,7 +19,7 @@ ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 -; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ; CHECK-NEXT: retq allocas: %ptr_cast_for_load = bitcast float* %aa to <16 x float>* diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll --- a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll @@ -9,15 +9,15 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $112, %rsp ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: vmovups (%rdi), %zmm0 -; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill -; CHECK-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, (%rsp) ## 64-byte Spill +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; CHECK-NEXT: vmovdqa64 %zmm1, (%rdi) ; CHECK-NEXT: callq _Print__512 -; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload +; CHECK-NEXT: vmovdqu64 (%rsp), %zmm0 ## 64-byte Reload ; CHECK-NEXT: callq _Print__512 -; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; CHECK-NEXT: vmovaps %zmm0, (%rbx) +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rbx) ; CHECK-NEXT: addq $112, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -7,17 +7,17 @@ define <16 x i1> @test1() { ; ALL_X64-LABEL: test1: ; ALL_X64: ## %bb.0: -; ALL_X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; ALL_X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test1: ; KNL_X32: ## %bb.0: -; KNL_X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL_X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; KNL_X32-NEXT: retl ; ; FASTISEL-LABEL: test1: ; FASTISEL: ## %bb.0: -; FASTISEL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; FASTISEL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; FASTISEL-NEXT: retq ret <16 x i1> zeroinitializer } @@ -25,12 +25,12 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; ALL_X64-LABEL: test2: ; ALL_X64: ## %bb.0: -; ALL_X64-NEXT: vandps %xmm1, %xmm0, %xmm0 +; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test2: ; KNL_X32: ## %bb.0: -; KNL_X32-NEXT: vandps %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL_X32-NEXT: retl ; ; FASTISEL-LABEL: test2: @@ -49,12 +49,12 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; ALL_X64-LABEL: test3: ; ALL_X64: ## %bb.0: -; ALL_X64-NEXT: vandps %xmm1, %xmm0, %xmm0 +; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test3: ; KNL_X32: ## %bb.0: -; KNL_X32-NEXT: vandps %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL_X32-NEXT: 
retl ; ; FASTISEL-LABEL: test3: @@ -73,12 +73,12 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) { ; ALL_X64-LABEL: test4: ; ALL_X64: ## %bb.0: -; ALL_X64-NEXT: vandps %xmm1, %xmm0, %xmm0 +; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test4: ; KNL_X32: ## %bb.0: -; KNL_X32-NEXT: vandps %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL_X32-NEXT: retl ; ; FASTISEL-LABEL: test4: @@ -274,7 +274,7 @@ ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL-NEXT: callq _func8xi1 -; KNL-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; @@ -298,7 +298,7 @@ ; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 ; KNL_X32-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 ; KNL_X32-NEXT: calll _func8xi1 -; KNL_X32-NEXT: vandps LCPI7_0, %xmm0, %xmm0 +; KNL_X32-NEXT: vpand LCPI7_0, %xmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl ; @@ -325,7 +325,7 @@ ; ALL_X64-NEXT: testb $1, %dil ; ALL_X64-NEXT: jne LBB8_2 ; ALL_X64-NEXT: ## %bb.1: -; ALL_X64-NEXT: vmovaps %xmm1, %xmm0 +; ALL_X64-NEXT: vmovdqa %xmm1, %xmm0 ; ALL_X64-NEXT: LBB8_2: ; ALL_X64-NEXT: retq ; @@ -334,7 +334,7 @@ ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: jne LBB8_2 ; KNL_X32-NEXT: ## %bb.1: -; KNL_X32-NEXT: vmovaps %xmm1, %xmm0 +; KNL_X32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_X32-NEXT: LBB8_2: ; KNL_X32-NEXT: retl ; @@ -343,7 +343,7 @@ ; FASTISEL-NEXT: testb $1, %dil ; FASTISEL-NEXT: jne LBB8_2 ; FASTISEL-NEXT: ## %bb.1: -; FASTISEL-NEXT: vmovaps %xmm1, %xmm0 +; FASTISEL-NEXT: vmovdqa %xmm1, %xmm0 ; FASTISEL-NEXT: LBB8_2: ; FASTISEL-NEXT: retq %res = select i1 %cond, <16 x i8> %a1, <16 x i8> %a2 @@ -557,9 +557,9 @@ ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: .cfi_offset %rbx, -16 ; KNL-NEXT: movq %rdi, %rbx -; KNL-NEXT: vmovaps (%rdi), %zmm0 +; KNL-NEXT: vmovdqa64 (%rdi), %zmm0 ; KNL-NEXT: callq _test14_callee -; KNL-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEXT: vmovdqa64 %zmm0, (%rbx) ; KNL-NEXT: popq %rbx ; KNL-NEXT: retq ; @@ -569,9 +569,9 @@ ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: .cfi_offset %rbx, -16 ; SKX-NEXT: movq %rdi, %rbx -; SKX-NEXT: vmovaps (%rdi), %zmm0 +; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 ; SKX-NEXT: callq _test14_callee -; SKX-NEXT: vmovaps %zmm0, (%rbx) +; SKX-NEXT: vmovdqa64 %zmm0, (%rbx) ; SKX-NEXT: popq %rbx ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -584,9 +584,9 @@ ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: .cfi_offset %esi, -8 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; KNL_X32-NEXT: vmovaps (%esi), %zmm0 +; KNL_X32-NEXT: vmovdqa64 (%esi), %zmm0 ; KNL_X32-NEXT: calll _test14_callee -; KNL_X32-NEXT: vmovaps %zmm0, (%esi) +; KNL_X32-NEXT: vmovdqa64 %zmm0, (%esi) ; KNL_X32-NEXT: addl $8, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: retl @@ -597,9 +597,9 @@ ; FASTISEL-NEXT: .cfi_def_cfa_offset 16 ; FASTISEL-NEXT: .cfi_offset %rbx, -16 ; FASTISEL-NEXT: movq %rdi, %rbx -; FASTISEL-NEXT: vmovaps (%rdi), %zmm0 +; FASTISEL-NEXT: vmovdqa64 (%rdi), %zmm0 ; FASTISEL-NEXT: callq _test14_callee -; FASTISEL-NEXT: vmovaps %zmm0, (%rbx) +; FASTISEL-NEXT: vmovdqa64 %zmm0, (%rbx) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: vzeroupper ; FASTISEL-NEXT: retq @@ -617,9 +617,9 @@ ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: .cfi_offset %rbx, -16 ; KNL-NEXT: movq %rdi, %rbx -; KNL-NEXT: vmovaps (%rdi), %zmm0 +; KNL-NEXT: vmovdqa64 (%rdi), %zmm0 ; KNL-NEXT: callq _test15_callee -; KNL-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEXT: vmovdqa64 
%zmm0, (%rbx) ; KNL-NEXT: popq %rbx ; KNL-NEXT: retq ; @@ -629,9 +629,9 @@ ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: .cfi_offset %rbx, -16 ; SKX-NEXT: movq %rdi, %rbx -; SKX-NEXT: vmovaps (%rdi), %zmm0 +; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 ; SKX-NEXT: callq _test15_callee -; SKX-NEXT: vmovaps %zmm0, (%rbx) +; SKX-NEXT: vmovdqa64 %zmm0, (%rbx) ; SKX-NEXT: popq %rbx ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -644,9 +644,9 @@ ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: .cfi_offset %esi, -8 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; KNL_X32-NEXT: vmovaps (%esi), %zmm0 +; KNL_X32-NEXT: vmovdqa64 (%esi), %zmm0 ; KNL_X32-NEXT: calll _test15_callee -; KNL_X32-NEXT: vmovaps %zmm0, (%esi) +; KNL_X32-NEXT: vmovdqa64 %zmm0, (%esi) ; KNL_X32-NEXT: addl $8, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: retl @@ -657,9 +657,9 @@ ; FASTISEL-NEXT: .cfi_def_cfa_offset 16 ; FASTISEL-NEXT: .cfi_offset %rbx, -16 ; FASTISEL-NEXT: movq %rdi, %rbx -; FASTISEL-NEXT: vmovaps (%rdi), %zmm0 +; FASTISEL-NEXT: vmovdqa64 (%rdi), %zmm0 ; FASTISEL-NEXT: callq _test15_callee -; FASTISEL-NEXT: vmovaps %zmm0, (%rbx) +; FASTISEL-NEXT: vmovdqa64 %zmm0, (%rbx) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: vzeroupper ; FASTISEL-NEXT: retq @@ -3558,8 +3558,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovdqa %xmm8, (%rsp) ; KNL-NEXT: callq _v2i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3568,8 +3568,8 @@ ; SKX: ## %bb.0: ; SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovdqa %xmm8, (%rsp) ; SKX-NEXT: callq _v2i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3584,16 +3584,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovdqa64 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovdqa64 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovdqa64 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovdqa 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovdqa %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovdqa64 %zmm4, (%esp) ; KNL_X32-NEXT: calll _v2i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3621,8 +3621,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovdqa %xmm8, (%rsp) ; KNL-NEXT: callq _v4i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3631,8 +3631,8 @@ ; SKX: ## %bb.0: ; SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; 
SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovdqa %xmm8, (%rsp) ; SKX-NEXT: callq _v4i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3647,16 +3647,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovdqa64 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovdqa64 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovdqa64 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovdqa 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovdqa %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovdqa64 %zmm4, (%esp) ; KNL_X32-NEXT: calll _v4i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3684,8 +3684,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovdqa %xmm8, (%rsp) ; KNL-NEXT: callq _v8i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3694,8 +3694,8 @@ ; SKX: ## %bb.0: ; SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovdqa %xmm8, (%rsp) ; SKX-NEXT: callq _v8i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3710,16 +3710,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovdqa64 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovdqa64 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovdqa64 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovdqa 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovdqa %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovdqa64 %zmm4, (%esp) ; KNL_X32-NEXT: calll _v8i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3747,8 +3747,8 @@ ; KNL: ## %bb.0: ; KNL-NEXT: subq $24, %rsp ; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovdqa %xmm8, (%rsp) ; KNL-NEXT: callq _v16i1_mem_callee ; KNL-NEXT: addq $24, %rsp ; KNL-NEXT: retq @@ -3757,8 +3757,8 @@ ; SKX: ## %bb.0: ; 
SKX-NEXT: subq $24, %rsp ; SKX-NEXT: .cfi_def_cfa_offset 32 -; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 -; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovdqa %xmm8, (%rsp) ; SKX-NEXT: callq _v16i1_mem_callee ; SKX-NEXT: addq $24, %rsp ; SKX-NEXT: vzeroupper @@ -3773,16 +3773,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 -; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovdqa64 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovdqa64 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovdqa64 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovdqa 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovdqa %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovdqa64 %zmm4, (%esp) ; KNL_X32-NEXT: calll _v16i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -3815,8 +3815,8 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: vmovaps 16(%rbp), %ymm8 -; KNL-NEXT: vmovaps %ymm8, (%rsp) +; KNL-NEXT: vmovdqa 16(%rbp), %ymm8 +; KNL-NEXT: vmovdqa %ymm8, (%rsp) ; KNL-NEXT: callq _v32i1_mem_callee ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp @@ -3831,8 +3831,8 @@ ; SKX-NEXT: .cfi_def_cfa_register %rbp ; SKX-NEXT: andq $-32, %rsp ; SKX-NEXT: subq $64, %rsp -; SKX-NEXT: vmovaps 16(%rbp), %ymm8 -; SKX-NEXT: vmovaps %ymm8, (%rsp) +; SKX-NEXT: vmovdqa 16(%rbp), %ymm8 +; SKX-NEXT: vmovdqa %ymm8, (%rsp) ; SKX-NEXT: callq _v32i1_mem_callee ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -3848,16 +3848,16 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 -; KNL_X32-NEXT: vmovaps 264(%ebp), %ymm4 -; KNL_X32-NEXT: vmovaps %ymm4, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovdqa64 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovdqa64 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovdqa64 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovdqa 264(%ebp), %ymm4 +; KNL_X32-NEXT: vmovdqa %ymm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovdqa64 %zmm4, (%esp) ; KNL_X32-NEXT: calll _v32i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp @@ -4020,8 +4020,8 @@ ; SKX-NEXT: .cfi_def_cfa_register %rbp ; SKX-NEXT: andq $-64, %rsp ; SKX-NEXT: subq $128, %rsp -; SKX-NEXT: vmovaps 16(%rbp), %zmm8 -; SKX-NEXT: vmovaps %zmm8, (%rsp) +; SKX-NEXT: vmovdqa64 16(%rbp), %zmm8 +; SKX-NEXT: vmovdqa64 %zmm8, (%rsp) ; 
SKX-NEXT: callq _v64i1_mem_callee ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -4037,10 +4037,10 @@ ; KNL_X32-NEXT: .cfi_def_cfa_register %ebp ; KNL_X32-NEXT: andl $-64, %esp ; KNL_X32-NEXT: subl $576, %esp ## imm = 0x240 -; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 -; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 -; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 -; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovdqa64 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovdqa64 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovdqa64 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovdqa64 200(%ebp), %zmm7 ; KNL_X32-NEXT: movl 516(%ebp), %eax ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl 512(%ebp), %eax @@ -4169,10 +4169,10 @@ ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl 264(%ebp), %eax ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) -; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovdqa64 %zmm4, (%esp) ; KNL_X32-NEXT: calll _v64i1_mem_callee ; KNL_X32-NEXT: movl %ebp, %esp ; KNL_X32-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -170,7 +170,7 @@ ; ; DQNOVL-LABEL: slto4f32_mem: ; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vmovups (%rdi), %ymm0 +; DQNOVL-NEXT: vmovdqu (%rdi), %ymm0 ; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; DQNOVL-NEXT: vzeroupper @@ -674,7 +674,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0 ; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %a = fptrunc <16 x double> %b to <16 x float> ret <16 x float> %a @@ -789,7 +789,7 @@ ; VL: # %bb.0: ; VL-NEXT: vcmpltpd %ymm1, %ymm0, %k1 ; VL-NEXT: vcvtps2pd (%rdi), %ymm2 {%k1} -; VL-NEXT: vmovaps %ymm2, %ymm0 +; VL-NEXT: vmovdqa %ymm2, %ymm0 ; VL-NEXT: retq %b = load <4 x float>, <4 x float>* %p %a = fpext <4 x float> %b to <4 x double> @@ -948,9 +948,9 @@ ; ALL-LABEL: uito16f64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 ; ALL-NEXT: retq %b = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %b @@ -1062,14 +1062,14 @@ ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; VLDQ-NEXT: vcvtqq2ps %zmm1, %ymm1 -; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; VLDQ-NEXT: retq ; ; DQNOVL-LABEL: slto16f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 ; DQNOVL-NEXT: vcvtqq2ps %zmm1, %ymm1 -; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; DQNOVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; DQNOVL-NEXT: retq %b = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %b @@ -1297,14 +1297,14 @@ ; VLDQ: # %bb.0: ; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; VLDQ-NEXT: vcvtuqq2ps %zmm1, %ymm1 -; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; VLDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; VLDQ-NEXT: retq ; ; DQNOVL-LABEL: ulto16f32: ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: vcvtuqq2ps %zmm0, 
%ymm0 ; DQNOVL-NEXT: vcvtuqq2ps %zmm1, %ymm1 -; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; DQNOVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; DQNOVL-NEXT: retq %b = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %b @@ -1847,9 +1847,9 @@ ; ALL-LABEL: sito16f64: ; ALL: # %bb.0: ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1 -; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 ; ALL-NEXT: retq %b = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %b @@ -2621,7 +2621,7 @@ ; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NOVLDQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; NOVLDQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -2645,7 +2645,7 @@ ; DQNOVL: # %bb.0: ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; DQNOVL-NEXT: vpmovq2m %zmm0, %k1 -; DQNOVL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; DQNOVL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 ; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} ; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector.ll @@ -5,7 +5,7 @@ define <8 x i16> @extract_subvector128_v32i16(<32 x i16> %x) nounwind { ; SKX-LABEL: extract_subvector128_v32i16: ; SKX: ## %bb.0: -; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> @@ -25,7 +25,7 @@ define <16 x i8> @extract_subvector128_v64i8(<64 x i8> %x) nounwind { ; SKX-LABEL: extract_subvector128_v64i8: ; SKX: ## %bb.0: -; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> @@ -46,7 +46,7 @@ define <16 x i16> @extract_subvector256_v32i16(<32 x i16> %x) nounwind { ; SKX-LABEL: extract_subvector256_v32i16: ; SKX: ## %bb.0: -; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; SKX-NEXT: retq %r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <16 x i32> ret <16 x i16> %r1 @@ -55,7 +55,7 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind { ; SKX-LABEL: extract_subvector256_v64i8: ; SKX: ## %bb.0: -; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; SKX-NEXT: retq %r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <32 x i32> ret <32 x i8> %r1 @@ -64,7 +64,7 @@ define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8f64_store: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -77,7 +77,7 @@ define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8f32_store: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: 
vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -90,7 +90,7 @@ define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4i64_store: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -103,7 +103,7 @@ define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8i32_store: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -116,7 +116,7 @@ define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v16i16_store: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -129,7 +129,7 @@ define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v32i8_store: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -142,7 +142,7 @@ define void @extract_subvector256_v4f64_store_lo(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4f64_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -155,7 +155,7 @@ define void @extract_subvector256_v4f64_store_lo_align_16(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4f64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -168,7 +168,7 @@ define void @extract_subvector256_v4f32_store_lo(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4f32_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -181,7 +181,7 @@ define void @extract_subvector256_v4f32_store_lo_align_16(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4f32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -194,7 +194,7 @@ define void @extract_subvector256_v2i64_store_lo(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v2i64_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -207,7 +207,7 @@ define void @extract_subvector256_v2i64_store_lo_align_16(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v2i64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -220,7 +220,7 @@ define void @extract_subvector256_v4i32_store_lo(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp { ; 
SKX-LABEL: extract_subvector256_v4i32_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -233,7 +233,7 @@ define void @extract_subvector256_v4i32_store_lo_align_16(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4i32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -246,7 +246,7 @@ define void @extract_subvector256_v8i16_store_lo(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8i16_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -259,7 +259,7 @@ define void @extract_subvector256_v8i16_store_lo_align_16(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8i16_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -272,7 +272,7 @@ define void @extract_subvector256_v16i8_store_lo(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v16i8_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -285,7 +285,7 @@ define void @extract_subvector256_v16i8_store_lo_align_16(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v16i8_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -298,7 +298,7 @@ define void @extract_subvector512_v2f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v2f64_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -311,7 +311,7 @@ define void @extract_subvector512_v2f64_store_lo_align_16(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v2f64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -324,7 +324,7 @@ define void @extract_subvector512_v4f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4f32_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -337,7 +337,7 @@ define void @extract_subvector512_v4f32_store_lo_align_16(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4f32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -350,7 +350,7 @@ define void @extract_subvector512_v2i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v2i64_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -363,7 +363,7 @@ define 
void @extract_subvector512_v2i64_store_lo_align_16(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v2i64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -376,7 +376,7 @@ define void @extract_subvector512_v4i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4i32_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -389,7 +389,7 @@ define void @extract_subvector512_v4i32_store_lo_align_16(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4i32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -402,7 +402,7 @@ define void @extract_subvector512_v8i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8i16_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -415,7 +415,7 @@ define void @extract_subvector512_v16i8_store_lo(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v16i8_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vmovdqu %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -428,7 +428,7 @@ define void @extract_subvector512_v16i8_store_lo_align_16(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v16i8_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vmovdqa %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -441,7 +441,7 @@ define void @extract_subvector512_v4f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4f64_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -454,7 +454,7 @@ define void @extract_subvector512_v4f64_store_lo_align_16(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -467,7 +467,7 @@ define void @extract_subvector512_v4f64_store_lo_align_32(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovdqa %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -480,7 +480,7 @@ define void @extract_subvector512_v8f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8f32_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -493,7 +493,7 @@ define void @extract_subvector512_v8f32_store_lo_align_16(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: 
vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -506,7 +506,7 @@ define void @extract_subvector512_v8f32_store_lo_align_32(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovdqa %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -519,7 +519,7 @@ define void @extract_subvector512_v4i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4i64_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -532,7 +532,7 @@ define void @extract_subvector512_v4i64_store_lo_align_16(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -545,7 +545,7 @@ define void @extract_subvector512_v4i64_store_lo_align_32(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovdqa %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -558,7 +558,7 @@ define void @extract_subvector512_v8i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8i32_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -571,7 +571,7 @@ define void @extract_subvector512_v8i32_store_lo_align_16(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -584,7 +584,7 @@ define void @extract_subvector512_v8i32_store_lo_align_32(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovdqa %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -597,7 +597,7 @@ define void @extract_subvector512_v16i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v16i16_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -610,7 +610,7 @@ define void @extract_subvector512_v16i16_store_lo_align_16(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -623,7 +623,7 @@ define void @extract_subvector512_v16i16_store_lo_align_32(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovdqa %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -636,7 +636,7 @@ define void @extract_subvector512_v32i8_store_lo(i8* nocapture 
%addr, <64 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v32i8_store_lo: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -649,7 +649,7 @@ define void @extract_subvector512_v32i8_store_lo_align_16(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_16: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vmovdqu %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: @@ -662,7 +662,7 @@ define void @extract_subvector512_v32i8_store_lo_align_32(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_32: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vmovdqa %ymm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -48,9 +48,9 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind { ; CHECK-LABEL: test4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %eee = extractelement <8 x i64> %x, i32 4 %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1 @@ -85,9 +85,9 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -104,9 +104,9 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $7, %edi -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -123,9 +123,9 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: andl $7, %edi -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -142,7 +142,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -411,9 +411,9 @@ define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) { ; CHECK-LABEL: extract_v16i32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractps $1, %xmm0, %eax -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vextractps $1, %xmm0, (%rdi) +; CHECK-NEXT: vpextrd $1, %xmm0, %eax +; 
CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrd $1, %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %r1 = extractelement <16 x i32> %x, i32 1 @@ -425,9 +425,9 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) { ; CHECK-LABEL: extract_v8i32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractps $1, %xmm0, %eax -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vextractps $1, %xmm0, (%rdi) +; CHECK-NEXT: vpextrd $1, %xmm0, %eax +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpextrd $1, %xmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %r1 = extractelement <8 x i32> %x, i32 1 @@ -439,8 +439,8 @@ define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) { ; CHECK-LABEL: extract_v4i32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractps $1, %xmm0, %eax -; CHECK-NEXT: vextractps $3, %xmm0, (%rdi) +; CHECK-NEXT: vpextrd $1, %xmm0, %eax +; CHECK-NEXT: vpextrd $3, %xmm0, (%rdi) ; CHECK-NEXT: retq %r1 = extractelement <4 x i32> %x, i32 1 %r2 = extractelement <4 x i32> %x, i32 3 @@ -721,8 +721,8 @@ define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) { ; CHECK-LABEL: test_insert_128_v8f64: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %r = insertelement <8 x double> %x, double %y, i32 1 ret <8 x double> %r @@ -1073,7 +1073,7 @@ ; CHECK-LABEL: test_extractelement_variable_v2i64: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: movq -24(%rsp,%rdi,8), %rax ; CHECK-NEXT: retq @@ -1092,7 +1092,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: andl $3, %edi ; CHECK-NEXT: movq (%rsp,%rdi,8), %rax ; CHECK-NEXT: movq %rbp, %rsp @@ -1114,7 +1114,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: movq (%rsp,%rdi,8), %rax ; CHECK-NEXT: movq %rbp, %rsp @@ -1129,9 +1129,9 @@ ; CHECK-LABEL: test_extractelement_variable_v2f64: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq %t2 = extractelement <2 x double> %t1, i32 %index ret double %t2 @@ -1148,9 +1148,9 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: andl $3, %edi -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -1170,9 +1170,9 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $7, %edi -; CHECK-NEXT: vmovsd 
{{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -1185,7 +1185,7 @@ ; CHECK-LABEL: test_extractelement_variable_v4i32: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $3, %edi ; CHECK-NEXT: movl -24(%rsp,%rdi,4), %eax ; CHECK-NEXT: retq @@ -1204,7 +1204,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1226,7 +1226,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1241,9 +1241,9 @@ ; CHECK-LABEL: test_extractelement_variable_v4f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $3, %edi -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq %t2 = extractelement <4 x float> %t1, i32 %index ret float %t2 @@ -1260,9 +1260,9 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: andl $7, %edi -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -1282,9 +1282,9 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -1297,7 +1297,7 @@ ; CHECK-LABEL: test_extractelement_variable_v8i16: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $7, %edi ; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; CHECK-NEXT: retq @@ -1316,7 +1316,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1338,7 +1338,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $31, %edi ; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax ; CHECK-NEXT: movq %rbp, %rsp @@ -1353,7 +1353,7 @@ ; CHECK-LABEL: test_extractelement_variable_v16i8: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: 
vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movb -24(%rsp,%rdi), %al ; CHECK-NEXT: retq @@ -1372,7 +1372,7 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: andl $31, %edi ; CHECK-NEXT: movb (%rsp,%rdi), %al ; CHECK-NEXT: movq %rbp, %rsp @@ -1395,7 +1395,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: andl $63, %edi ; CHECK-NEXT: movb (%rsp,%rdi), %al ; CHECK-NEXT: movq %rbp, %rsp @@ -1418,7 +1418,7 @@ ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp ; CHECK-NEXT: addb %dil, %dil -; CHECK-NEXT: vmovaps %zmm0, (%rsp) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: andl $63, %eax ; CHECK-NEXT: movb (%rsp,%rax), %al @@ -1609,8 +1609,8 @@ define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind { ; CHECK-LABEL: insert_double_zero: ; CHECK: ## %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf32x4 $2, %xmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> %d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll --- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll @@ -190,44 +190,44 @@ ; WIN64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; WIN64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; WIN64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-KNL-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-KNL-NEXT: vmovdqa64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-KNL-NEXT: andq $-64, %rsp -; WIN64-KNL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) -; WIN64-KNL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; WIN64-KNL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; WIN64-KNL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-KNL-NEXT: callq func_float16 -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; WIN64-KNL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 
64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-KNL-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; WIN64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -245,44 +245,44 @@ ; WIN64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; WIN64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; WIN64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; WIN64-SKX-NEXT: vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; WIN64-SKX-NEXT: vmovdqa64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-SKX-NEXT: andq $-64, %rsp -; WIN64-SKX-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) -; WIN64-SKX-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; WIN64-SKX-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp) +; WIN64-SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-SKX-NEXT: callq func_float16 -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; WIN64-SKX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; WIN64-SKX-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload ; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; WIN64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload @@ -299,39 +299,39 @@ ; X64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; X64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; X64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; X64-KNL-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) 
## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-KNL-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-KNL-NEXT: vmovdqu64 %zmm16, (%rsp) ## 64-byte Spill ; X64-KNL-NEXT: callq _func_float16 -; X64-KNL-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload -; X64-KNL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 (%rsp), %zmm16 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; X64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload ; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload ; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; X64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload @@ -348,39 +348,39 @@ ; X64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-SKX-NEXT: vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill -; X64-SKX-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; X64-SKX-NEXT: vmovdqu64 %zmm16, (%rsp) ## 64-byte Spill ; X64-SKX-NEXT: callq _func_float16 -; X64-SKX-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload -; X64-SKX-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload -; X64-SKX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 (%rsp), %zmm16 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; X64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload ; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload ; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload ; X64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload @@ -417,7 +417,7 @@ ; WIN64-KNL-NEXT: .seh_stackalloc 40 ; WIN64-KNL-NEXT: .seh_endprologue ; WIN64-KNL-NEXT: # kill: def $dx killed $dx def $edx -; WIN64-KNL-NEXT: vmovaps (%rcx), %zmm0 +; WIN64-KNL-NEXT: vmovdqa64 (%rcx), %zmm0 ; WIN64-KNL-NEXT: kmovw %edx, %k1 ; WIN64-KNL-NEXT: callq func_float16_mask ; WIN64-KNL-NEXT: nop @@ -431,7 +431,7 @@ ; WIN64-SKX-NEXT: .seh_stackalloc 40 ; WIN64-SKX-NEXT: .seh_endprologue ; WIN64-SKX-NEXT: # kill: def $dx killed $dx def $edx -; WIN64-SKX-NEXT: vmovaps (%rcx), %zmm0 +; WIN64-SKX-NEXT: vmovdqa64 (%rcx), %zmm0 ; WIN64-SKX-NEXT: kmovd %edx, %k1 ; WIN64-SKX-NEXT: callq func_float16_mask ; WIN64-SKX-NEXT: nop diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -604,7 +604,7 @@ define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) { ; CHECK-LABEL: 
test_mm512_broadcastd_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer @@ -658,7 +658,7 @@ define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) { ; CHECK-LABEL: test_mm512_broadcastq_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer ret <8 x i64> %res @@ -705,7 +705,7 @@ define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) { ; CHECK-LABEL: test_mm512_broadcastsd_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer ret <8 x double> %res @@ -752,7 +752,7 @@ define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) { ; CHECK-LABEL: test_mm512_broadcastss_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %res @@ -987,7 +987,7 @@ define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) { ; CHECK-LABEL: test_mm512_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> ret <16 x float> %res @@ -1034,7 +1034,7 @@ define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) { ; CHECK-LABEL: test_mm512_permutex_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> ret <8 x i64> %res @@ -1081,7 +1081,7 @@ define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) { ; CHECK-LABEL: test_mm512_permutex_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> ret <8 x double> %res @@ -1128,7 +1128,7 @@ define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) { ; CHECK-LABEL: test_mm512_shuffle_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <8 x i64> %a0 to <16 x i32> %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> @@ -1229,7 +1229,7 @@ define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: test_mm512_unpackhi_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <8 x i64> %a0 to 
<16 x i32> %arg1 = bitcast <8 x i64> %a1 to <16 x i32> @@ -1286,7 +1286,7 @@ define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: test_mm512_unpackhi_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> ret <8 x i64> %res @@ -1333,7 +1333,7 @@ define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: test_mm512_unpackhi_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> ret <8 x double> %res @@ -1380,7 +1380,7 @@ define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: test_mm512_unpackhi_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> ret <16 x float> %res @@ -1427,7 +1427,7 @@ define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: test_mm512_unpacklo_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <8 x i64> %a0 to <16 x i32> %arg1 = bitcast <8 x i64> %a1 to <16 x i32> @@ -1484,7 +1484,7 @@ define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: test_mm512_unpacklo_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> ret <8 x i64> %res @@ -1531,7 +1531,7 @@ define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: test_mm512_unpacklo_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> ret <8 x double> %res @@ -1578,7 +1578,7 @@ define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: test_mm512_unpacklo_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> ret <16 x float> %res @@ -1625,7 +1625,7 @@ define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind { ; CHECK-LABEL: test_mm512_zextpd128_pd512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> ret <8 x double> %res @@ -1634,7 +1634,7 @@ define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind { ; CHECK-LABEL: test_mm512_zextpd256_pd512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> ret <8 x double> %res @@ -1643,7 +1643,7 @@ define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind { ; CHECK-LABEL: test_mm512_zextps128_ps512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> ret <16 x float> %res @@ -1652,7 +1652,7 @@ define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind { ; CHECK-LABEL: test_mm512_zextps256_ps512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> ret <16 x float> %res @@ -1661,7 +1661,7 @@ define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind { ; CHECK-LABEL: test_mm512_zextsi128_si512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> ret <8 x i64> %res @@ -1670,7 +1670,7 @@ define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind { ; CHECK-LABEL: test_mm512_zextsi256_si512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> ret <8 x i64> %res @@ -1890,8 +1890,8 @@ ; X86-NEXT: fildll {{[0-9]+}}(%esp) ; X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -92,7 +92,7 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xc0] +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 ## encoding: 
[0x62,0xf2,0x7d,0x48,0x58,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> undef, i16 -1) @@ -140,7 +140,7 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xc0] +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> undef, i8 -1) @@ -190,7 +190,7 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xc0] +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) ret <16 x i32> %res @@ -235,7 +235,7 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xc0] +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) ret <8 x i64> %res @@ -434,7 +434,7 @@ define <8 x double>@test_int_x86_avx512_perm_df_512(<8 x double> %x0, <8 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_perm_df_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xc0,0x03] +; CHECK-NEXT: vpermq $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xc0,0x03] ; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) @@ -486,7 +486,7 @@ define <8 x i64>@test_int_x86_avx512_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_perm_di_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xc0,0x03] +; CHECK-NEXT: vpermq $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xc0,0x03] ; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) @@ -1024,7 +1024,7 @@ define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermilps $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x16] +; CHECK-NEXT: vpshufd $22, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xc0,0x16] ; CHECK-NEXT: ## zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) @@ -1074,7 +1074,7 @@ define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) { ; CHECK-LABEL: 
test_int_x86_avx512_pshuf_d_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermilps $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x03] +; CHECK-NEXT: vpshufd $3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xc0,0x03] ; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) @@ -1268,7 +1268,7 @@ define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xc1] +; CHECK-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xc1] ; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) @@ -1301,7 +1301,7 @@ define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xc1] +; CHECK-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xc1] ; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) @@ -1333,7 +1333,7 @@ define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xc1] +; CHECK-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xc1] ; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) @@ -1366,7 +1366,7 @@ define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xc1] +; CHECK-NEXT: vpunpckldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xc1] ; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) @@ -1398,7 +1398,7 @@ define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xc1] +; CHECK-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xc1] ; CHECK-NEXT: ## zmm0 = 
zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1450,7 +1450,7 @@ define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xc1] +; CHECK-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xc1] ; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1483,7 +1483,7 @@ define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckhd_q_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xc1] +; CHECK-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xc1] ; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) @@ -1515,7 +1515,7 @@ define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckld_q_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xc1] +; CHECK-NEXT: vpunpckldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xc1] ; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) @@ -1824,13 +1824,13 @@ ; X86-LABEL: test_storent_q_512: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovntps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x00] +; X86-NEXT: vmovntdq %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7d,0x48,0xe7,0x00] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_storent_q_512: ; X64: ## %bb.0: -; X64-NEXT: vmovntps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x07] +; X64-NEXT: vmovntdq %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x48,0xe7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data) @@ -1843,13 +1843,13 @@ ; X86-LABEL: test_storent_pd_512: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovntps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x00] +; X86-NEXT: vmovntdq %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7d,0x48,0xe7,0x00] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_storent_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vmovntps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x07] +; 
X64-NEXT: vmovntdq %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x48,0xe7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data) @@ -1862,13 +1862,13 @@ ; X86-LABEL: test_storent_ps_512: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovntps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x00] +; X86-NEXT: vmovntdq %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7d,0x48,0xe7,0x00] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_storent_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vmovntps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x07] +; X64-NEXT: vmovntdq %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x48,0xe7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data) @@ -4999,14 +4999,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8] -; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8] -; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) ret <8 x double> %res @@ -5029,14 +5029,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8] -; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8] -; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) ret <8 x double> %res @@ -5065,14 +5065,14 @@ ; X86: ## %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x13,0xc8] -; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_512_rrk: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: 
vcvtph2ps %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x13,0xc8] -; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) ret <16 x float> %res @@ -5699,7 +5699,7 @@ define <4 x double> @test_vextractf64x4(<8 x double> %a) { ; CHECK-LABEL: test_vextractf64x4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc0,0x01] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) ret <4 x double> %res @@ -5712,7 +5712,7 @@ define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) { ; CHECK-LABEL: test_int_x86_avx512_insertf32x4_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) ret <16 x float> %res @@ -5757,7 +5757,7 @@ define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; CHECK-LABEL: test_int_x86_avx512_inserti32x4_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) ret <16 x i32> %res @@ -5802,7 +5802,7 @@ define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) { ; CHECK-LABEL: test_int_x86_avx512_insertf64x4_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) ret <8 x double> %res @@ -5849,7 +5849,7 @@ define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_inserti64x4_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) ret <8 x i64> %res @@ -6441,7 +6441,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcastf64x4_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: 
[0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) @@ -6574,7 +6574,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcasti64x4_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) @@ -7402,12 +7402,12 @@ ; X86-LABEL: test_x86_vbroadcast_ss_512: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vbroadcastss (%eax), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0x00] +; X86-NEXT: vpbroadcastd (%eax), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_x86_vbroadcast_ss_512: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss (%rdi), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0x07] +; X64-NEXT: vpbroadcastd (%rdi), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x07] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1] ret <16 x float> %res @@ -7418,12 +7418,12 @@ ; X86-LABEL: test_x86_vbroadcast_sd_512: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vbroadcastsd (%eax), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0x00] +; X86-NEXT: vpbroadcastq (%eax), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_x86_vbroadcast_sd_512: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd (%rdi), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0x07] +; X64-NEXT: vpbroadcastq (%rdi), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x07] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1] ret <8 x double> %res @@ -7435,7 +7435,7 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_df_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xc0] +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) ret <8 x double> %res @@ -7482,7 +7482,7 @@ define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_di_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xc0] +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) ret <8 x i64> %res @@ -7529,7 +7529,7 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { ; CHECK-LABEL: 
test_int_x86_avx512_permvar_sf_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) ret <16 x float> %res @@ -7574,7 +7574,7 @@ define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_si_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) ret <16 x i32> %res @@ -10406,8 +10406,8 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm: ; CHECK: ## %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; CHECK-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9] +; CHECK-NEXT: vpblendd $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x01] ; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %q = load float, float* %ptr_b diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -5199,7 +5199,7 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) { ; CHECK-LABEL: test_int_x86_avx512_permvar_df_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) ret <8 x double> %1 @@ -5250,7 +5250,7 @@ define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) { ; CHECK-LABEL: test_int_x86_avx512_permvar_di_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) ret <8 x i64> %1 @@ -5301,7 +5301,7 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) { ; CHECK-LABEL: test_int_x86_avx512_permvar_sf_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) ret <16 x float> %1 @@ -5350,7 +5350,7 @@ define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: test_int_x86_avx512_permvar_si_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) ret <16 x i32> %1 @@ -6505,8 +6505,8 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> 
%x1,float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: ret{{[l|q]}} %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -116,29 +116,19 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { -; KNL-LABEL: orq_broadcast: -; KNL: ## %bb.0: -; KNL-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: orq_broadcast: -; SKX: ## %bb.0: -; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: orq_broadcast: +; ALL: ## %bb.0: +; ALL-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; ALL-NEXT: retq %b = or <8 x i64> %a, ret <8 x i64> %b } define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { -; KNL-LABEL: andd512fold: -; KNL: ## %bb.0: ## %entry -; KNL-NEXT: vpandd (%rdi), %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: andd512fold: -; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: andd512fold: +; ALL: ## %bb.0: ## %entry +; ALL-NEXT: vpandd (%rdi), %zmm0, %zmm0 +; ALL-NEXT: retq entry: %a = load <16 x i32>, <16 x i32>* %x, align 4 %b = and <16 x i32> %y, %a @@ -146,15 +136,10 @@ } define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { -; KNL-LABEL: andqbrst: -; KNL: ## %bb.0: ## %entry -; KNL-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: andqbrst: -; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: andqbrst: +; ALL: ## %bb.0: ## %entry +; ALL-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 +; ALL-NEXT: retq entry: %a = load i64, i64* %ap, align 8 %b = insertelement <8 x i64> undef, i64 %a, i32 0 @@ -164,29 +149,19 @@ } define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) { -; KNL-LABEL: and_v64i8: -; KNL: ## %bb.0: -; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: and_v64i8: -; SKX: ## %bb.0: -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: and_v64i8: +; ALL: ## %bb.0: +; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %res = and <64 x i8> %a, %b ret <64 x i8> %res } define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { -; KNL-LABEL: andn_v64i8: -; KNL: ## %bb.0: -; KNL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: andn_v64i8: -; SKX: ## %bb.0: -; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: andn_v64i8: +; ALL: ## %bb.0: +; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: retq %b2 = xor <64 x i8> %b, @or_v64i8(<64 x i8> %a, <64 x i8> %b) { -; KNL-LABEL: or_v64i8: -; KNL: ## %bb.0: -; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: or_v64i8: -; SKX: ## %bb.0: -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: or_v64i8: +; ALL: ## %bb.0: +; ALL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %res = or <64 x i8> %a, %b ret <64 x i8> %res } define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) { -; KNL-LABEL: xor_v64i8: -; KNL: ## %bb.0: -; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: xor_v64i8: -; SKX: ## %bb.0: -; SKX-NEXT: vxorps %zmm1, 
%zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: xor_v64i8: +; ALL: ## %bb.0: +; ALL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %res = xor <64 x i8> %a, %b ret <64 x i8> %res } define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) { -; KNL-LABEL: and_v32i16: -; KNL: ## %bb.0: -; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: and_v32i16: -; SKX: ## %bb.0: -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: and_v32i16: +; ALL: ## %bb.0: +; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %res = and <32 x i16> %a, %b ret <32 x i16> %res } define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { -; KNL-LABEL: andn_v32i16: -; KNL: ## %bb.0: -; KNL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: andn_v32i16: -; SKX: ## %bb.0: -; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: andn_v32i16: +; ALL: ## %bb.0: +; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; ALL-NEXT: retq %b2 = xor <32 x i16> %b, %res = and <32 x i16> %a, %b2 @@ -254,29 +209,19 @@ } define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) { -; KNL-LABEL: or_v32i16: -; KNL: ## %bb.0: -; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: or_v32i16: -; SKX: ## %bb.0: -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: or_v32i16: +; ALL: ## %bb.0: +; ALL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %res = or <32 x i16> %a, %b ret <32 x i16> %res } define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) { -; KNL-LABEL: xor_v32i16: -; KNL: ## %bb.0: -; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: xor_v32i16: -; SKX: ## %bb.0: -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: xor_v32i16: +; ALL: ## %bb.0: +; ALL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %res = xor <32 x i16> %a, %b ret <32 x i16> %res } @@ -441,7 +386,7 @@ ; SKX-LABEL: test_mm512_mask_and_epi32: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vpandd %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %and1.i.i = and <8 x i64> %__a, %__b @@ -463,7 +408,7 @@ ; SKX-LABEL: test_mm512_mask_or_epi32: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vpord %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %or1.i.i = or <8 x i64> %__a, %__b @@ -485,7 +430,7 @@ ; SKX-LABEL: test_mm512_mask_xor_epi32: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vpxord %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %xor1.i.i = xor <8 x i64> %__a, %__b @@ -507,7 +452,7 @@ ; SKX-LABEL: test_mm512_mask_xor_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vpxorq %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -529,7 +474,7 @@ ; SKX-LABEL: test_mm512_maskz_xor_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -551,7 +496,7 @@ ; SKX-LABEL: test_mm512_mask_xor_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vpxord %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -573,7 +518,7 @@ ; SKX-LABEL: 
test_mm512_maskz_xor_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpxord %zmm1, %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -595,7 +540,7 @@ ; SKX-LABEL: test_mm512_mask_or_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -617,7 +562,7 @@ ; SKX-LABEL: test_mm512_maskz_or_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vporq %zmm0, %zmm1, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -639,7 +584,7 @@ ; SKX-LABEL: test_mm512_mask_or_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -661,7 +606,7 @@ ; SKX-LABEL: test_mm512_maskz_or_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vpord %zmm0, %zmm1, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -683,7 +628,7 @@ ; SKX-LABEL: test_mm512_mask_and_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vpandq %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -705,7 +650,7 @@ ; SKX-LABEL: test_mm512_maskz_and_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vpandq %zmm0, %zmm1, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -727,7 +672,7 @@ ; SKX-LABEL: test_mm512_mask_and_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vpandd %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -749,7 +694,7 @@ ; SKX-LABEL: test_mm512_maskz_and_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: vpandd %zmm0, %zmm1, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -771,7 +716,7 @@ ; SKX-LABEL: test_mm512_mask_andnot_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vpandnq %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -794,7 +739,7 @@ ; SKX-LABEL: test_mm512_maskz_andnot_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpandnq %zmm1, %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <8 x i64> @@ -817,7 +762,7 @@ ; SKX-LABEL: test_mm512_mask_andnot_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} +; SKX-NEXT: vpandnd %zmm2, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <16 x float> %__A to <16 x i32> @@ -840,7 +785,7 @@ ; SKX-LABEL: test_mm512_maskz_andnot_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpandnd %zmm1, %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <16 x 
float> %__A to <16 x i32> @@ -918,8 +863,8 @@ ; SKX-LABEL: ternlog_maskz_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %zmm2, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vpord %zmm1, %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq %m = icmp slt <16 x i32> %mask, zeroinitializer %a = and <16 x i32> %x, @@ -940,8 +885,8 @@ ; SKX-LABEL: ternlog_maskz_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %zmm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq %m = icmp slt <8 x i64> %mask, zeroinitializer %a = and <8 x i64> %x, @@ -962,8 +907,8 @@ ; SKX-LABEL: ternlog_maskx_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %zmm2, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm2 -; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <16 x i32> %mask, zeroinitializer %a = and <16 x i32> %x, @@ -985,9 +930,9 @@ ; SKX-LABEL: ternlog_masky_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %zmm2, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vpord %zmm1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 ; SKX-NEXT: retq %m = icmp slt <16 x i32> %mask, zeroinitializer %a = and <16 x i32> %x, @@ -1008,8 +953,8 @@ ; SKX-LABEL: ternlog_maskx_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %zmm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm2 -; SKX-NEXT: vxorpd %zmm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm2 +; SKX-NEXT: vpxorq %zmm1, %zmm2, %zmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <8 x i64> %mask, zeroinitializer %a = and <8 x i64> %x, @@ -1031,9 +976,9 @@ ; SKX-LABEL: ternlog_masky_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %zmm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm1 {%k1} -; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 ; SKX-NEXT: retq %m = icmp slt <8 x i64> %mask, zeroinitializer %a = and <8 x i64> %x, diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -979,12 +979,12 @@ define <4 x i1> @test14() { ; CHECK-LABEL: test14: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1,0,1] ; CHECK-NEXT: retq ; ; X86-LABEL: test14: ; X86: ## %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1] +; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1,0,1] ; X86-NEXT: retl %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 2 @@ -1907,29 +1907,14 @@ } define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { -; KNL-LABEL: test_build_vec_v32i1: -; KNL: ## %bb.0: -; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test_build_vec_v32i1: -; SKX: ## %bb.0: -; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: retq -; -; AVX512BW-LABEL: test_build_vec_v32i1: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; 
AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: test_build_vec_v32i1: -; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: retq +; CHECK-LABEL: test_build_vec_v32i1: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1: ; X86: ## %bb.0: -; X86-NEXT: vandps LCPI40_0, %zmm0, %zmm0 +; X86-NEXT: vpandq LCPI40_0, %zmm0, %zmm0 ; X86-NEXT: retl %ret = select <32 x i1> , <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret @@ -1957,7 +1942,7 @@ ; ; AVX512DQ-LABEL: test_build_vec_v32i1_optsize: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1_optsize: @@ -1992,7 +1977,7 @@ ; ; AVX512DQ-LABEL: test_build_vec_v32i1_pgso: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1_pgso: @@ -2006,29 +1991,14 @@ } define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { -; KNL-LABEL: test_build_vec_v64i1: -; KNL: ## %bb.0: -; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test_build_vec_v64i1: -; SKX: ## %bb.0: -; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: retq -; -; AVX512BW-LABEL: test_build_vec_v64i1: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: test_build_vec_v64i1: -; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: retq +; CHECK-LABEL: test_build_vec_v64i1: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq ; ; X86-LABEL: test_build_vec_v64i1: ; X86: ## %bb.0: -; X86-NEXT: vandps LCPI43_0, %zmm0, %zmm0 +; X86-NEXT: vpandq LCPI43_0, %zmm0, %zmm0 ; X86-NEXT: retl %ret = select <64 x i1> , <64 x i8> %x, <64 x i8> zeroinitializer ret <64 x i8> %ret @@ -4351,8 +4321,8 @@ ; X86-LABEL: store_v128i1_constant: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [4294963197,3758096251,4294959101,3221225403] -; X86-NEXT: vmovaps %xmm0, (%eax) +; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [4294963197,3758096251,4294959101,3221225403] +; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: retl entry: store <128 x i1> , <128 x i1>* %R diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll --- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -62,8 +62,8 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { ; AVX512-LABEL: one_mask_bit_set5: ; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 48(%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) diff --git a/llvm/test/CodeGen/X86/avx512-mov.ll b/llvm/test/CodeGen/X86/avx512-mov.ll --- a/llvm/test/CodeGen/X86/avx512-mov.ll +++ b/llvm/test/CodeGen/X86/avx512-mov.ll @@ -31,7 +31,7 @@ define <4 x i32> @test4(i32* %x) { ; CHECK-LABEL: test4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; CHECK-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO 
VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, i32* %x @@ -42,7 +42,7 @@ define void @test5(float %x, float* %y) { ; CHECK-LABEL: test5: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07] +; CHECK-NEXT: vmovd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] store float %x, float* %y, align 4 ret void @@ -51,7 +51,7 @@ define void @test6(double %x, double* %y) { ; CHECK-LABEL: test6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovsd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] +; CHECK-NEXT: vmovq %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] store double %x, double* %y, align 8 ret void @@ -60,7 +60,7 @@ define float @test7(i32* %x) { ; CHECK-LABEL: test7: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; CHECK-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, i32* %x @@ -89,7 +89,7 @@ define <4 x i32> @test10(i32* %x) { ; CHECK-LABEL: test10: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; CHECK-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, i32* %x, align 4 @@ -100,7 +100,7 @@ define <4 x float> @test11(float* %x) { ; CHECK-LABEL: test11: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; CHECK-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load float, float* %x, align 4 @@ -111,7 +111,7 @@ define <2 x double> @test12(double* %x) { ; CHECK-LABEL: test12: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] +; CHECK-NEXT: vmovq (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x07] ; CHECK-NEXT: ## xmm0 = mem[0],zero ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load double, double* %x, align 8 @@ -140,7 +140,7 @@ define <4 x i32> @test15(i32* %x) { ; CHECK-LABEL: test15: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; CHECK-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, i32* %x, align 4 @@ -151,7 +151,7 @@ define <16 x i32> @test16(i8 * %addr) { ; CHECK-LABEL: test16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 1 @@ -161,7 +161,7 @@ define <16 x i32> @test17(i8 * %addr) { ; CHECK-LABEL: test17: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ## encoding: 
[0x62,0xf1,0xfd,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 64 @@ -171,7 +171,7 @@ define void @test18(i8 * %addr, <8 x i64> %data) { ; CHECK-LABEL: test18: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 64 @@ -181,7 +181,7 @@ define void @test19(i8 * %addr, <16 x i32> %data) { ; CHECK-LABEL: test19: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 1 @@ -191,7 +191,7 @@ define void @test20(i8 * %addr, <16 x i32> %data) { ; CHECK-LABEL: test20: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 64 @@ -201,7 +201,7 @@ define <8 x i64> @test21(i8 * %addr) { ; CHECK-LABEL: test21: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 64 @@ -211,7 +211,7 @@ define void @test22(i8 * %addr, <8 x i64> %data) { ; CHECK-LABEL: test22: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 1 @@ -221,7 +221,7 @@ define <8 x i64> @test23(i8 * %addr) { ; CHECK-LABEL: test23: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 1 @@ -231,7 +231,7 @@ define void @test24(i8 * %addr, <8 x double> %data) { ; CHECK-LABEL: test24: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 64 @@ -241,7 +241,7 @@ define <8 x double> @test25(i8 * %addr) { ; CHECK-LABEL: test25: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 64 @@ -251,7 +251,7 @@ define void @test26(i8 * %addr, <16 x float> %data) { ; CHECK-LABEL: test26: ; CHECK: ## %bb.0: -; 
CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 64 @@ -261,7 +261,7 @@ define <16 x float> @test27(i8 * %addr) { ; CHECK-LABEL: test27: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 64 @@ -271,7 +271,7 @@ define void @test28(i8 * %addr, <8 x double> %data) { ; CHECK-LABEL: test28: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 1 @@ -281,7 +281,7 @@ define <8 x double> @test29(i8 * %addr) { ; CHECK-LABEL: test29: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 1 @@ -291,7 +291,7 @@ define void @test30(i8 * %addr, <16 x float> %data) { ; CHECK-LABEL: test30: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 1 @@ -301,7 +301,7 @@ define <16 x float> @test31(i8 * %addr) { ; CHECK-LABEL: test31: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 1 diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -98,10 +98,10 @@ ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %edi ; X32-NEXT: subl $88, %esp -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1] -; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] -; X32-NEXT: vmovups %zmm0, (%esp) +; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [2,1,2,1] +; X32-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] +; X32-NEXT: vmovdqu64 %zmm0, (%esp) ; X32-NEXT: movl $1, {{[0-9]+}}(%esp) ; X32-NEXT: movl $2, {{[0-9]+}}(%esp) ; X32-NEXT: movl $2, %eax @@ -129,9 +129,9 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $48, %rsp ; WIN64-NEXT: .seh_stackalloc 48 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 32 -; WIN64-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: 
vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 16 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movabsq $4294967298, %rax # imm = 0x100000002 @@ -148,8 +148,8 @@ ; WIN64-NEXT: movq %rax, %r15 ; WIN64-NEXT: movq %rax, %rsi ; WIN64-NEXT: callq test_argv64i1 -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $48, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -247,16 +247,16 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv64i1 ; WIN64-NEXT: kmovq %rax, %k0 ; WIN64-NEXT: vpmovm2b %k0, %zmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -285,10 +285,10 @@ ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %esp ; X32-NEXT: subl $72, %esp -; X32-NEXT: vmovups %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm4, (%esp) # 16-byte Spill ; X32-NEXT: kmovd %edx, %k0 ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: kmovd %eax, %k2 @@ -299,10 +299,10 @@ ; X32-NEXT: # kill: def $ymm1 killed $ymm1 killed $zmm1 ; X32-NEXT: # kill: def $ymm2 killed $ymm2 killed $zmm2 ; X32-NEXT: calll _test_argv32i1helper -; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload +; X32-NEXT: vmovdqu (%esp), %xmm4 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; X32-NEXT: addl $72, %esp ; X32-NEXT: popl %esp ; X32-NEXT: vzeroupper @@ -352,14 +352,14 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $128, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
LINUXOSX64-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144 @@ -379,15 +379,15 @@ ; LINUXOSX64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; LINUXOSX64-NEXT: # kill: def $ymm1 killed $ymm1 killed $zmm1 ; LINUXOSX64-NEXT: # kill: def $ymm2 killed $ymm2 killed $zmm2 -; LINUXOSX64-NEXT: callq test_argv32i1helper -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; LINUXOSX64-NEXT: callq test_argv32i1helper@PLT +; LINUXOSX64-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -417,17 +417,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movl $1, %eax ; WIN64-NEXT: movl $1, %ecx ; WIN64-NEXT: movl $1, %edx ; WIN64-NEXT: callq test_argv32i1 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -482,15 +482,15 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; 
WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv32i1 ; WIN64-NEXT: incl %eax -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -520,10 +520,10 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %esp ; X32-NEXT: subl $72, %esp -; X32-NEXT: vmovups %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm4, (%esp) # 16-byte Spill ; X32-NEXT: kmovd %edx, %k0 ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: kmovd %eax, %k2 @@ -535,10 +535,10 @@ ; X32-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 ; X32-NEXT: vzeroupper ; X32-NEXT: calll _test_argv16i1helper -; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload +; X32-NEXT: vmovdqu (%esp), %xmm4 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; X32-NEXT: addl $72, %esp ; X32-NEXT: popl %esp ; X32-NEXT: retl @@ -581,14 +581,14 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $128, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144 ; 
LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144 @@ -609,15 +609,15 @@ ; LINUXOSX64-NEXT: # kill: def $xmm1 killed $xmm1 killed $zmm1 ; LINUXOSX64-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 ; LINUXOSX64-NEXT: vzeroupper -; LINUXOSX64-NEXT: callq test_argv16i1helper -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; LINUXOSX64-NEXT: callq test_argv16i1helper@PLT +; LINUXOSX64-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -645,17 +645,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movl $1, %eax ; WIN64-NEXT: movl $1, %ecx ; WIN64-NEXT: movl $1, %edx ; WIN64-NEXT: callq test_argv16i1 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -712,17 +712,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv16i1 ; WIN64-NEXT: # kill: def $ax killed $ax def $eax ; WIN64-NEXT: incl %eax ; WIN64-NEXT: # kill: def $ax killed $ax killed $eax -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -754,10 
+754,10 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %esp ; X32-NEXT: subl $72, %esp -; X32-NEXT: vmovups %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm4, (%esp) # 16-byte Spill ; X32-NEXT: kmovd %edx, %k0 ; X32-NEXT: kmovd %ecx, %k1 ; X32-NEXT: kmovd %eax, %k2 @@ -769,10 +769,10 @@ ; X32-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 ; X32-NEXT: vzeroupper ; X32-NEXT: calll _test_argv8i1helper -; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload +; X32-NEXT: vmovdqu (%esp), %xmm4 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; X32-NEXT: addl $72, %esp ; X32-NEXT: popl %esp ; X32-NEXT: retl @@ -815,14 +815,14 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $128, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144 @@ -843,15 +843,15 @@ ; LINUXOSX64-NEXT: # kill: def $xmm1 killed $xmm1 killed $zmm1 ; LINUXOSX64-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 ; LINUXOSX64-NEXT: vzeroupper -; LINUXOSX64-NEXT: callq test_argv8i1helper -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; LINUXOSX64-NEXT: callq test_argv8i1helper@PLT +; LINUXOSX64-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -879,17 +879,17 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: movl $1, %eax ; WIN64-NEXT: movl $1, %ecx ; WIN64-NEXT: movl $1, %edx ; WIN64-NEXT: callq test_argv8i1 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi @@ -948,9 +948,9 @@ ; WIN64-NEXT: .seh_pushreg %rdi ; WIN64-NEXT: subq $40, %rsp ; WIN64-NEXT: .seh_stackalloc 40 -; WIN64-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm7, 16 -; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm6, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: callq test_retv8i1 @@ -958,8 +958,8 @@ ; WIN64-NEXT: kmovd %eax, %k0 ; WIN64-NEXT: vpmovm2w %k0, %zmm0 ; WIN64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; WIN64-NEXT: addq $40, %rsp ; WIN64-NEXT: popq %rdi ; WIN64-NEXT: popq %rsi diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -355,12 +355,12 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %esp ; X32-NEXT: subl $24, %esp -; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm4, (%esp) # 16-byte Spill ; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: vaddss %xmm4, %xmm0, %xmm0 ; X32-NEXT: calll _test_argRetFloat ; X32-NEXT: vaddss %xmm4, %xmm0, %xmm0 -; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload +; X32-NEXT: vmovdqu (%esp), %xmm4 # 16-byte Reload ; X32-NEXT: addl $24, %esp ; X32-NEXT: popl %esp ; X32-NEXT: retl @@ -371,14 +371,14 @@ ; WIN64-NEXT: 
.seh_pushreg %rsp ; WIN64-NEXT: subq $16, %rsp ; WIN64-NEXT: .seh_stackalloc 16 -; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero ; WIN64-NEXT: vaddss %xmm0, %xmm8, %xmm0 ; WIN64-NEXT: callq test_argRetFloat ; WIN64-NEXT: vaddss %xmm0, %xmm8, %xmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; WIN64-NEXT: addq $16, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: retq @@ -389,7 +389,7 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $16, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 @@ -397,7 +397,7 @@ ; LINUXOSX64-NEXT: vaddss %xmm0, %xmm8, %xmm0 ; LINUXOSX64-NEXT: callq test_argRetFloat ; LINUXOSX64-NEXT: vaddss %xmm0, %xmm8, %xmm0 -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $16, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -435,12 +435,12 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %esp ; X32-NEXT: subl $24, %esp -; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm4, (%esp) # 16-byte Spill ; X32-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero ; X32-NEXT: vaddsd %xmm4, %xmm0, %xmm0 ; X32-NEXT: calll _test_argRetDouble ; X32-NEXT: vaddsd %xmm4, %xmm0, %xmm0 -; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload +; X32-NEXT: vmovdqu (%esp), %xmm4 # 16-byte Reload ; X32-NEXT: addl $24, %esp ; X32-NEXT: popl %esp ; X32-NEXT: retl @@ -451,14 +451,14 @@ ; WIN64-NEXT: .seh_pushreg %rsp ; WIN64-NEXT: subq $16, %rsp ; WIN64-NEXT: .seh_stackalloc 16 -; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 0 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero ; WIN64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 ; WIN64-NEXT: callq test_argRetDouble ; WIN64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 -; WIN64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; WIN64-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; WIN64-NEXT: addq $16, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: retq @@ -469,7 +469,7 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $16, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 @@ -477,7 +477,7 @@ ; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 ; LINUXOSX64-NEXT: callq test_argRetDouble ; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 -; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $16, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -518,7 +518,7 @@ ; X32-NEXT: andl $-8, %esp ; X32-NEXT: subl $8, %esp ; X32-NEXT: fstpl (%esp) -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -527,14 +527,14 @@ ; WIN64: # 
%bb.0: ; WIN64-NEXT: pushq %rax ; WIN64-NEXT: fstpl (%rsp) -; WIN64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; WIN64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; WIN64-NEXT: popq %rax ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: test_argParamf80: ; LINUXOSX64: # %bb.0: ; LINUXOSX64-NEXT: fstpl -{{[0-9]+}}(%rsp) -; LINUXOSX64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; LINUXOSX64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; LINUXOSX64-NEXT: retq %r0 = fptrunc x86_fp80 %a0 to double ret double %r0 @@ -717,7 +717,7 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %esp ; X32-NEXT: subl $40, %esp -; X32-NEXT: vmovups %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X32-NEXT: vmovdqa %xmm1, %xmm4 ; X32-NEXT: vpslld $31, %xmm0, %xmm1 ; X32-NEXT: vpmovd2m %xmm1, %k1 @@ -727,7 +727,7 @@ ; X32-NEXT: calll _test_argRet128Vector ; X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X32-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload +; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload ; X32-NEXT: addl $40, %esp ; X32-NEXT: popl %esp ; X32-NEXT: retl @@ -738,7 +738,7 @@ ; WIN64-NEXT: .seh_pushreg %rsp ; WIN64-NEXT: subq $32, %rsp ; WIN64-NEXT: .seh_stackalloc 32 -; WIN64-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; WIN64-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 16 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovdqa %xmm1, %xmm8 @@ -750,7 +750,7 @@ ; WIN64-NEXT: callq test_argRet128Vector ; WIN64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; WIN64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1} -; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; WIN64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; WIN64-NEXT: addq $32, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: retq @@ -761,7 +761,7 @@ ; LINUXOSX64-NEXT: pushq %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: subq $32, %rsp -; LINUXOSX64-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; LINUXOSX64-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 48 ; LINUXOSX64-NEXT: .cfi_offset %rsp, -16 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 @@ -774,7 +774,7 @@ ; LINUXOSX64-NEXT: callq test_argRet128Vector ; LINUXOSX64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; LINUXOSX64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1} -; LINUXOSX64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; LINUXOSX64-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $32, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp @@ -962,8 +962,8 @@ ; X32-LABEL: testf32_inp: ; X32: # %bb.0: ; X32-NEXT: subl $44, %esp -; X32-NEXT: vmovups %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X32-NEXT: vmovups %xmm6, (%esp) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X32-NEXT: vmovdqu %xmm6, (%esp) # 16-byte Spill ; X32-NEXT: vaddps %zmm2, %zmm0, %zmm6 ; X32-NEXT: vaddps %zmm3, %zmm1, %zmm7 ; X32-NEXT: vmulps %zmm2, %zmm0, %zmm0 @@ -972,8 +972,8 @@ ; X32-NEXT: vsubps %zmm1, %zmm7, %zmm1 ; X32-NEXT: vaddps %zmm4, %zmm0, %zmm0 ; X32-NEXT: vaddps %zmm5, %zmm1, %zmm1 -; X32-NEXT: vmovups (%esp), %xmm6 # 16-byte Reload -; X32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload +; X32-NEXT: vmovdqu (%esp), %xmm6 # 16-byte Reload +; X32-NEXT: 
vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload ; X32-NEXT: addl $44, %esp ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx512-rotate.ll b/llvm/test/CodeGen/X86/avx512-rotate.ll --- a/llvm/test/CodeGen/X86/avx512-rotate.ll +++ b/llvm/test/CodeGen/X86/avx512-rotate.ll @@ -245,7 +245,7 @@ define <8 x i64> @test_fold_rol_v8i64() { ; CHECK-LABEL: test_fold_rol_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll --- a/llvm/test/CodeGen/X86/avx512-scalar_mask.ll +++ b/llvm/test/CodeGen/X86/avx512-scalar_mask.ll @@ -37,8 +37,8 @@ define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { ; CHECK-LABEL: test_const0_maskz: ; CHECK: ## %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4) ret < 4 x float> %res @@ -57,8 +57,8 @@ define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) { ; CHECK-LABEL: test_const2_maskz: ; CHECK: ## %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4) ret < 4 x float> %res diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -114,7 +114,7 @@ ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp -; X86-NEXT: vmovaps 8(%ebp), %zmm1 +; X86-NEXT: vmovdqa64 8(%ebp), %zmm1 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp ; X86-NEXT: .cfi_def_cfa %esp, 4 @@ -122,7 +122,7 @@ ; ; X64-LABEL: select04: ; X64: # %bb.0: -; X64-NEXT: vmovaps %zmm3, %zmm1 +; X64-NEXT: vmovdqa64 %zmm3, %zmm1 ; X64-NEXT: retq %sel = select <16 x i1> , <16 x double> %a, <16 x double> %b ret <16 x double> %sel @@ -439,7 +439,7 @@ ; X86-NEXT: testb $1, {{[0-9]+}}(%esp) ; X86-NEXT: jne .LBB14_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: vmovaps %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: .LBB14_2: ; X86-NEXT: retl ; @@ -448,7 +448,7 @@ ; X64-NEXT: testb $1, %dil ; X64-NEXT: jne .LBB14_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: vmovaps %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: .LBB14_2: ; X64-NEXT: retq %a = select i1 %c, <32 x i16> %x, <32 x i16> %y @@ -461,7 +461,7 @@ ; X86-NEXT: testb $1, {{[0-9]+}}(%esp) ; X86-NEXT: jne .LBB15_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: vmovaps %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: .LBB15_2: ; X86-NEXT: retl ; @@ -470,7 +470,7 @@ ; X64-NEXT: testb $1, %dil ; X64-NEXT: jne .LBB15_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: vmovaps 
%zmm1, %zmm0 +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: .LBB15_2: ; X64-NEXT: retq %a = select i1 %c, <64 x i8> %x, <64 x i8> %y diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll @@ -4,7 +4,7 @@ define <4 x double> @test_double_to_4(double %s) { ; CHECK-LABEL: test_double_to_4: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq %vec = insertelement <2 x double> undef, double %s, i32 0 %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> @@ -125,7 +125,7 @@ define <8 x double> @test_double_to_8(double %s) { ; CHECK-LABEL: test_double_to_8: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ; CHECK-NEXT: retq %vec = insertelement <2 x double> undef, double %s, i32 0 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> @@ -246,7 +246,7 @@ define <4 x float> @test_float_to_4(float %s) { ; CHECK-LABEL: test_float_to_4: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: retq %vec = insertelement <2 x float> undef, float %s, i32 0 %res = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> @@ -367,7 +367,7 @@ define <8 x float> @test_float_to_8(float %s) { ; CHECK-LABEL: test_float_to_8: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: retq %vec = insertelement <2 x float> undef, float %s, i32 0 %res = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> @@ -488,7 +488,7 @@ define <16 x float> @test_float_to_16(float %s) { ; CHECK-LABEL: test_float_to_16: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 ; CHECK-NEXT: retq %vec = insertelement <2 x float> undef, float %s, i32 0 %res = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> @@ -609,7 +609,7 @@ define <4 x double> @test_double_to_4_mem(double* %p) { ; CHECK-LABEL: test_double_to_4_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 ; CHECK-NEXT: retq %s = load double, double* %p %vec = insertelement <2 x double> undef, double %s, i32 0 @@ -735,7 +735,7 @@ define <8 x double> @test_double_to_8_mem(double* %p) { ; CHECK-LABEL: test_double_to_8_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 ; CHECK-NEXT: retq %s = load double, double* %p %vec = insertelement <2 x double> undef, double %s, i32 0 @@ -861,7 +861,7 @@ define <4 x float> @test_float_to_4_mem(float* %p) { ; CHECK-LABEL: test_float_to_4_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 +; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 ; CHECK-NEXT: retq %s = load float, float* %p %vec = insertelement <2 x float> undef, float %s, i32 0 @@ -987,7 +987,7 @@ define <8 x float> @test_float_to_8_mem(float* %p) { ; CHECK-LABEL: test_float_to_8_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 +; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 ; CHECK-NEXT: retq %s = load float, float* %p %vec = insertelement <2 x float> undef, float %s, i32 0 @@ -1113,7 +1113,7 @@ define <16 x float> @test_float_to_16_mem(float* %p) { ; CHECK-LABEL: test_float_to_16_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vbroadcastss (%rdi), %zmm0 +; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 ; CHECK-NEXT: retq %s = load float, float* %p %vec = insertelement <2 x float> undef, float %s, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll @@ -1970,7 +1970,7 @@ define <4 x i32> @test_i32_to_4_mem(i32* %p) { ; CHECK-LABEL: test_i32_to_4_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 +; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 ; CHECK-NEXT: retq %s = load i32, i32* %p %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -2088,7 +2088,7 @@ define <8 x i32> @test_i32_to_8_mem(i32* %p) { ; CHECK-LABEL: test_i32_to_8_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 +; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 ; CHECK-NEXT: retq %s = load i32, i32* %p %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -2206,7 +2206,7 @@ define <16 x i32> @test_i32_to_16_mem(i32* %p) { ; CHECK-LABEL: test_i32_to_16_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 +; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 ; CHECK-NEXT: retq %s = load i32, i32* %p %vec = insertelement <2 x i32> undef, i32 %s, i32 0 @@ -2324,7 +2324,7 @@ define <2 x i64> @test_i64_to_2_mem(i64* %p) { ; CHECK-LABEL: test_i64_to_2_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 ; CHECK-NEXT: retq %s = load i64, i64* %p %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -2388,7 +2388,7 @@ define <4 x i64> @test_i64_to_4_mem(i64* %p) { ; CHECK-LABEL: test_i64_to_4_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 ; CHECK-NEXT: retq %s = load i64, i64* %p %vec = insertelement <2 x i64> undef, i64 %s, i32 0 @@ -2506,7 +2506,7 @@ define <8 x i64> @test_i64_to_8_mem(i64* %p) { ; CHECK-LABEL: test_i64_to_8_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 ; CHECK-NEXT: retq %s = load i64, i64* %p %vec = insertelement <2 x i64> undef, i64 %s, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll @@ -4,7 +4,7 @@ define <8 x float> @test_2xfloat_to_8xfloat(<8 x float> %vec) { ; CHECK-LABEL: test_2xfloat_to_8xfloat: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res @@ -116,7 +116,7 @@ define <16 x float> @test_2xfloat_to_16xfloat(<16 x float> %vec) { ; CHECK-LABEL: test_2xfloat_to_16xfloat: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res @@ -579,7 +579,7 @@ define <8 x float> @test_2xfloat_to_8xfloat_mem(<2 x float>* %vp) { ; CHECK-LABEL: test_2xfloat_to_8xfloat_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 ; CHECK-NEXT: retq %vec = load <2 x float>, <2 x float>* %vp %res = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> @@ 
-696,7 +696,7 @@ define <16 x float> @test_2xfloat_to_16xfloat_mem(<2 x float>* %vp) { ; CHECK-LABEL: test_2xfloat_to_16xfloat_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 ; CHECK-NEXT: retq %vec = load <2 x float>, <2 x float>* %vp %res = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll @@ -4,7 +4,7 @@ define <4 x i32> @test_2xi32_to_4xi32(<4 x i32> %vec) { ; CHECK-LABEL: test_2xi32_to_4xi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: retq %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -108,7 +108,7 @@ define <8 x i32> @test_2xi32_to_8xi32(<8 x i32> %vec) { ; CHECK-LABEL: test_2xi32_to_8xi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -212,7 +212,7 @@ define <16 x i32> @test_2xi32_to_16xi32(<16 x i32> %vec) { ; CHECK-LABEL: test_2xi32_to_16xi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -316,7 +316,7 @@ define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) { ; CHECK-LABEL: test_2xi32_to_4xi32_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> @@ -425,7 +425,7 @@ define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) { ; CHECK-LABEL: test_2xi32_to_8xi32_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> @@ -534,7 +534,7 @@ define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) { ; CHECK-LABEL: test_2xi32_to_16xi32_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 +; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll b/llvm/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll @@ -4,7 +4,7 @@ define <2 x double> @test_2xdouble_dup_low(<2 x double> %vec) { ; CHECK-LABEL: test_2xdouble_dup_low: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: retq %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> ret <2 x double> %res @@ -64,7 +64,7 @@ define <2 x double> @test_2xdouble_dup_low_mem(<2 x double>* %vp) { ; CHECK-LABEL: test_2xdouble_dup_low_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 ; CHECK-NEXT: retq %vec = load <2 x double>, <2 x double>* %vp %res = shufflevector <2 x double> 
%vec, <2 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll @@ -6,7 +6,7 @@ define <4 x float> @test_4xfloat_perm_mask0(<4 x float> %vec) { ; CHECK-LABEL: test_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,1] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> ret <4 x float> %res @@ -92,7 +92,7 @@ define <4 x float> @test_4xfloat_perm_mask3(<4 x float> %vec) { ; CHECK-LABEL: test_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,2] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,2] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> ret <4 x float> %res @@ -126,7 +126,7 @@ define <4 x float> @test_4xfloat_perm_mem_mask0(<4 x float>* %vp) { ; CHECK-LABEL: test_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,1,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,3,1,3] ; CHECK-NEXT: retq %vec = load <4 x float>, <4 x float>* %vp %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> @@ -219,7 +219,7 @@ define <4 x float> @test_4xfloat_perm_mem_mask3(<4 x float>* %vp) { ; CHECK-LABEL: test_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,3,0] ; CHECK-NEXT: retq %vec = load <4 x float>, <4 x float>* %vp %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> @@ -342,7 +342,7 @@ define <8 x float> @test_8xfloat_perm_imm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res @@ -586,7 +586,7 @@ define <8 x float> @test_8xfloat_perm_imm_mem_mask3(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_perm_imm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,0,3,3,4,4,7,7] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,0,3,3,4,4,7,7] ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> @@ -835,7 +835,7 @@ define <16 x float> @test_16xfloat_perm_imm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14] ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res @@ -1079,7 +1079,7 @@ define <16 x float> @test_16xfloat_perm_imm_mem_mask3(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_perm_imm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,3,1,5,4,7,5,9,8,11,9,13,12,15,13] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,0,3,1,5,4,7,5,9,8,11,9,13,12,15,13] ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll 
b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -911,8 +911,8 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,3,2] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1009,8 +1009,8 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,3,2,5] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1330,8 +1330,8 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1430,8 +1430,8 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,0,0,13] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1470,8 +1470,8 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] -; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] +; CHECK-NEXT: vpermd 32(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1773,8 +1773,8 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [12,9,4,10] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1785,7 +1785,7 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) { ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1849,8 +1849,8 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) { ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1] +; CHECK-NEXT: 
vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ -1917,8 +1917,8 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,1] ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -2239,7 +2239,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2] +; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2520,8 +2520,8 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 32(%rdi), %xmm0 -; CHECK-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] +; CHECK-NEXT: vmovdqa 32(%rdi), %xmm0 +; CHECK-NEXT: vpblendd $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2696,8 +2696,8 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,5,2] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3037,8 +3037,8 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,8,9,10] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3142,8 +3142,8 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [10,2,11,6] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3511,7 +3511,7 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) { ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3579,8 +3579,8 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) { ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), 
%xmm0 -; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpblendd $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: retq %vec = load <4 x double>, <4 x double>* %vp %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> @@ -3922,9 +3922,9 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/permute.ll @@ -512,8 +512,8 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -599,8 +599,8 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -634,8 +634,8 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] +; CHECK-NEXT: vpermd (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -728,8 +728,8 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] +; CHECK-NEXT: vpermd (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -766,8 +766,8 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -853,8 +853,8 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask3: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -888,8 +888,8 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] +; CHECK-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> @@ -982,8 +982,8 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] +; CHECK-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> @@ -1020,7 +1020,7 @@ define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) { ; CHECK-LABEL: test_4xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,3,1] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -1100,7 +1100,7 @@ define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) { ; CHECK-LABEL: test_4xi64_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -1132,7 +1132,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) { ; CHECK-LABEL: test_4xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = mem[2,1,2,0] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> @@ -1219,7 +1219,7 @@ define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) { ; CHECK-LABEL: test_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = mem[2,0,1,3] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> @@ -1254,8 +1254,8 @@ define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res @@ -1339,7 +1339,7 @@ define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] ; CHECK-NEXT: retq %res = 
shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res @@ -1421,8 +1421,8 @@ define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res @@ -1480,8 +1480,8 @@ define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] +; CHECK-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> @@ -1572,7 +1572,7 @@ define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> @@ -1661,8 +1661,8 @@ define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] +; CHECK-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> @@ -1725,8 +1725,8 @@ define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res @@ -1818,8 +1818,8 @@ define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res @@ -1855,8 +1855,8 @@ define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] +; CHECK-NEXT: vpermd (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> @@ -1955,8 +1955,8 @@ define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] +; CHECK-NEXT: 
vpermd (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> @@ -1995,8 +1995,8 @@ define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res @@ -2088,8 +2088,8 @@ define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res @@ -2125,8 +2125,8 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] +; CHECK-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> @@ -2225,8 +2225,8 @@ define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] +; CHECK-NEXT: vpermd (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> @@ -2265,7 +2265,7 @@ define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) { ; CHECK-LABEL: test_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res @@ -2351,7 +2351,7 @@ define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) { ; CHECK-LABEL: test_4xdouble_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,2] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res @@ -2385,7 +2385,7 @@ define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) { ; CHECK-LABEL: test_4xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = mem[0,0,2,0] ; CHECK-NEXT: retq %vec = load <4 x double>, <4 x double>* %vp %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> @@ -2478,7 +2478,7 @@ define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) { ; CHECK-LABEL: test_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} 
ymm0 = mem[3,2,3,2] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,3,2] ; CHECK-NEXT: retq %vec = load <4 x double>, <4 x double>* %vp %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> @@ -2515,8 +2515,8 @@ define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res @@ -2606,7 +2606,7 @@ define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res @@ -2694,8 +2694,8 @@ define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] +; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res @@ -2757,8 +2757,8 @@ define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) { ; CHECK-LABEL: test_8xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] +; CHECK-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -2855,7 +2855,7 @@ define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) { ; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> @@ -2950,8 +2950,8 @@ define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) { ; CHECK-LABEL: test_8xdouble_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] +; CHECK-NEXT: vpermq (%rdi), %zmm0, %zmm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll @@ -2091,7 +2091,7 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) { ; CHECK-LABEL: test_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,3,0] ; CHECK-NEXT: retq %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -2171,7 +2171,7 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> 
%vec) { ; CHECK-LABEL: test_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,3] ; CHECK-NEXT: retq %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -2203,7 +2203,7 @@ define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) { ; CHECK-LABEL: test_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,3,3] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> @@ -2290,7 +2290,7 @@ define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) { ; CHECK-LABEL: test_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,1,0] ; CHECK-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %vp %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> @@ -2325,7 +2325,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -2405,7 +2405,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -2437,7 +2437,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -2524,7 +2524,7 @@ define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> @@ -2559,7 +2559,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -2639,7 +2639,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -2671,7 +2671,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = 
mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> @@ -2758,7 +2758,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll @@ -4,7 +4,7 @@ define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) { ; CHECK-LABEL: test_4xfloat_unpack_low_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res @@ -90,7 +90,7 @@ define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) { ; CHECK-LABEL: test_4xfloat_unpack_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res @@ -124,7 +124,7 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) { ; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: retq %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> @@ -220,7 +220,7 @@ define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) { ; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: retq %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> @@ -258,7 +258,7 @@ define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_unpack_low_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -344,7 +344,7 @@ define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_unpack_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: vpunpckldq {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -378,7 +378,7 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; CHECK-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -474,7 +474,7 @@ define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; CHECK-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -512,7 +512,7 @@ define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) { ; CHECK-LABEL: test_16xfloat_unpack_low_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res @@ -598,7 +598,7 @@ define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) { ; CHECK-LABEL: test_16xfloat_unpack_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res @@ -632,7 +632,7 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { ; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] ; CHECK-NEXT: retq %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> @@ -728,7 +728,7 @@ define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { ; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = 
zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] ; CHECK-NEXT: retq %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> @@ -766,7 +766,7 @@ define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) { ; CHECK-LABEL: test_2xdouble_unpack_low_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> ret <2 x double> %res @@ -826,7 +826,7 @@ define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) { ; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: retq %vec2 = load <2 x double>, <2 x double>* %vec2p %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> @@ -893,7 +893,7 @@ define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_unpack_low_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -979,7 +979,7 @@ define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_unpack_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -1013,7 +1013,7 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -1109,7 +1109,7 @@ define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -1147,7 +1147,7 @@ define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) { ; CHECK-LABEL: test_8xdouble_unpack_low_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res @@ -1233,7 +1233,7 @@ define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) { 
; CHECK-LABEL: test_8xdouble_unpack_low_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res @@ -1267,7 +1267,7 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { ; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] ; CHECK-NEXT: retq %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -1363,7 +1363,7 @@ define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { ; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] ; CHECK-NEXT: retq %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -1401,7 +1401,7 @@ define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) { ; CHECK-LABEL: test_4xfloat_unpack_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res @@ -1487,7 +1487,7 @@ define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) { ; CHECK-LABEL: test_4xfloat_unpack_high_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res @@ -1521,7 +1521,7 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) { ; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: retq %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> @@ -1617,7 +1617,7 @@ define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) { ; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; CHECK-NEXT: retq %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> @@ -1655,7 +1655,7 @@ define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_unpack_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: 
vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -1741,7 +1741,7 @@ define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_unpack_high_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res @@ -1775,7 +1775,7 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; CHECK-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -1871,7 +1871,7 @@ define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; CHECK-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -1909,7 +1909,7 @@ define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) { ; CHECK-LABEL: test_16xfloat_unpack_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res @@ -1995,7 +1995,7 @@ define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) { ; CHECK-LABEL: test_16xfloat_unpack_high_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res @@ -2029,7 +2029,7 @@ define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { ; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = 
zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] ; CHECK-NEXT: retq %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> @@ -2125,7 +2125,7 @@ define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { ; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] ; CHECK-NEXT: retq %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> @@ -2163,7 +2163,7 @@ define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) { ; CHECK-LABEL: test_2xdouble_unpack_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-NEXT: retq %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> ret <2 x double> %res @@ -2223,7 +2223,7 @@ define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) { ; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] ; CHECK-NEXT: retq %vec2 = load <2 x double>, <2 x double>* %vec2p %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> @@ -2290,7 +2290,7 @@ define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_unpack_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -2376,7 +2376,7 @@ define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_unpack_high_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-NEXT: retq %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res @@ -2410,7 +2410,7 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -2506,7 +2506,7 @@ define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* 
%vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -2544,7 +2544,7 @@ define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) { ; CHECK-LABEL: test_8xdouble_unpack_high_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res @@ -2630,7 +2630,7 @@ define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) { ; CHECK-LABEL: test_8xdouble_unpack_high_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res @@ -2664,7 +2664,7 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { ; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] ; CHECK-NEXT: retq %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> @@ -2760,7 +2760,7 @@ define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { ; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] ; CHECK-NEXT: retq %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -269,7 +269,7 @@ define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { ; ALL-LABEL: trunc_qd_128: ; ALL: ## %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; ALL-NEXT: retq %x = trunc <2 x i64> %i to <2 x i32> ret <2 x i32> %x @@ -278,8 +278,8 @@ define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { ; KNL-LABEL: trunc_qd_128_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vmovlps %xmm0, (%rdi) +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vmovq %xmm0, (%rdi) ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qd_128_mem: diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=ALL -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw | FileCheck %s --check-prefix=ALL +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | 
FileCheck %s --check-prefixes=ALL,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW define <16 x i32> @_inreg16xi32(i32 %a) { ; ALL-LABEL: _inreg16xi32: @@ -25,7 +25,7 @@ define <16 x float> @_ss16xfloat_v4(<4 x float> %a) { ; ALL-LABEL: _ss16xfloat_v4: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b @@ -34,7 +34,7 @@ define <16 x float> @_inreg16xfloat(float %a) { ; ALL-LABEL: _inreg16xfloat: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer @@ -71,7 +71,7 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) { ; ALL-LABEL: _ss16xfloat_load: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss (%rdi), %zmm0 +; ALL-NEXT: vpbroadcastd (%rdi), %zmm0 ; ALL-NEXT: retq %a = load float, float* %a.ptr %b = insertelement <16 x float> undef, float %a, i32 0 @@ -110,7 +110,7 @@ define <8 x double> @_inreg8xdouble(double %a) { ; ALL-LABEL: _inreg8xdouble: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: retq %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer @@ -149,7 +149,7 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) { ; ALL-LABEL: _sd8xdouble_load: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 +; ALL-NEXT: vpbroadcastq (%rdi), %zmm0 ; ALL-NEXT: retq %a = load double, double* %a.ptr %b = insertelement <8 x double> undef, double %a, i32 0 @@ -190,7 +190,7 @@ define <16 x i32> @_xmm16xi32(<16 x i32> %a) { ; ALL-LABEL: _xmm16xi32: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %b @@ -199,7 +199,7 @@ define <16 x float> @_xmm16xfloat(<16 x float> %a) { ; ALL-LABEL: _xmm16xfloat: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b @@ -227,7 +227,7 @@ define <8 x double> @test_set1_pd(double %d) #2 { ; ALL-LABEL: test_set1_pd: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: retq entry: %vecinit.i = insertelement <8 x double> undef, double %d, i32 0 @@ -261,7 +261,7 @@ define <16 x float> @test_set1_ps(float %f) #2 { ; ALL-LABEL: test_set1_ps: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq entry: %vecinit.i = insertelement <16 x float> undef, float %f, i32 0 @@ -313,7 +313,7 @@ define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { ; ALL-LABEL: test_mm512_broadcastsd_pd: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: retq entry: %0 = extractelement <2 x double> %a, i32 0 @@ -331,7 +331,7 @@ define <16 x float> @test1(<8 x float>%a) { ; ALL-LABEL: test1: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <8 x float> 
%a, <8 x float> undef, <16 x i32> zeroinitializer ret <16 x float>%res @@ -340,7 +340,7 @@ define <8 x double> @test2(<4 x double>%a) { ; ALL-LABEL: test2: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> zeroinitializer ret <8 x double>%res @@ -379,7 +379,7 @@ define <16 x i32> @_invec8xi32(<8 x i32>%a) { ; ALL-LABEL: _invec8xi32: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32>%res @@ -388,7 +388,7 @@ define <8 x i64> @_invec4xi64(<4 x i64>%a) { ; ALL-LABEL: _invec4xi64: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <4 x i64> %a, <4 x i64> undef, <8 x i32> zeroinitializer ret <8 x i64>%res @@ -402,8 +402,8 @@ ; ALL-NEXT: .cfi_def_cfa_offset 32 ; ALL-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; ALL-NEXT: callq func_f32 -; ALL-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload +; ALL-NEXT: callq func_f32@PLT +; ALL-NEXT: vpbroadcastd (%rsp), %zmm0 # 16-byte Folded Reload ; ALL-NEXT: addq $24, %rsp ; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq @@ -422,8 +422,8 @@ ; ALL-NEXT: .cfi_def_cfa_offset 32 ; ALL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; ALL-NEXT: callq func_f64 -; ALL-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload +; ALL-NEXT: callq func_f64@PLT +; ALL-NEXT: vpbroadcastq (%rsp), %zmm0 # 16-byte Folded Reload ; ALL-NEXT: addq $24, %rsp ; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -194,9 +194,9 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512-LABEL: PR29088: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512-NEXT: vmovaps %ymm1, (%rsi) +; X64-AVX512-NEXT: vmovdqa %ymm1, (%rsi) ; X64-AVX512-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1387,12 +1387,12 @@ define <2 x i64> @PR41066(<2 x i64> %t0, <2 x double> %x, <2 x double> %y) { ; AVX512-LABEL: PR41066: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x57,0xc0] +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] ; AVX512-NEXT: retq ## encoding: [0xc3] ; ; SKX-LABEL: PR41066: ; SKX: ## %bb.0: -; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0] +; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; SKX-NEXT: retq ## encoding: [0xc3] %t1 = fcmp ogt <2 x double> %x, %y %t2 = select <2 x i1> %t1, <2 x i64> , <2 x i64> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512-vselect-crash.ll b/llvm/test/CodeGen/X86/avx512-vselect-crash.ll --- 
a/llvm/test/CodeGen/X86/avx512-vselect-crash.ll +++ b/llvm/test/CodeGen/X86/avx512-vselect-crash.ll @@ -4,7 +4,7 @@ define <16 x i32> @test() { ; CHECK-LABEL: test: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq entry: %0 = icmp slt <16 x i32> undef, undef diff --git a/llvm/test/CodeGen/X86/avx512bw-mov.ll b/llvm/test/CodeGen/X86/avx512bw-mov.ll --- a/llvm/test/CodeGen/X86/avx512bw-mov.ll +++ b/llvm/test/CodeGen/X86/avx512bw-mov.ll @@ -4,7 +4,7 @@ define <64 x i8> @test1(i8 * %addr) { ; CHECK-LABEL: test1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ; CHECK-NEXT: retq %vaddr = bitcast i8* %addr to <64 x i8>* %res = load <64 x i8>, <64 x i8>* %vaddr, align 1 @@ -14,7 +14,7 @@ define void @test2(i8 * %addr, <64 x i8> %data) { ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ; CHECK-NEXT: retq %vaddr = bitcast i8* %addr to <64 x i8>* store <64 x i8>%data, <64 x i8>* %vaddr, align 1 @@ -50,7 +50,7 @@ define <32 x i16> @test5(i8 * %addr) { ; CHECK-LABEL: test5: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 +; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ; CHECK-NEXT: retq %vaddr = bitcast i8* %addr to <32 x i16>* %res = load <32 x i16>, <32 x i16>* %vaddr, align 1 @@ -60,7 +60,7 @@ define void @test6(i8 * %addr, <32 x i16> %data) { ; CHECK-LABEL: test6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) +; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ; CHECK-NEXT: retq %vaddr = bitcast i8* %addr to <32 x i16>* store <32 x i16>%data, <32 x i16>* %vaddr, align 1 diff --git a/llvm/test/CodeGen/X86/avx512bwvl-mov.ll b/llvm/test/CodeGen/X86/avx512bwvl-mov.ll --- a/llvm/test/CodeGen/X86/avx512bwvl-mov.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-mov.ll @@ -4,7 +4,7 @@ define <32 x i8> @test_256_1(i8 * %addr) { ; CHECK-LABEL: test_256_1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <32 x i8>* %res = load <32 x i8>, <32 x i8>* %vaddr, align 1 @@ -14,7 +14,7 @@ define void @test_256_2(i8 * %addr, <32 x i8> %data) { ; CHECK-LABEL: test_256_2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] +; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <32 x i8>* store <32 x i8>%data, <32 x i8>* %vaddr, align 1 @@ -50,7 +50,7 @@ define <16 x i16> @test_256_5(i8 * %addr) { ; CHECK-LABEL: test_256_5: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i16>* %res = load <16 x i16>, <16 x i16>* %vaddr, align 1 @@ -60,7 +60,7 @@ define void @test_256_6(i8 * %addr, <16 x i16> %data) { ; CHECK-LABEL: test_256_6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] +; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x 
i16>* store <16 x i16>%data, <16 x i16>* %vaddr, align 1 @@ -96,7 +96,7 @@ define <16 x i8> @test_128_1(i8 * %addr) { ; CHECK-LABEL: test_128_1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i8>* %res = load <16 x i8>, <16 x i8>* %vaddr, align 1 @@ -106,7 +106,7 @@ define void @test_128_2(i8 * %addr, <16 x i8> %data) { ; CHECK-LABEL: test_128_2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <16 x i8>* store <16 x i8>%data, <16 x i8>* %vaddr, align 1 @@ -142,7 +142,7 @@ define <8 x i16> @test_128_5(i8 * %addr) { ; CHECK-LABEL: test_128_5: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i16>* %res = load <8 x i16>, <8 x i16>* %vaddr, align 1 @@ -152,7 +152,7 @@ define void @test_128_6(i8 * %addr, <8 x i16> %data) { ; CHECK-LABEL: test_128_6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i16>* store <8 x i16>%data, <8 x i16>* %vaddr, align 1 diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -7,7 +7,7 @@ define <2 x double>@test_int_x86_avx512_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vextractf64x2_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x39,0xc0,0x01] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) @@ -57,7 +57,7 @@ define <8 x float>@test_int_x86_avx512_vextractf32x8(<16 x float> %x0, <8 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vextractf32x8: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc0,0x01] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res @@ -102,7 +102,7 @@ define <16 x float>@test_int_x86_avx512_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3) { ; CHECK-LABEL: test_int_x86_avx512_insertf32x8_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: 
[0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) ret <16 x float> %res @@ -147,7 +147,7 @@ define <8 x double>@test_int_x86_avx512_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3) { ; CHECK-LABEL: test_int_x86_avx512_insertf64x2_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) ret <8 x double> %res @@ -192,7 +192,7 @@ define <16 x i32>@test_int_x86_avx512_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3) { ; CHECK-LABEL: test_int_x86_avx512_inserti32x8_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) ret <16 x i32> %res @@ -237,7 +237,7 @@ define <8 x i64>@test_int_x86_avx512_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_inserti64x2_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) ret <8 x i64> %res @@ -320,7 +320,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcastf32x8_512: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1) @@ -394,8 +394,8 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcastf64x2_512: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1) @@ -473,7 +473,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcasti32x8_512: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01] ; 
CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1) @@ -547,8 +547,8 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcasti64x2_512: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1) diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -5,7 +5,7 @@ define <4 x float> @test_mask_andnot_ps_rr_128(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: test_mask_andnot_ps_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1] +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res @@ -15,15 +15,15 @@ ; X86-LABEL: test_mask_andnot_ps_rrk_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vpandnd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vpandnd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -33,13 +33,13 @@ ; X86-LABEL: test_mask_andnot_ps_rrkz_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1] +; X86-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1] +; X64-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: 
[0x62,0xf1,0x7d,0x89,0xdf,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -49,12 +49,12 @@ ; X86-LABEL: test_mask_andnot_ps_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0x00] +; X86-NEXT: vpandn (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rm_128: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0x07] +; X64-NEXT: vpandn (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) @@ -66,15 +66,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpandnd (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpandnd (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -86,13 +86,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0x00] +; X86-NEXT: vpandnd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07] +; X64-NEXT: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -103,12 +103,12 @@ ; X86-LABEL: test_mask_andnot_ps_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x55,0x00] +; 
X86-NEXT: vpandnd (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmb_128: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07] +; X64-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -122,15 +122,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x55,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpandnd (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -144,13 +144,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x55,0x00] +; X86-NEXT: vpandnd (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07] +; X64-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -164,7 +164,7 @@ define <8 x float> @test_mask_andnot_ps_rr_256(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: test_mask_andnot_ps_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0xc1] +; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res @@ -174,15 +174,15 @@ ; X86-LABEL: test_mask_andnot_ps_rrk_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: 
vpandnd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vpandnd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -192,13 +192,13 @@ ; X86-LABEL: test_mask_andnot_ps_rrkz_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1] +; X86-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1] +; X64-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -208,12 +208,12 @@ ; X86-LABEL: test_mask_andnot_ps_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0x00] +; X86-NEXT: vpandn (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rm_256: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0x07] +; X64-NEXT: vpandn (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) @@ -225,15 +225,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpandnd (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpandnd (%rdi), 
%ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -245,13 +245,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x00] +; X86-NEXT: vpandnd (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07] +; X64-NEXT: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -262,12 +262,12 @@ ; X86-LABEL: test_mask_andnot_ps_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x55,0x00] +; X86-NEXT: vpandnd (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmb_256: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07] +; X64-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -281,15 +281,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x55,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpandnd (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -303,13 +303,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # 
encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x00] +; X86-NEXT: vpandnd (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07] +; X64-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -323,7 +323,7 @@ define <16 x float> @test_mask_andnot_ps_rr_512(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: test_mask_andnot_ps_rr_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1] +; CHECK-NEXT: vpandnd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdf,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) ret <16 x float> %res @@ -333,15 +333,15 @@ ; X86-LABEL: test_mask_andnot_ps_rrk_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: vpandnd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdf,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vpandnd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdf,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -351,13 +351,13 @@ ; X86-LABEL: test_mask_andnot_ps_rrkz_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1] +; X86-NEXT: vpandnd %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdf,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rrkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1] +; X64-NEXT: vpandnd %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdf,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -367,12 +367,12 @@ ; X86-LABEL: test_mask_andnot_ps_rm_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0x00] +; X86-NEXT: vpandnd (%eax), %zmm0, %zmm0 # encoding: 
[0x62,0xf1,0x7d,0x48,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rm_512: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07] +; X64-NEXT: vpandnd (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) @@ -384,15 +384,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpandnd (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdf,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x55,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpandnd (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdf,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -404,13 +404,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x00] +; X86-NEXT: vpandnd (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x07] +; X64-NEXT: vpandnd (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -421,12 +421,12 @@ ; X86-LABEL: test_mask_andnot_ps_rmb_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x55,0x00] +; X86-NEXT: vpandnd (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmb_512: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07] +; X64-NEXT: vpandnd (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -440,15 +440,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: 
[0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x55,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpandnd (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xdf,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x55,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpandnd (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xdf,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -462,13 +462,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandnps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x00] +; X86-NEXT: vpandnd (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_ps_rmbkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x07] +; X64-NEXT: vpandnd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -482,7 +482,7 @@ define <4 x float> @test_mask_and_ps_rr_128(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: test_mask_and_ps_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res @@ -492,15 +492,15 @@ ; X86-LABEL: test_mask_and_ps_rrk_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vpandd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vpandd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -510,13 +510,13 @@ ; X86-LABEL: test_mask_and_ps_rrkz_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1] +; X86-NEXT: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1] +; X64-NEXT: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -526,12 +526,12 @@ ; X86-LABEL: test_mask_and_ps_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0x00] +; X86-NEXT: vpand (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rm_128: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0x07] +; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) @@ -543,15 +543,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpandd (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpandd (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -563,13 +563,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0x00] +; X86-NEXT: vpandd (%eax), %xmm0, 
%xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07] +; X64-NEXT: vpandd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -580,12 +580,12 @@ ; X86-LABEL: test_mask_and_ps_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x54,0x00] +; X86-NEXT: vpandd (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmb_128: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07] +; X64-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -599,15 +599,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x54,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpandd (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -621,13 +621,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x54,0x00] +; X86-NEXT: vpandd (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07] +; X64-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -641,7 +641,7 @@ define <8 x float> @test_mask_and_ps_rr_256(<8 x float> %a, 
<8 x float> %b) { ; CHECK-LABEL: test_mask_and_ps_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0xc1] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res @@ -651,15 +651,15 @@ ; X86-LABEL: test_mask_and_ps_rrk_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: vpandd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vpandd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -669,13 +669,13 @@ ; X86-LABEL: test_mask_and_ps_rrkz_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1] +; X86-NEXT: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1] +; X64-NEXT: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -685,12 +685,12 @@ ; X86-LABEL: test_mask_and_ps_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0x00] +; X86-NEXT: vpand (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rm_256: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0x07] +; X64-NEXT: vpand (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) @@ -702,15 +702,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb 
{{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpandd (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpandd (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -722,13 +722,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x00] +; X86-NEXT: vpandd (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07] +; X64-NEXT: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -739,12 +739,12 @@ ; X86-LABEL: test_mask_and_ps_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x54,0x00] +; X86-NEXT: vpandd (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmb_256: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07] +; X64-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -758,15 +758,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x54,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpandd (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: 
[0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -780,13 +780,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x00] +; X86-NEXT: vpandd (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07] +; X64-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -800,7 +800,7 @@ define <16 x float> @test_mask_and_ps_rr_512(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: test_mask_and_ps_rr_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1] +; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) ret <16 x float> %res @@ -810,15 +810,15 @@ ; X86-LABEL: test_mask_and_ps_rrk_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdb,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdb,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -828,13 +828,13 @@ ; X86-LABEL: test_mask_and_ps_rrkz_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1] +; X86-NEXT: vpandd %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdb,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rrkz_512: ; 
X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1] +; X64-NEXT: vpandd %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdb,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -844,12 +844,12 @@ ; X86-LABEL: test_mask_and_ps_rm_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0x00] +; X86-NEXT: vpandd (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rm_512: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07] +; X64-NEXT: vpandd (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) @@ -861,15 +861,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpandd (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdb,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x54,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpandd (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdb,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -881,13 +881,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x00] +; X86-NEXT: vpandd (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x07] +; X64-NEXT: vpandd (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -898,12 +898,12 @@ ; X86-LABEL: test_mask_and_ps_rmb_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # 
encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x54,0x00] +; X86-NEXT: vpandd (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmb_512: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07] +; X64-NEXT: vpandd (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -917,15 +917,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x54,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpandd (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xdb,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x54,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpandd (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xdb,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -939,13 +939,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vandps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x00] +; X86-NEXT: vpandd (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_ps_rmbkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x07] +; X64-NEXT: vpandd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -959,7 +959,7 @@ define <4 x float> @test_mask_or_ps_rr_128(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: test_mask_or_ps_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res @@ -969,15 +969,15 @@ ; X86-LABEL: test_mask_or_ps_rrk_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -987,13 +987,13 @@ ; X86-LABEL: test_mask_or_ps_rrkz_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1] +; X86-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1] +; X64-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -1003,12 +1003,12 @@ ; X86-LABEL: test_mask_or_ps_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0x00] +; X86-NEXT: vpor (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rm_128: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0x07] +; X64-NEXT: vpor (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) @@ -1020,15 +1020,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpord (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpord (%rdi), %xmm0, %xmm1 
{%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -1040,13 +1040,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0x00] +; X86-NEXT: vpord (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07] +; X64-NEXT: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -1057,12 +1057,12 @@ ; X86-LABEL: test_mask_or_ps_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x56,0x00] +; X86-NEXT: vpord (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmb_128: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07] +; X64-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1076,15 +1076,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x56,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpord (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1098,13 +1098,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x56,0x00] +; X86-NEXT: 
vpord (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07] +; X64-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1118,7 +1118,7 @@ define <8 x float> @test_mask_or_ps_rr_256(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: test_mask_or_ps_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0xc1] +; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res @@ -1128,15 +1128,15 @@ ; X86-LABEL: test_mask_or_ps_rrk_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: vpord %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vpord %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -1146,13 +1146,13 @@ ; X86-LABEL: test_mask_or_ps_rrkz_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1] +; X86-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1] +; X64-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -1162,12 +1162,12 @@ ; X86-LABEL: test_mask_or_ps_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0x00] +; X86-NEXT: vpor (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0x00] 
; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rm_256: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0x07] +; X64-NEXT: vpor (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) @@ -1179,15 +1179,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpord (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpord (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -1199,13 +1199,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x00] +; X86-NEXT: vpord (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07] +; X64-NEXT: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -1216,12 +1216,12 @@ ; X86-LABEL: test_mask_or_ps_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x56,0x00] +; X86-NEXT: vpord (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmb_256: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07] +; X64-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1235,15 +1235,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; 
X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x56,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpord (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1257,13 +1257,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x00] +; X86-NEXT: vpord (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07] +; X64-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1277,7 +1277,7 @@ define <16 x float> @test_mask_or_ps_rr_512(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: test_mask_or_ps_rr_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1] +; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xeb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) ret <16 x float> %res @@ -1287,15 +1287,15 @@ ; X86-LABEL: test_mask_or_ps_rrk_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X86-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xeb,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xeb,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] 
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -1305,13 +1305,13 @@ ; X86-LABEL: test_mask_or_ps_rrkz_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1] +; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xeb,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rrkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1] +; X64-NEXT: vpord %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xeb,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -1321,12 +1321,12 @@ ; X86-LABEL: test_mask_or_ps_rm_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0x00] +; X86-NEXT: vpord (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rm_512: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07] +; X64-NEXT: vpord (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) @@ -1338,15 +1338,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpord (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xeb,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x56,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpord (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xeb,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -1358,13 +1358,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x00] +; X86-NEXT: vpord (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, 
%k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x07] +; X64-NEXT: vpord (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -1375,12 +1375,12 @@ ; X86-LABEL: test_mask_or_ps_rmb_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x56,0x00] +; X86-NEXT: vpord (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmb_512: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07] +; X64-NEXT: vpord (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1394,15 +1394,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x56,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpord (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xeb,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x56,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpord (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xeb,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1416,13 +1416,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vorps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x00] +; X86-NEXT: vpord (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_ps_rmbkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x07] +; X64-NEXT: vpord (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1436,7 +1436,7 @@ define <4 x float> @test_mask_xor_ps_rr_128(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: test_mask_xor_ps_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 # 
EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) ret <4 x float> %res @@ -1446,15 +1446,15 @@ ; X86-LABEL: test_mask_xor_ps_rrk_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1] -; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: vpxord %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1] -; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: vpxord %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) ret <4 x float> %res @@ -1464,13 +1464,13 @@ ; X86-LABEL: test_mask_xor_ps_rrkz_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1] +; X86-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1] +; X64-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) ret <4 x float> %res @@ -1480,12 +1480,12 @@ ; X86-LABEL: test_mask_xor_ps_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0x00] +; X86-NEXT: vpxor (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rm_128: ; X64: # %bb.0: -; X64-NEXT: vxorps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0x07] +; X64-NEXT: vpxor (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1) @@ -1497,15 +1497,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpxord (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xef,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpxord (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) @@ -1517,13 +1517,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0x00] +; X86-NEXT: vpxord (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07] +; X64-NEXT: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x float>, <4 x float>* %ptr_b %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask) @@ -1534,12 +1534,12 @@ ; X86-LABEL: test_mask_xor_ps_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x57,0x00] +; X86-NEXT: vpxord (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmb_128: ; X64: # %bb.0: -; X64-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07] +; X64-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1553,15 +1553,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x57,0x08] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vpxord (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xef,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbk_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vpxord 
(%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1575,13 +1575,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x57,0x00] +; X86-NEXT: vpxord (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbkz_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07] +; X64-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1595,7 +1595,7 @@ define <8 x float> @test_mask_xor_ps_rr_256(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: test_mask_xor_ps_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0xc1] +; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res @@ -1605,15 +1605,15 @@ ; X86-LABEL: test_mask_xor_ps_rrk_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1] -; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: vpxord %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1] -; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: vpxord %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) ret <8 x float> %res @@ -1623,13 +1623,13 @@ ; X86-LABEL: test_mask_xor_ps_rrkz_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1] +; X86-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf1,0x7c,0xa9,0x57,0xc1] +; X64-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) ret <8 x float> %res @@ -1639,12 +1639,12 @@ ; X86-LABEL: test_mask_xor_ps_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0x00] +; X86-NEXT: vpxor (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rm_256: ; X64: # %bb.0: -; X64-NEXT: vxorps (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0x07] +; X64-NEXT: vpxor (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1) @@ -1656,15 +1656,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpxord (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xef,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpxord (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) @@ -1676,13 +1676,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x00] +; X86-NEXT: vpxord (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07] +; X64-NEXT: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x float>, <8 x float>* %ptr_b %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask) @@ -1693,12 +1693,12 @@ ; X86-LABEL: test_mask_xor_ps_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps 
(%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x57,0x00] +; X86-NEXT: vpxord (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmb_256: ; X64: # %bb.0: -; X64-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07] +; X64-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1712,15 +1712,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x57,0x08] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vpxord (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xef,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbk_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1734,13 +1734,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x00] +; X86-NEXT: vpxord (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbkz_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07] +; X64-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <8 x float> undef, float %q, i32 0 @@ -1754,7 +1754,7 @@ define <16 x float> @test_mask_xor_ps_rr_512(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: test_mask_xor_ps_rr_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1] +; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xef,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) ret <16 x float> %res @@ -1764,15 +1764,15 @@ ; X86-LABEL: test_mask_xor_ps_rrk_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1] -; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] 
+; X86-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xef,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1] -; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; X64-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xef,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) ret <16 x float> %res @@ -1782,13 +1782,13 @@ ; X86-LABEL: test_mask_xor_ps_rrkz_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1] +; X86-NEXT: vpxord %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xef,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rrkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1] +; X64-NEXT: vpxord %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xef,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) ret <16 x float> %res @@ -1798,12 +1798,12 @@ ; X86-LABEL: test_mask_xor_ps_rm_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0x00] +; X86-NEXT: vpxord (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rm_512: ; X64: # %bb.0: -; X64-NEXT: vxorps (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07] +; X64-NEXT: vpxord (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1) @@ -1815,15 +1815,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpxord (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xef,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x57,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpxord (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xef,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: 
[0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) @@ -1835,13 +1835,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x00] +; X86-NEXT: vpxord (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x07] +; X64-NEXT: vpxord (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <16 x float>, <16 x float>* %ptr_b %res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask) @@ -1852,12 +1852,12 @@ ; X86-LABEL: test_mask_xor_ps_rmb_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x57,0x00] +; X86-NEXT: vpxord (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmb_512: ; X64: # %bb.0: -; X64-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07] +; X64-NEXT: vpxord (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1871,15 +1871,15 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x57,0x08] -; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X86-NEXT: vpxord (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xef,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbk_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x59,0x57,0x0f] -; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: vpxord (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0xef,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -1893,13 +1893,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vxorps (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x57,0x00] +; X86-NEXT: vpxord (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xef,0x00] ; X86-NEXT: retl # encoding: 
[0xc3] ; ; X64-LABEL: test_mask_xor_ps_rmbkz_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xd9,0x57,0x07] +; X64-NEXT: vpxord (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %q = load float, float* %ptr_b %vecinit.i = insertelement <16 x float> undef, float %q, i32 0 @@ -2391,7 +2391,7 @@ define <2 x double>@test_int_x86_avx512_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vextractf64x2_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x39,0xc0,0x01] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) @@ -2441,7 +2441,7 @@ define <4 x double>@test_int_x86_avx512_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3) { ; CHECK-LABEL: test_int_x86_avx512_insertf64x2_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) ret <4 x double> %res @@ -2486,7 +2486,7 @@ define <4 x i64>@test_int_x86_avx512_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_inserti64x2_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) ret <4 x i64> %res @@ -2604,7 +2604,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcastf64x2_256: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) @@ -2678,7 +2678,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcasti64x2_256: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) @@ -2815,14 +2815,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) ret <2 x double> %res @@ -2844,14 +2844,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) ret <4 x double> %res @@ -2873,14 +2873,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) ret <2 x double> %res @@ -2902,14 +2902,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8] -; X64-NEXT: vmovaps 
%ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) ret <4 x double> %res @@ -2994,7 +2994,7 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3002,7 +3002,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) @@ -3026,7 +3026,7 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3034,7 +3034,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -18,14 +18,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, 
<2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -47,14 +47,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res @@ -76,14 +76,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -105,14 +105,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res @@ -134,14 +134,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; 
X64-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -326,14 +326,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res @@ -355,14 +355,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -546,14 +546,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res @@ -575,14 +575,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: 
[0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) ret <4 x float> %res @@ -603,14 +603,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> @@ -633,7 +633,7 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -641,7 +641,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %cvt1 = sitofp <4 x i64> %x0 to <4 x float> @@ -667,14 +667,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -696,14 +696,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res @@ -725,14 +725,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -754,14 +754,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res @@ -783,14 +783,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa 
%xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -975,14 +975,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res @@ -1004,14 +1004,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) ret <4 x float> %res @@ -1032,14 +1032,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> 
@llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> @@ -1063,7 +1063,7 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1071,7 +1071,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %cvt = uitofp <4 x i64> %x0 to <4 x float> @@ -1097,14 +1097,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) ret <2 x i64> %res @@ -1289,14 +1289,14 @@ ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -2084,7 +2084,7 @@ define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) { ; CHECK-LABEL: test_mm_broadcastd_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; 
CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2142,7 +2142,7 @@ define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) { ; CHECK-LABEL: test_mm256_broadcastd_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer @@ -2196,7 +2196,7 @@ define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) { ; CHECK-LABEL: test_mm_broadcastq_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %res @@ -2247,7 +2247,7 @@ define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) { ; CHECK-LABEL: test_mm256_broadcastq_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer ret <4 x i64> %res @@ -2298,7 +2298,7 @@ define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) { ; CHECK-LABEL: test_mm256_broadcastsd_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer ret <4 x double> %res @@ -2349,7 +2349,7 @@ define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) { ; CHECK-LABEL: test_mm_broadcastss_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %res @@ -2400,7 +2400,7 @@ define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) { ; CHECK-LABEL: test_mm256_broadcastss_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer ret <8 x float> %res @@ -2447,7 +2447,7 @@ define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) { ; CHECK-LABEL: test_mm_movddup_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer ret <2 x double> %res @@ -2745,7 +2745,7 @@ define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_permutex_epi64: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -2796,7 +2796,7 @@ define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) { ; CHECK-LABEL: test_mm256_permutex_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> ret <4 x double> %res @@ -2847,7 +2847,7 @@ define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) { ; 
CHECK-LABEL: test_mm_shuffle_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> ret <2 x double> %res diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -136,12 +136,12 @@ ; X86-LABEL: test_int_x86_avx512_pbroadcastd_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vbroadcastss (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x00] +; X86-NEXT: vpbroadcastd (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_pbroadcastd_256: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07] +; X64-NEXT: vpbroadcastd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x07] ; X64-NEXT: retq # encoding: [0xc3] %y_32 = load i32, i32 * %y_ptr %y = insertelement <4 x i32> undef, i32 %y_32, i32 0 @@ -194,7 +194,7 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) ret <4 x i32> %res @@ -241,7 +241,7 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) ret <4 x i64> %res @@ -288,8 +288,7 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1) { ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; CHECK-NEXT: # xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) ret <2 x i64> %res @@ -336,7 +335,7 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> undef, i8 -1) ret <4 x double> %res @@ -383,7 +382,7 @@ define <8 
x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xc0] +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) ret <8 x float> %res @@ -430,7 +429,7 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> undef, i8 -1) ret <4 x float> %res @@ -686,8 +685,7 @@ define <2 x double>@test_int_x86_avx512_movddup_128(<2 x double> %x0, <2 x double> %x1) { ; CHECK-LABEL: test_int_x86_avx512_movddup_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; CHECK-NEXT: # xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) ret <2 x double> %res @@ -895,7 +893,7 @@ define <8 x float>@test_int_x86_avx512_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x16] +; CHECK-NEXT: vpshufd $22, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xc0,0x16] ; CHECK-NEXT: # ymm0 = ymm0[2,1,1,0,6,5,5,4] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1) @@ -947,7 +945,7 @@ define <4 x float>@test_int_x86_avx512_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x16] +; CHECK-NEXT: vpshufd $22, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0x16] ; CHECK-NEXT: # xmm0 = xmm0[2,1,1,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1) @@ -999,7 +997,7 @@ define <4 x double>@test_int_x86_avx512_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_perm_df_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xc0,0x03] +; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xc0,0x03] ; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) @@ -1051,7 +1049,7 @@ define <4 x i64>@test_int_x86_avx512_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2) { ; 
CHECK-LABEL: test_int_x86_avx512_perm_di_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xc0,0x03] +; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xc0,0x03] ; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) @@ -1971,7 +1969,7 @@ define <4 x i32>@test_int_x86_avx512_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x03] +; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0x03] ; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) @@ -2023,7 +2021,7 @@ define <8 x i32>@test_int_x86_avx512_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x03] +; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xc0,0x03] ; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,7,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) @@ -2357,7 +2355,7 @@ define <2 x double>@test_int_x86_avx512_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xc1] ; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) @@ -2390,7 +2388,7 @@ define <4 x double>@test_int_x86_avx512_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xc1] +; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) @@ -2423,7 +2421,7 @@ define <4 x float>@test_int_x86_avx512_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xc1] ; CHECK-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) @@ -2456,7 +2454,7 @@ define <8 x 
float>@test_int_x86_avx512_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xc1] +; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) @@ -2489,7 +2487,7 @@ define <2 x double>@test_int_x86_avx512_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] ; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) @@ -2522,7 +2520,7 @@ define <4 x double>@test_int_x86_avx512_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xc1] +; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) @@ -2555,7 +2553,7 @@ define <4 x float>@test_int_x86_avx512_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xc1] ; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) @@ -2588,7 +2586,7 @@ define <8 x float>@test_int_x86_avx512_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xc1] +; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) @@ -2621,7 +2619,7 @@ define <4 x i32>@test_int_x86_avx512_ask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_ask_punpckhd_q_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xc1] ; 
CHECK-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -2654,7 +2652,7 @@ define <4 x i32>@test_int_x86_avx512_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckld_q_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xc1] ; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -2687,7 +2685,7 @@ define <8 x i32>@test_int_x86_avx512_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckhd_q_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xc1] +; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -2720,7 +2718,7 @@ define <8 x i32>@test_int_x86_avx512_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckld_q_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xc1] +; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -2753,7 +2751,7 @@ define <2 x i64>@test_int_x86_avx512_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xc1] ; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -2786,7 +2784,7 @@ define <2 x i64>@test_int_x86_avx512_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] ; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -2819,7 +2817,7 @@ define <4 x i64>@test_int_x86_avx512_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0x14,0xc1] +; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -2852,7 +2850,7 @@ define <4 x i64>@test_int_x86_avx512_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xc1] +; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xc1] ; CHECK-NEXT: # ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -2883,7 +2881,7 @@ define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_and_epi32_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0xc1] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -2929,12 +2927,12 @@ ; X86-LABEL: test_mask_and_epi32_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0x00] +; X86-NEXT: vpand (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_epi32_rm_128: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0x07] +; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) @@ -3048,7 +3046,7 @@ define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_and_epi32_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x54,0xc1] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -3094,12 +3092,12 @@ ; X86-LABEL: test_mask_and_epi32_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandps (%eax), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x54,0x00] +; X86-NEXT: vpand (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_and_epi32_rm_256: ; X64: # %bb.0: -; X64-NEXT: vandps (%rdi), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x54,0x07] +; X64-NEXT: vpand (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x 
i32> zeroinitializer, i8 -1) @@ -3213,7 +3211,7 @@ define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_or_epi32_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0xc1] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -3259,12 +3257,12 @@ ; X86-LABEL: test_mask_or_epi32_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0x00] +; X86-NEXT: vpor (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_epi32_rm_128: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0x07] +; X64-NEXT: vpor (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) @@ -3378,7 +3376,7 @@ define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_or_epi32_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x56,0xc1] +; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -3424,12 +3422,12 @@ ; X86-LABEL: test_mask_or_epi32_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vorps (%eax), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x56,0x00] +; X86-NEXT: vpor (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_or_epi32_rm_256: ; X64: # %bb.0: -; X64-NEXT: vorps (%rdi), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x56,0x07] +; X64-NEXT: vpor (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) @@ -3543,7 +3541,7 @@ define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_xor_epi32_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc1] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -3589,12 +3587,12 @@ ; X86-LABEL: test_mask_xor_epi32_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0x00] +; X86-NEXT: vpxor (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_epi32_rm_128: ; X64: # %bb.0: -; 
X64-NEXT: vxorps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0x07] +; X64-NEXT: vpxor (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) @@ -3708,7 +3706,7 @@ define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_xor_epi32_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x57,0xc1] +; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -3754,12 +3752,12 @@ ; X86-LABEL: test_mask_xor_epi32_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vxorps (%eax), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x57,0x00] +; X86-NEXT: vpxor (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_xor_epi32_rm_256: ; X64: # %bb.0: -; X64-NEXT: vxorps (%rdi), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x57,0x07] +; X64-NEXT: vpxor (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) @@ -3873,7 +3871,7 @@ define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_mask_andnot_epi32_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1] +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) ret <4 x i32> %res @@ -3919,12 +3917,12 @@ ; X86-LABEL: test_mask_andnot_epi32_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0x00] +; X86-NEXT: vpandn (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi32_rm_128: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0x07] +; X64-NEXT: vpandn (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x i32>, <4 x i32>* %ptr_b %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1) @@ -4038,7 +4036,7 @@ define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: test_mask_andnot_epi32_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x55,0xc1] +; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) ret <8 x i32> %res @@ -4084,12 +4082,12 @@ ; X86-LABEL: 
test_mask_andnot_epi32_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x55,0x00] +; X86-NEXT: vpandn (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi32_rm_256: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x55,0x07] +; X64-NEXT: vpandn (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <8 x i32>, <8 x i32>* %ptr_b %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1) @@ -4203,7 +4201,7 @@ define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_mask_andnot_epi64_rr_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1] +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1) ret <2 x i64> %res @@ -4249,12 +4247,12 @@ ; X86-LABEL: test_mask_andnot_epi64_rm_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0x00] +; X86-NEXT: vpandn (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rm_128: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0x07] +; X64-NEXT: vpandn (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <2 x i64>, <2 x i64>* %ptr_b %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1) @@ -4368,7 +4366,7 @@ define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: test_mask_andnot_epi64_rr_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x55,0xc1] +; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1) ret <4 x i64> %res @@ -4414,12 +4412,12 @@ ; X86-LABEL: test_mask_andnot_epi64_rm_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vandnps (%eax), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x55,0x00] +; X86-NEXT: vpandn (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rm_256: ; X64: # %bb.0: -; X64-NEXT: vandnps (%rdi), %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x55,0x07] +; X64-NEXT: vpandn (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <4 x i64>, <4 x i64>* %ptr_b %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1) @@ -5563,7 +5561,7 @@ define <8 x float>@test_int_x86_avx512_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3) { ; CHECK-LABEL: 
test_int_x86_avx512_shuf_f32x4_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) @@ -5617,7 +5615,7 @@ define <4 x double>@test_int_x86_avx512_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_shuf_f64x2_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) @@ -5671,7 +5669,7 @@ define <8 x i32>@test_int_x86_avx512_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_shuf_i32x4_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) @@ -5704,7 +5702,7 @@ define <4 x i64>@test_int_x86_avx512_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3) { ; CHECK-LABEL: test_int_x86_avx512_shuf_i64x2_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) @@ -8715,14 +8713,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) ret <2 x double> %res @@ -8745,14 +8743,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} # encoding: 
[0x62,0xf1,0x7e,0x29,0xe6,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) ret <4 x double> %res @@ -8775,14 +8773,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) ret <2 x double> %res @@ -8805,14 +8803,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) ret <4 x double> %res @@ -9162,7 +9160,7 @@ define <4 x float>@test_int_x86_avx512_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vextractf32x4_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x39,0xc0,0x01] ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) @@ -9214,7 +9212,7 @@ define 
<8 x float>@test_int_x86_avx512_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3) { ; CHECK-LABEL: test_int_x86_avx512_insertf32x4_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) ret <8 x float> %res @@ -9261,7 +9259,7 @@ define <8 x i32>@test_int_x86_avx512_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3) { ; CHECK-LABEL: test_int_x86_avx512_inserti32x4_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc1,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) ret <8 x i32> %res @@ -10451,7 +10449,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcastf32x4_256: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) ret <8 x float> %res @@ -10524,7 +10522,7 @@ ; CHECK-LABEL: test_int_x86_avx512_broadcasti32x4_256: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) ret <8 x i32> %res @@ -11696,14 +11694,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) ret <4 x float> %res @@ -11726,14 +11724,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: 
vcvtdq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) ret <8 x float> %res @@ -11754,14 +11752,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128_rrk: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask) ret <4 x float> %res @@ -11801,14 +11799,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_256_rrk: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask) ret <8 x float> %res @@ -11851,7 +11849,7 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -11859,7 +11857,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2dq %ymm0, 
%xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) @@ -11884,7 +11882,7 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -11892,7 +11890,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 %x2) @@ -11916,14 +11914,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 %x2) ret <4 x double> %res @@ -11946,14 +11944,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> 
%x1, i8 %x2) ret <2 x double> %res @@ -11977,7 +11975,7 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -11985,7 +11983,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) @@ -12009,14 +12007,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) ret <4 x i32> %res @@ -12039,14 +12037,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) ret <8 x i32> %res @@ -12057,7 +12055,7 @@ define <8 x float>@test_int_x86_avx512_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_permvar_sf_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) ret <8 x float> %res @@ -12104,7 +12102,7 @@ define <8 x i32>@test_int_x86_avx512_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_si_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) ret <8 x i32> %res @@ -12151,7 +12149,7 @@ define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_df_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) ret <4 x double> %res @@ -12198,7 +12196,7 @@ define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_di_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) ret <4 x i64> %res @@ -12445,14 +12443,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) ret <4 x float> %res @@ -12475,14 +12473,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: 
[0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) ret <8 x float> %res diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3438,14 +3438,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> @@ -3469,14 +3469,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) ret <4 x float> %res @@ -3498,14 +3498,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; 
X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> @@ -3529,14 +3529,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) ret <4 x i32> %res @@ -3558,14 +3558,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> @@ -3590,7 +3590,7 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3598,7 +3598,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) @@ -3622,14 +3622,14 @@ ; X86-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) ret <4 x i32> %res @@ -3652,14 +3652,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) ret <8 x i32> %res @@ -3682,14 +3682,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) ret <4 x i32> %res @@ -3712,14 +3712,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_mask_cvt_ps2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) ret <8 x i32> %res @@ -3742,14 +3742,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) ret <4 x i32> %res @@ -3771,14 +3771,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> @@ -3802,14 +3802,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; 
X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) ret <4 x i32> %res @@ -3831,14 +3831,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> @@ -3863,7 +3863,7 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3871,7 +3871,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) @@ -3895,14 +3895,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] -; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] -; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) ret <4 x i32> %res @@ -3925,14 +3925,14 @@ ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: 
vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] -; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] -; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) ret <8 x i32> %res @@ -4791,7 +4791,7 @@ define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_df_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) ret <4 x double> %1 @@ -4844,7 +4844,7 @@ define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { ; CHECK-LABEL: test_int_x86_avx512_permvar_di_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll --- a/llvm/test/CodeGen/X86/avx512vl-logic.ll +++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll @@ -229,7 +229,7 @@ ; SKX-LABEL: test_mm256_mask_andnot_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vpandnq %ymm2, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -253,7 +253,7 @@ ; SKX-LABEL: test_mm256_maskz_andnot_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -277,7 +277,7 @@ ; SKX-LABEL: test_mm_mask_andnot_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vpandnq %xmm2, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -301,7 +301,7 @@ ; SKX-LABEL: test_mm_maskz_andnot_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -325,7 +325,7 @@ ; SKX-LABEL: test_mm256_mask_andnot_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vpandnd %ymm2, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -348,7 +348,7 @@ ; SKX-LABEL: test_mm256_maskz_andnot_ps: ; SKX: ## %bb.0: ## 
%entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -371,7 +371,7 @@ ; SKX-LABEL: test_mm_mask_andnot_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vpandnd %xmm2, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -395,7 +395,7 @@ ; SKX-LABEL: test_mm_maskz_andnot_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -419,7 +419,7 @@ ; SKX-LABEL: test_mm256_mask_and_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vpandq %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -442,7 +442,7 @@ ; SKX-LABEL: test_mm256_maskz_and_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vpandq %ymm0, %ymm1, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -465,7 +465,7 @@ ; SKX-LABEL: test_mm_mask_and_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vpandq %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -488,7 +488,7 @@ ; SKX-LABEL: test_mm_maskz_and_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vpandq %xmm0, %xmm1, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -511,7 +511,7 @@ ; SKX-LABEL: test_mm256_mask_and_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vpandd %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -533,7 +533,7 @@ ; SKX-LABEL: test_mm256_maskz_and_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vpandd %ymm0, %ymm1, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -555,7 +555,7 @@ ; SKX-LABEL: test_mm_mask_and_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vpandd %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -578,7 +578,7 @@ ; SKX-LABEL: test_mm_maskz_and_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vpandd %xmm0, %xmm1, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -601,7 +601,7 @@ ; SKX-LABEL: test_mm256_mask_xor_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vpxorq %ymm2, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -624,7 +624,7 @@ ; SKX-LABEL: test_mm256_maskz_xor_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxorq %ymm1, %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -647,7 +647,7 @@ ; SKX-LABEL: 
test_mm_mask_xor_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vpxorq %xmm2, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -670,7 +670,7 @@ ; SKX-LABEL: test_mm_maskz_xor_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vpxorq %xmm1, %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -693,7 +693,7 @@ ; SKX-LABEL: test_mm256_mask_xor_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %ymm2, %ymm1, %ymm0 {%k1} +; SKX-NEXT: vpxord %ymm2, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -715,7 +715,7 @@ ; SKX-LABEL: test_mm256_maskz_xor_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -737,7 +737,7 @@ ; SKX-LABEL: test_mm_mask_xor_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vpxord %xmm2, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -760,7 +760,7 @@ ; SKX-LABEL: test_mm_maskz_xor_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -783,7 +783,7 @@ ; SKX-LABEL: test_mm256_mask_or_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vporq %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -806,7 +806,7 @@ ; SKX-LABEL: test_mm256_maskz_or_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vporq %ymm0, %ymm1, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x double> %__A to <4 x i64> @@ -829,7 +829,7 @@ ; SKX-LABEL: test_mm_mask_or_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vporq %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -852,7 +852,7 @@ ; SKX-LABEL: test_mm_maskz_or_pd: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vporq %xmm0, %xmm1, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <2 x double> %__A to <2 x i64> @@ -875,7 +875,7 @@ ; SKX-LABEL: test_mm256_mask_or_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -897,7 +897,7 @@ ; SKX-LABEL: test_mm256_maskz_or_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %ymm0, %ymm1, %ymm0 {%k1} {z} +; SKX-NEXT: vpord %ymm0, %ymm1, %ymm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <8 x float> %__A to <8 x i32> @@ -919,7 +919,7 @@ ; SKX-LABEL: test_mm_mask_or_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vpord %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -942,7 +942,7 @@ ; SKX-LABEL: 
test_mm_maskz_or_ps: ; SKX: ## %bb.0: ## %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %xmm0, %xmm1, %xmm0 {%k1} {z} +; SKX-NEXT: vpord %xmm0, %xmm1, %xmm0 {%k1} {z} ; SKX-NEXT: retq entry: %0 = bitcast <4 x float> %__A to <4 x i32> @@ -1040,8 +1040,8 @@ ; SKX-LABEL: ternlog_maskz_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %xmm3, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq %m = icmp slt <4 x i32> %mask, zeroinitializer %a = and <4 x i32> %x, @@ -1062,8 +1062,8 @@ ; SKX-LABEL: ternlog_maskz_or_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %ymm2, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq %m = icmp slt <8 x i32> %mask, zeroinitializer %a = and <8 x i32> %x, @@ -1084,8 +1084,8 @@ ; SKX-LABEL: ternlog_maskz_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpxorq %xmm1, %xmm0, %xmm0 {%k1} {z} ; SKX-NEXT: retq %m = icmp slt <2 x i64> %mask, zeroinitializer %a = and <2 x i64> %x, @@ -1106,8 +1106,8 @@ ; SKX-LABEL: ternlog_maskz_xor_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %ymm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpxorq %ymm1, %ymm0, %ymm0 {%k1} {z} ; SKX-NEXT: retq %m = icmp slt <4 x i64> %mask, zeroinitializer %a = and <4 x i64> %x, @@ -1128,8 +1128,8 @@ ; SKX-LABEL: ternlog_maskx_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %xmm3, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm2 -; SKX-NEXT: vorps %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vpord %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <4 x i32> %mask, zeroinitializer %a = and <4 x i32> %x, @@ -1150,8 +1150,8 @@ ; SKX-LABEL: ternlog_maskx_or_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %ymm2, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm2 -; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <8 x i32> %mask, zeroinitializer %a = and <8 x i32> %x, @@ -1172,8 +1172,8 @@ ; SKX-LABEL: ternlog_maskx_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm2 -; SKX-NEXT: vxorpd %xmm1, %xmm2, %xmm0 {%k1} +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2 +; SKX-NEXT: vpxorq %xmm1, %xmm2, %xmm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <2 x i64> %mask, zeroinitializer %a = and <2 x i64> %x, @@ -1194,8 +1194,8 @@ ; SKX-LABEL: ternlog_maskx_xor_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %ymm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm2 -; SKX-NEXT: vxorpd %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vpxorq %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <4 x i64> %mask, zeroinitializer %a = and <4 x i64> %x, @@ -1217,9 +1217,9 @@ ; SKX-LABEL: ternlog_masky_or_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %xmm3, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: vorps %xmm1, %xmm0, %xmm1 {%k1} -; 
SKX-NEXT: vmovaps %xmm1, %xmm0 +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpord %xmm1, %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq %m = icmp slt <4 x i32> %mask, zeroinitializer %a = and <4 x i32> %x, @@ -1240,8 +1240,8 @@ ; SKX-LABEL: ternlog_masky_or_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovd2m %ymm2, %k1 -; SKX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm2 -; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1} +; SKX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm2 +; SKX-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1} ; SKX-NEXT: retq %m = icmp slt <8 x i32> %mask, zeroinitializer %a = and <8 x i32> %x, @@ -1263,9 +1263,9 @@ ; SKX-LABEL: ternlog_masky_xor_and_mask: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm1 {%k1} -; SKX-NEXT: vmovapd %xmm1, %xmm0 +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpxorq %xmm1, %xmm0, %xmm1 {%k1} +; SKX-NEXT: vmovdqa %xmm1, %xmm0 ; SKX-NEXT: retq %m = icmp slt <2 x i64> %mask, zeroinitializer %a = and <2 x i64> %x, @@ -1287,9 +1287,9 @@ ; SKX-LABEL: ternlog_masky_xor_and_mask_ymm: ; SKX: ## %bb.0: ; SKX-NEXT: vpmovq2m %ymm2, %k1 -; SKX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm1 {%k1} -; SKX-NEXT: vmovapd %ymm1, %ymm0 +; SKX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; SKX-NEXT: vpxorq %ymm1, %ymm0, %ymm1 {%k1} +; SKX-NEXT: vmovdqa %ymm1, %ymm0 ; SKX-NEXT: retq %m = icmp slt <4 x i64> %mask, zeroinitializer %a = and <4 x i64> %x, diff --git a/llvm/test/CodeGen/X86/avx512vl-mov.ll b/llvm/test/CodeGen/X86/avx512vl-mov.ll --- a/llvm/test/CodeGen/X86/avx512vl-mov.ll +++ b/llvm/test/CodeGen/X86/avx512vl-mov.ll @@ -4,7 +4,7 @@ define <8 x i32> @test_256_1(i8 * %addr) { ; CHECK-LABEL: test_256_1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i32>* %res = load <8 x i32>, <8 x i32>* %vaddr, align 1 @@ -14,7 +14,7 @@ define <8 x i32> @test_256_2(i8 * %addr) { ; CHECK-LABEL: test_256_2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i32>* %res = load <8 x i32>, <8 x i32>* %vaddr, align 32 @@ -24,7 +24,7 @@ define void @test_256_3(i8 * %addr, <4 x i64> %data) { ; CHECK-LABEL: test_256_3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovdqa %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i64>* store <4 x i64>%data, <4 x i64>* %vaddr, align 32 @@ -34,7 +34,7 @@ define void @test_256_4(i8 * %addr, <8 x i32> %data) { ; CHECK-LABEL: test_256_4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] +; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i32>* store <8 x i32>%data, <8 x i32>* %vaddr, align 1 @@ -44,7 +44,7 @@ define void @test_256_5(i8 * %addr, <8 x i32> %data) { ; CHECK-LABEL: test_256_5: ; 
CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovdqa %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x i32>* store <8 x i32>%data, <8 x i32>* %vaddr, align 32 @@ -54,7 +54,7 @@ define <4 x i64> @test_256_6(i8 * %addr) { ; CHECK-LABEL: test_256_6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i64>* %res = load <4 x i64>, <4 x i64>* %vaddr, align 32 @@ -64,7 +64,7 @@ define void @test_256_7(i8 * %addr, <4 x i64> %data) { ; CHECK-LABEL: test_256_7: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] +; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i64>* store <4 x i64>%data, <4 x i64>* %vaddr, align 1 @@ -74,7 +74,7 @@ define <4 x i64> @test_256_8(i8 * %addr) { ; CHECK-LABEL: test_256_8: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i64>* %res = load <4 x i64>, <4 x i64>* %vaddr, align 1 @@ -84,7 +84,7 @@ define void @test_256_9(i8 * %addr, <4 x double> %data) { ; CHECK-LABEL: test_256_9: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovdqa %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x double>* store <4 x double>%data, <4 x double>* %vaddr, align 32 @@ -94,7 +94,7 @@ define <4 x double> @test_256_10(i8 * %addr) { ; CHECK-LABEL: test_256_10: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x double>* %res = load <4 x double>, <4 x double>* %vaddr, align 32 @@ -104,7 +104,7 @@ define void @test_256_11(i8 * %addr, <8 x float> %data) { ; CHECK-LABEL: test_256_11: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07] +; CHECK-NEXT: vmovdqa %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x float>* store <8 x float>%data, <8 x float>* %vaddr, align 32 @@ -114,7 +114,7 @@ define <8 x float> @test_256_12(i8 * %addr) { ; CHECK-LABEL: test_256_12: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x float>* %res = load <8 x float>, <8 x float>* %vaddr, align 32 @@ -124,7 +124,7 @@ define void @test_256_13(i8 * %addr, <4 x double> %data) { ; CHECK-LABEL: test_256_13: ; CHECK: ## 
%bb.0: -; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] +; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x double>* store <4 x double>%data, <4 x double>* %vaddr, align 1 @@ -134,7 +134,7 @@ define <4 x double> @test_256_14(i8 * %addr) { ; CHECK-LABEL: test_256_14: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x double>* %res = load <4 x double>, <4 x double>* %vaddr, align 1 @@ -144,7 +144,7 @@ define void @test_256_15(i8 * %addr, <8 x float> %data) { ; CHECK-LABEL: test_256_15: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07] +; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x float>* store <8 x float>%data, <8 x float>* %vaddr, align 1 @@ -154,7 +154,7 @@ define <8 x float> @test_256_16(i8 * %addr) { ; CHECK-LABEL: test_256_16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x float>* %res = load <8 x float>, <8 x float>* %vaddr, align 1 @@ -376,7 +376,7 @@ define <4 x i32> @test_128_1(i8 * %addr) { ; CHECK-LABEL: test_128_1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i32>* %res = load <4 x i32>, <4 x i32>* %vaddr, align 1 @@ -386,7 +386,7 @@ define <4 x i32> @test_128_2(i8 * %addr) { ; CHECK-LABEL: test_128_2: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i32>* %res = load <4 x i32>, <4 x i32>* %vaddr, align 16 @@ -396,7 +396,7 @@ define void @test_128_3(i8 * %addr, <2 x i64> %data) { ; CHECK-LABEL: test_128_3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x i64>* store <2 x i64>%data, <2 x i64>* %vaddr, align 16 @@ -406,7 +406,7 @@ define void @test_128_4(i8 * %addr, <4 x i32> %data) { ; CHECK-LABEL: test_128_4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i32>* store <4 x i32>%data, <4 x i32>* %vaddr, align 1 @@ -416,7 +416,7 @@ define void @test_128_5(i8 * %addr, <4 x i32> %data) { ; CHECK-LABEL: test_128_5: ; CHECK: ## %bb.0: -; 
CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x i32>* store <4 x i32>%data, <4 x i32>* %vaddr, align 16 @@ -426,7 +426,7 @@ define <2 x i64> @test_128_6(i8 * %addr) { ; CHECK-LABEL: test_128_6: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x i64>* %res = load <2 x i64>, <2 x i64>* %vaddr, align 16 @@ -436,7 +436,7 @@ define void @test_128_7(i8 * %addr, <2 x i64> %data) { ; CHECK-LABEL: test_128_7: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x i64>* store <2 x i64>%data, <2 x i64>* %vaddr, align 1 @@ -446,7 +446,7 @@ define <2 x i64> @test_128_8(i8 * %addr) { ; CHECK-LABEL: test_128_8: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x i64>* %res = load <2 x i64>, <2 x i64>* %vaddr, align 1 @@ -456,7 +456,7 @@ define void @test_128_9(i8 * %addr, <2 x double> %data) { ; CHECK-LABEL: test_128_9: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x double>* store <2 x double>%data, <2 x double>* %vaddr, align 16 @@ -466,7 +466,7 @@ define <2 x double> @test_128_10(i8 * %addr) { ; CHECK-LABEL: test_128_10: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x double>* %res = load <2 x double>, <2 x double>* %vaddr, align 16 @@ -476,7 +476,7 @@ define void @test_128_11(i8 * %addr, <4 x float> %data) { ; CHECK-LABEL: test_128_11: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; CHECK-NEXT: vmovdqa %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x float>* store <4 x float>%data, <4 x float>* %vaddr, align 16 @@ -486,7 +486,7 @@ define <4 x float> @test_128_12(i8 * %addr) { ; CHECK-LABEL: test_128_12: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x float>* %res = load <4 x float>, <4 x float>* %vaddr, align 16 @@ -496,7 +496,7 @@ define void @test_128_13(i8 * %addr, <2 x double> %data) { ; CHECK-LABEL: test_128_13: ; CHECK: ## %bb.0: -; 
CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x double>* store <2 x double>%data, <2 x double>* %vaddr, align 1 @@ -506,7 +506,7 @@ define <2 x double> @test_128_14(i8 * %addr) { ; CHECK-LABEL: test_128_14: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <2 x double>* %res = load <2 x double>, <2 x double>* %vaddr, align 1 @@ -516,7 +516,7 @@ define void @test_128_15(i8 * %addr, <4 x float> %data) { ; CHECK-LABEL: test_128_15: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x float>* store <4 x float>%data, <4 x float>* %vaddr, align 1 @@ -526,7 +526,7 @@ define <4 x float> @test_128_16(i8 * %addr) { ; CHECK-LABEL: test_128_16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <4 x float>* %res = load <4 x float>, <4 x float>* %vaddr, align 1 diff --git a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vbroadcast.ll @@ -9,8 +9,8 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq func_f32 -; CHECK-NEXT: vbroadcastss (%rsp), %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: callq func_f32@PLT +; CHECK-NEXT: vpbroadcastd (%rsp), %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -28,8 +28,8 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq func_f32 -; CHECK-NEXT: vbroadcastss (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: callq func_f32@PLT +; CHECK-NEXT: vpbroadcastd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -48,8 +48,8 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq func_f64 -; CHECK-NEXT: vbroadcastsd (%rsp), %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: callq func_f64@PLT +; CHECK-NEXT: vpbroadcastq (%rsp), %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -63,7 +63,7 @@ define <8 x float> @_inreg8xfloat(float %a) { ; CHECK-LABEL: _inreg8xfloat: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: retq %b = insertelement <8 x float> undef, float %a, i32 0 %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer @@ -99,7 +99,7 @@ define <4 x float> 
@_inreg4xfloat(float %a) { ; CHECK-LABEL: _inreg4xfloat: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: retq %b = insertelement <4 x float> undef, float %a, i32 0 %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer @@ -135,7 +135,7 @@ define <4 x double> @_inreg4xdouble(double %a) { ; CHECK-LABEL: _inreg4xdouble: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq %b = insertelement <4 x double> undef, double %a, i32 0 %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll @@ -86,7 +86,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovaps (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x02] +; X86-NEXT: vmovdqa (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x02] ; X86-NEXT: vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] @@ -98,7 +98,7 @@ ; ; X64-LABEL: test_mm256_2intersect_epi32_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; X64-NEXT: vmovdqa (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] ; X64-NEXT: vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0] @@ -131,7 +131,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x06] +; X86-NEXT: vmovdqa (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x06] ; X86-NEXT: vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -148,7 +148,7 @@ ; ; X64-LABEL: test_mm256_2intersect_epi64_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07] +; X64-NEXT: vmovdqa (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07] ; X64-NEXT: vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06] ; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -181,7 +181,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vbroadcastss (%edx), %ymm0 # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x7d,0x18,0x02] +; X86-NEXT: vpbroadcastd (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x02] ; X86-NEXT: vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] @@ -193,7 +193,7 @@ ; ; X64-LABEL: test_mm256_2intersect_epi32_b: ; X64: # %bb.0: # %entry -; X64-NEXT: vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07] +; X64-NEXT: vpbroadcastd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x07] ; X64-NEXT: vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0] @@ -228,7 +228,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06] +; X86-NEXT: vpbroadcastq (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x06] ; X86-NEXT: vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -245,7 +245,7 @@ ; ; X64-LABEL: test_mm256_2intersect_epi64_b: ; X64: # %bb.0: # %entry -; X64-NEXT: vbroadcastsd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x07] +; X64-NEXT: vpbroadcastq (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x07] ; X64-NEXT: vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06] ; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -370,7 +370,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06] +; X86-NEXT: vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06] ; X86-NEXT: vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -386,7 +386,7 @@ ; ; X64-LABEL: test_mm_2intersect_epi32_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-NEXT: vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; X64-NEXT: vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06] ; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -424,7 +424,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: 
[0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06] +; X86-NEXT: vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06] ; X86-NEXT: vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02] ; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] @@ -440,7 +440,7 @@ ; ; X64-LABEL: test_mm_2intersect_epi64_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-NEXT: vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; X64-NEXT: vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06] ; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] @@ -476,7 +476,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vbroadcastss (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x06] +; X86-NEXT: vpbroadcastd (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x06] ; X86-NEXT: vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -492,7 +492,7 @@ ; ; X64-LABEL: test_mm_2intersect_epi32_b: ; X64: # %bb.0: # %entry -; X64-NEXT: vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07] +; X64-NEXT: vpbroadcastd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x07] ; X64-NEXT: vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06] ; X64-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X64-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] @@ -532,8 +532,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06] -; X86-NEXT: # xmm0 = mem[0,0] +; X86-NEXT: vpbroadcastq (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x06] ; X86-NEXT: vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02] ; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] @@ -549,8 +548,7 @@ ; ; X64-LABEL: test_mm_2intersect_epi64_b: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vpbroadcastq (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x07] ; X64-NEXT: vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06] ; X64-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X64-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] diff --git 
a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll @@ -76,7 +76,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vmovaps (%esi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x06] +; X86-NEXT: vmovdqa64 (%esi), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x06] ; X86-NEXT: vp2intersectd (%edx), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x02] ; X86-NEXT: kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01] ; X86-NEXT: kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08] @@ -87,7 +87,7 @@ ; ; X64-LABEL: test_mm512_2intersect_epi32_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; X64-NEXT: vmovdqa64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07] ; X64-NEXT: vp2intersectd (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x06] ; X64-NEXT: kmovw %k0, (%rdx) # encoding: [0xc5,0xf8,0x91,0x02] ; X64-NEXT: kmovw %k1, (%rcx) # encoding: [0xc5,0xf8,0x91,0x09] @@ -114,7 +114,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovaps (%edx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x02] +; X86-NEXT: vmovdqa64 (%edx), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x02] ; X86-NEXT: vp2intersectq (%ecx), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] @@ -126,7 +126,7 @@ ; ; X64-LABEL: test_mm512_2intersect_epi64_p: ; X64: # %bb.0: # %entry -; X64-NEXT: vmovaps (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] +; X64-NEXT: vmovdqa64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07] ; X64-NEXT: vp2intersectq (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0] @@ -158,7 +158,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-NEXT: vbroadcastss (%esi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x18,0x06] +; X86-NEXT: vpbroadcastd (%esi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x58,0x06] ; X86-NEXT: vp2intersectd (%edx){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x02] ; X86-NEXT: kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01] ; X86-NEXT: kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08] @@ -169,7 +169,7 @@ ; ; X64-LABEL: test_mm512_2intersect_epi32_b: ; X64: # %bb.0: # %entry -; X64-NEXT: vbroadcastss (%rdi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x18,0x07] +; X64-NEXT: vpbroadcastd (%rdi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x58,0x07] ; X64-NEXT: vp2intersectd (%rsi){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x06] ; X64-NEXT: kmovw %k0, (%rdx) # encoding: [0xc5,0xf8,0x91,0x02] ; X64-NEXT: kmovw %k1, (%rcx) # encoding: [0xc5,0xf8,0x91,0x09] @@ -198,7 +198,7 @@ ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vbroadcastsd (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x02] +; X86-NEXT: vpbroadcastq (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x02] ; X86-NEXT: vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] @@ -210,7 +210,7 @@ ; ; X64-LABEL: test_mm512_2intersect_epi64_b: ; X64: # %bb.0: # %entry -; X64-NEXT: vbroadcastsd (%rdi), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x07] +; X64-NEXT: vpbroadcastq (%rdi), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x07] ; X64-NEXT: vp2intersectq (%rsi){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x06] ; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1] ; X64-NEXT: kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0] diff --git a/llvm/test/CodeGen/X86/bool-ext-inc.ll b/llvm/test/CodeGen/X86/bool-ext-inc.ll --- a/llvm/test/CodeGen/X86/bool-ext-inc.ll +++ b/llvm/test/CodeGen/X86/bool-ext-inc.ll @@ -19,8 +19,8 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll --- a/llvm/test/CodeGen/X86/break-false-dep.ll +++ b/llvm/test/CodeGen/X86/break-false-dep.ll @@ -340,50 +340,95 @@ ; SSE-WIN-NEXT: addq $184, %rsp ; SSE-WIN-NEXT: retq ; -; AVX-LABEL: loopdep2: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $184, %rsp -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: movq (%rcx), %rax -; AVX-NEXT: movl $1, %r8d -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB7_1: # %loop -; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0 -; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; AVX-NEXT: # xmm0 = mem[0],zero -; AVX-NEXT: vaddsd (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vcvttsd2si %xmm0, %rcx -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: incq %r8 -; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %ret -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: addq $184, %rsp -; AVX-NEXT: retq +; AVX1-LABEL: loopdep2: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $184, %rsp +; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: movq (%rcx), %rax +; AVX1-NEXT: movl $1, %r8d +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB7_1: # %loop +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0 +; AVX1-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; AVX1-NEXT: # xmm0 = mem[0],zero +; AVX1-NEXT: vaddsd (%rdx), %xmm0, %xmm0 +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: incq %r8 +; AVX1-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %ret +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: addq $184, %rsp +; AVX1-NEXT: retq +; +; AVX512VL-LABEL: loopdep2: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: subq $184, %rsp +; AVX512VL-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX512VL-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: movq (%rcx), %rax +; AVX512VL-NEXT: movl $1, %r8d +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB7_1: # %loop +; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; AVX512VL-NEXT: # xmm0 = mem[0],zero +; AVX512VL-NEXT: vaddsd (%rdx), %xmm0, %xmm0 +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rcx +; AVX512VL-NEXT: addq %rcx, %rax +; AVX512VL-NEXT: incq %r8 +; AVX512VL-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX512VL-NEXT: jne .LBB7_1 +; AVX512VL-NEXT: # %bb.2: # %ret +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: addq $184, %rsp +; AVX512VL-NEXT: retq entry: %vx = load i64, i64* %x br label %loop @@ -521,81 +566,157 @@ ; SSE-WIN-NEXT: retq ; SSE-WIN-NEXT: .seh_endproc ; -; AVX-LABEL: loopdep3: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rsi -; AVX-NEXT: .seh_pushreg %rsi -; AVX-NEXT: subq $160, %rsp -; AVX-NEXT: .seh_stackalloc 160 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm15, 144 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm14, 128 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm13, 112 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm12, 96 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm11, 80 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm10, 64 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm9, 48 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm8, 32 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm7, 16 -; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm6, 0 -; AVX-NEXT: .seh_endprologue -; AVX-NEXT: xorl %r9d, %r9d -; AVX-NEXT: leaq {{.*}}(%rip), %r8 -; AVX-NEXT: leaq {{.*}}(%rip), %r10 -; AVX-NEXT: leaq {{.*}}(%rip), %r11 -; AVX-NEXT: leaq {{.*}}(%rip), %rax -; AVX-NEXT: leaq {{.*}}(%rip), %rdx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB8_1: # %for.cond1.preheader -; AVX-NEXT: # =>This Loop Header: Depth=1 -; AVX-NEXT: # Child Loop BB8_2 Depth 2 -; AVX-NEXT: movq %r8, %rcx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: 
.p2align 4, 0x90 -; AVX-NEXT: .LBB8_2: # %for.body3 -; AVX-NEXT: # Parent Loop BB8_1 Depth=1 -; AVX-NEXT: # => This Inner Loop Header: Depth=2 -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0 -; AVX-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0 -; AVX-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0 -; AVX-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0 -; AVX-NEXT: vmovsd %xmm0, (%rsi,%rdx) -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: addq $8, %rsi -; AVX-NEXT: addq $4, %rcx -; AVX-NEXT: cmpq $8192, %rsi # imm = 0x2000 -; AVX-NEXT: jne .LBB8_2 -; AVX-NEXT: # %bb.3: # %for.inc14 -; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1 -; AVX-NEXT: incl %r9d -; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0 -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.4: # %for.end16 -; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: addq $160, %rsp -; AVX-NEXT: popq %rsi -; AVX-NEXT: retq -; AVX-NEXT: .seh_endproc +; AVX1-LABEL: loopdep3: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rsi +; AVX1-NEXT: .seh_pushreg %rsi +; AVX1-NEXT: subq $160, %rsp +; AVX1-NEXT: .seh_stackalloc 160 +; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm15, 144 +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm14, 128 +; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm13, 112 +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm12, 96 +; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm11, 80 +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm10, 64 +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm9, 48 +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm8, 32 +; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm7, 16 +; AVX1-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm6, 0 +; AVX1-NEXT: .seh_endprologue +; AVX1-NEXT: xorl %r9d, %r9d +; AVX1-NEXT: leaq {{.*}}(%rip), %r8 +; AVX1-NEXT: leaq {{.*}}(%rip), %r10 +; AVX1-NEXT: leaq {{.*}}(%rip), %r11 +; AVX1-NEXT: leaq {{.*}}(%rip), %rax +; AVX1-NEXT: leaq {{.*}}(%rip), %rdx +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB8_1: # %for.cond1.preheader +; AVX1-NEXT: # =>This Loop Header: Depth=1 +; AVX1-NEXT: # Child Loop BB8_2 Depth 2 +; AVX1-NEXT: movq %r8, %rcx +; AVX1-NEXT: xorl %esi, %esi +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB8_2: # %for.body3 +; AVX1-NEXT: # Parent Loop BB8_1 Depth=1 +; AVX1-NEXT: # => This Inner Loop Header: Depth=2 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0 +; AVX1-NEXT: vmulsd (%rsi,%r10), 
%xmm0, %xmm0 +; AVX1-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0 +; AVX1-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0 +; AVX1-NEXT: vmovsd %xmm0, (%rsi,%rdx) +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: addq $8, %rsi +; AVX1-NEXT: addq $4, %rcx +; AVX1-NEXT: cmpq $8192, %rsi # imm = 0x2000 +; AVX1-NEXT: jne .LBB8_2 +; AVX1-NEXT: # %bb.3: # %for.inc14 +; AVX1-NEXT: # in Loop: Header=BB8_1 Depth=1 +; AVX1-NEXT: incl %r9d +; AVX1-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.4: # %for.end16 +; AVX1-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: addq $160, %rsp +; AVX1-NEXT: popq %rsi +; AVX1-NEXT: retq +; AVX1-NEXT: .seh_endproc +; +; AVX512VL-LABEL: loopdep3: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: pushq %rsi +; AVX512VL-NEXT: .seh_pushreg %rsi +; AVX512VL-NEXT: subq $160, %rsp +; AVX512VL-NEXT: .seh_stackalloc 160 +; AVX512VL-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm15, 144 +; AVX512VL-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm14, 128 +; AVX512VL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm13, 112 +; AVX512VL-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm12, 96 +; AVX512VL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm11, 80 +; AVX512VL-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm10, 64 +; AVX512VL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm9, 48 +; AVX512VL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm8, 32 +; AVX512VL-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm7, 16 +; AVX512VL-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm6, 0 +; AVX512VL-NEXT: .seh_endprologue +; AVX512VL-NEXT: xorl %r9d, %r9d +; AVX512VL-NEXT: leaq {{.*}}(%rip), %r8 +; AVX512VL-NEXT: leaq {{.*}}(%rip), %r10 +; AVX512VL-NEXT: leaq {{.*}}(%rip), %r11 +; AVX512VL-NEXT: leaq {{.*}}(%rip), %rax +; AVX512VL-NEXT: leaq {{.*}}(%rip), %rdx +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB8_1: # %for.cond1.preheader +; AVX512VL-NEXT: # =>This Loop Header: Depth=1 +; AVX512VL-NEXT: # Child Loop BB8_2 Depth 2 +; AVX512VL-NEXT: movq %r8, %rcx +; AVX512VL-NEXT: xorl %esi, %esi +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB8_2: # %for.body3 +; AVX512VL-NEXT: # Parent Loop BB8_1 Depth=1 +; AVX512VL-NEXT: # => This Inner Loop Header: Depth=2 +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0 +; AVX512VL-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0 +; AVX512VL-NEXT: vmulsd 
(%rsi,%r11), %xmm0, %xmm0 +; AVX512VL-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovsd %xmm0, (%rsi,%rdx) +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: addq $8, %rsi +; AVX512VL-NEXT: addq $4, %rcx +; AVX512VL-NEXT: cmpq $8192, %rsi # imm = 0x2000 +; AVX512VL-NEXT: jne .LBB8_2 +; AVX512VL-NEXT: # %bb.3: # %for.inc14 +; AVX512VL-NEXT: # in Loop: Header=BB8_1 Depth=1 +; AVX512VL-NEXT: incl %r9d +; AVX512VL-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; AVX512VL-NEXT: jne .LBB8_1 +; AVX512VL-NEXT: # %bb.4: # %for.end16 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: addq $160, %rsp +; AVX512VL-NEXT: popq %rsi +; AVX512VL-NEXT: retq +; AVX512VL-NEXT: .seh_endproc entry: br label %for.cond1.preheader @@ -714,62 +835,119 @@ ; SSE-WIN-NEXT: retq ; SSE-WIN-NEXT: .seh_endproc ; -; AVX-LABEL: inlineasmdep: -; AVX: # %bb.0: # %top -; AVX-NEXT: subq $168, %rsp -; AVX-NEXT: .seh_stackalloc 168 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm15, 144 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm14, 128 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm13, 112 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm12, 96 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm11, 80 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm10, 64 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm9, 48 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm8, 32 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm7, 16 -; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm6, 0 -; AVX-NEXT: .seh_endprologue -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 -; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: addq $168, %rsp -; AVX-NEXT: retq -; AVX-NEXT: .seh_endproc +; AVX1-LABEL: inlineasmdep: +; AVX1: # %bb.0: # %top +; AVX1-NEXT: subq $168, %rsp +; AVX1-NEXT: .seh_stackalloc 168 +; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm15, 144 +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm14, 128 +; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm13, 112 +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm12, 96 +; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm11, 80 +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm10, 64 +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm9, 48 +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm8, 32 +; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm7, 16 +; AVX1-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm6, 0 +; AVX1-NEXT: .seh_endprologue +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: addq $168, %rsp +; AVX1-NEXT: retq +; AVX1-NEXT: .seh_endproc +; +; AVX512VL-LABEL: inlineasmdep: +; AVX512VL: # %bb.0: # %top +; AVX512VL-NEXT: subq $168, %rsp +; AVX512VL-NEXT: .seh_stackalloc 168 +; AVX512VL-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm15, 144 +; AVX512VL-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm14, 128 +; AVX512VL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm13, 112 +; AVX512VL-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm12, 96 +; AVX512VL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm11, 80 +; AVX512VL-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: 
.seh_savexmm %xmm10, 64 +; AVX512VL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm9, 48 +; AVX512VL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm8, 32 +; AVX512VL-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm7, 16 +; AVX512VL-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm6, 0 +; AVX512VL-NEXT: .seh_endprologue +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: addq $168, %rsp +; AVX512VL-NEXT: retq +; AVX512VL-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() @@ -873,66 +1051,127 @@ ; SSE-WIN-NEXT: retq ; SSE-WIN-NEXT: .seh_endproc ; -; AVX-LABEL: truedeps: -; AVX: # %bb.0: # %top -; AVX-NEXT: subq $184, %rsp -; AVX-NEXT: .seh_stackalloc 184 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm15, 160 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm14, 144 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm13, 128 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm12, 112 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm11, 96 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm10, 80 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm9, 64 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm8, 48 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm7, 32 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm6, 16 -; AVX-NEXT: .seh_endprologue -; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP 
-; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: addq $184, %rsp -; AVX-NEXT: retq -; AVX-NEXT: .seh_endproc +; AVX1-LABEL: truedeps: +; AVX1: # %bb.0: # %top +; AVX1-NEXT: subq $184, %rsp +; AVX1-NEXT: .seh_stackalloc 184 +; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm15, 160 +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm14, 144 +; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm13, 128 +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm12, 112 +; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm11, 96 +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm10, 80 +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm9, 64 +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm8, 48 +; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm7, 32 +; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm6, 16 +; AVX1-NEXT: .seh_endprologue +; AVX1-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte 
Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: addq $184, %rsp +; AVX1-NEXT: retq +; AVX1-NEXT: .seh_endproc +; +; AVX512VL-LABEL: truedeps: +; AVX512VL: # %bb.0: # %top +; AVX512VL-NEXT: subq $184, %rsp +; AVX512VL-NEXT: .seh_stackalloc 184 +; AVX512VL-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm15, 160 +; AVX512VL-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm14, 144 +; AVX512VL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm13, 128 +; AVX512VL-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm12, 112 +; AVX512VL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm11, 96 +; AVX512VL-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm10, 80 +; AVX512VL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm9, 64 +; AVX512VL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm8, 48 +; AVX512VL-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm7, 32 +; AVX512VL-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm6, 16 +; AVX512VL-NEXT: .seh_endprologue +; AVX512VL-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512VL-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: addq $184, %rsp +; AVX512VL-NEXT: retq +; AVX512VL-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() @@ -1033,64 +1272,123 @@ ; SSE-WIN-NEXT: retq ; SSE-WIN-NEXT: .seh_endproc ; -; AVX-LABEL: clearence: -; AVX: # %bb.0: # %top -; AVX-NEXT: subq $168, %rsp -; AVX-NEXT: .seh_stackalloc 168 -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm15, 144 -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm14, 128 -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm13, 112 -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm12, 96 -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm11, 80 -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm10, 64 -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm9, 48 -; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm8, 32 -; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm7, 16 -; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX-NEXT: .seh_savexmm %xmm6, 0 -; AVX-NEXT: .seh_endprologue -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: vxorps %xmm6, %xmm6, %xmm6 -; AVX-NEXT: vcvtsi2sd %rcx, %xmm6, %xmm0 -; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: addq $168, %rsp -; AVX-NEXT: retq -; AVX-NEXT: .seh_endproc +; AVX1-LABEL: clearence: +; AVX1: # %bb.0: # %top +; AVX1-NEXT: subq $168, %rsp +; AVX1-NEXT: .seh_stackalloc 168 +; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm15, 144 +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm14, 128 +; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm13, 112 +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm12, 96 +; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm11, 80 +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm10, 64 +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm9, 48 +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm8, 32 +; AVX1-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm7, 16 +; AVX1-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm6, 0 +; AVX1-NEXT: .seh_endprologue +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; 
AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vcvtsi2sd %rcx, %xmm6, %xmm0 +; AVX1-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: addq $168, %rsp +; AVX1-NEXT: retq +; AVX1-NEXT: .seh_endproc +; +; AVX512VL-LABEL: clearence: +; AVX512VL: # %bb.0: # %top +; AVX512VL-NEXT: subq $168, %rsp +; AVX512VL-NEXT: .seh_stackalloc 168 +; AVX512VL-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm15, 144 +; AVX512VL-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm14, 128 +; AVX512VL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm13, 112 +; AVX512VL-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm12, 96 +; AVX512VL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm11, 80 +; AVX512VL-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm10, 64 +; AVX512VL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm9, 48 +; AVX512VL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm8, 32 +; AVX512VL-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm7, 16 +; AVX512VL-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm6, 0 +; AVX512VL-NEXT: .seh_endprologue +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: vxorps %xmm6, %xmm6, %xmm6 +; AVX512VL-NEXT: vcvtsi2sd %rcx, %xmm6, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: addq $168, %rsp +; AVX512VL-NEXT: retq +; AVX512VL-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() @@ -1192,54 +1490,103 @@ ; SSE-WIN-NEXT: addq $136, %rsp ; SSE-WIN-NEXT: retq ; -; AVX-LABEL: loopclearence: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $136, %rsp -; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill -; AVX-NEXT: movq (%rcx), %rax -; AVX-NEXT: movl $1, %r8d -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB12_1: # %loop -; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4 -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: #APP -; AVX-NEXT: #NO_APP -; AVX-NEXT: vaddsd (%rdx), %xmm4, %xmm0 -; AVX-NEXT: vcvttsd2si %xmm0, %rcx -; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: incq %r8 -; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 -; AVX-NEXT: jne .LBB12_1 -; AVX-NEXT: # %bb.2: # %ret -; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: addq $136, %rsp -; AVX-NEXT: retq +; AVX1-LABEL: loopclearence: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $136, %rsp +; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX1-NEXT: movq (%rcx), %rax +; AVX1-NEXT: movl $1, %r8d +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB12_1: # %loop +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4 +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: vaddsd (%rdx), %xmm4, %xmm0 +; AVX1-NEXT: vcvttsd2si %xmm0, %rcx +; 
AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: incq %r8 +; AVX1-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX1-NEXT: jne .LBB12_1 +; AVX1-NEXT: # %bb.2: # %ret +; AVX1-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: addq $136, %rsp +; AVX1-NEXT: retq +; +; AVX512VL-LABEL: loopclearence: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: subq $136, %rsp +; AVX512VL-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: movq (%rcx), %rax +; AVX512VL-NEXT: movl $1, %r8d +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB12_1: # %loop +; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512VL-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4 +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: vaddsd (%rdx), %xmm4, %xmm0 +; AVX512VL-NEXT: vcvttsd2si %xmm0, %rcx +; AVX512VL-NEXT: addq %rcx, %rax +; AVX512VL-NEXT: incq %r8 +; AVX512VL-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX512VL-NEXT: jne .LBB12_1 +; AVX512VL-NEXT: # %bb.2: # %ret +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: addq $136, %rsp +; AVX512VL-NEXT: retq entry: %vx = load i64, i64* %x br label %loop @@ -1487,23 +1834,23 @@ ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: subq $152, %rsp ; AVX512VL-NEXT: .seh_stackalloc 152 -; AVX512VL-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm15, 128 -; AVX512VL-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm14, 112 -; AVX512VL-NEXT: vmovaps %xmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm13, 96 -; AVX512VL-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm12, 80 -; AVX512VL-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm11, 64 -; AVX512VL-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm10, 48 -; AVX512VL-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm9, 32 -; AVX512VL-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm8, 16 -; AVX512VL-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: .seh_savexmm %xmm7, 0 ; AVX512VL-NEXT: .seh_endprologue ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -1551,15 +1898,15 @@ ; AVX512VL-NEXT: cmpq %r10, %r8 ; AVX512VL-NEXT: jge .LBB13_1 ; AVX512VL-NEXT: # %bb.3: # %loopdone -; AVX512VL-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX512VL-NEXT: addq $152, %rsp ; AVX512VL-NEXT: retq ; AVX512VL-NEXT: .seh_endproc diff --git a/llvm/test/CodeGen/X86/bswap-vector.ll b/llvm/test/CodeGen/X86/bswap-vector.ll --- a/llvm/test/CodeGen/X86/bswap-vector.ll +++ b/llvm/test/CodeGen/X86/bswap-vector.ll @@ -344,7 +344,7 @@ ; ; CHECK-AVX-LABEL: fold_v8i16: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] +; CHECK-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] ; CHECK-AVX-NEXT: retq entry: %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> ) @@ -359,7 +359,7 @@ ; ; CHECK-AVX-LABEL: fold_v4i32: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] +; 
CHECK-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] ; CHECK-AVX-NEXT: retq entry: %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> ) @@ -379,7 +379,7 @@ ; ; CHECK-AVX-LABEL: fold_v2i64: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] +; CHECK-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] ; CHECK-AVX-NEXT: retq entry: %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> ) @@ -395,7 +395,7 @@ ; ; CHECK-AVX-LABEL: fold_v16i16: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] +; CHECK-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] ; CHECK-AVX-NEXT: retq entry: %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> ) @@ -411,7 +411,7 @@ ; ; CHECK-AVX-LABEL: fold_v8i32: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] +; CHECK-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] ; CHECK-AVX-NEXT: retq entry: %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> ) @@ -433,7 +433,7 @@ ; ; CHECK-AVX-LABEL: fold_v4i64: ; CHECK-AVX: # %bb.0: # %entry -; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] +; CHECK-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] ; CHECK-AVX-NEXT: retq entry: %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> ) diff --git a/llvm/test/CodeGen/X86/bug37521.ll b/llvm/test/CodeGen/X86/bug37521.ll --- a/llvm/test/CodeGen/X86/bug37521.ll +++ b/llvm/test/CodeGen/X86/bug37521.ll @@ -15,9 +15,9 @@ ; CHECK-NEXT: movq a+{{.*}}(%rip), %rdx ; CHECK-NEXT: movq a+{{.*}}(%rip), %rsi ; CHECK-NEXT: movq {{.*}}(%rip), %rdi -; CHECK-NEXT: vmovaps a+{{.*}}(%rip), %xmm0 -; CHECK-NEXT: vmovups %xmm0, (%rsp) -; CHECK-NEXT: callq goo +; CHECK-NEXT: vmovdqa a+{{.*}}(%rip), %xmm0 +; CHECK-NEXT: vmovdqu %xmm0, (%rsp) +; CHECK-NEXT: callq goo@PLT ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: retq %k = bitcast <16 x float>* @a to <2 x i256>* diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll --- a/llvm/test/CodeGen/X86/build-vector-128.ll +++ b/llvm/test/CodeGen/X86/build-vector-128.ll @@ -19,15 +19,25 @@ ; SSE-64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; -; AVX-32-LABEL: test_buildvector_v2f64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v2f64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_v2f64: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-64-NEXT: retq +; AVX1-64-LABEL: test_buildvector_v2f64: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v2f64: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_v2f64: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-64-NEXT: retq %ins0 = insertelement <2 x double> undef, double %a0, i32 0 %ins1 = insertelement <2 x 
double> %ins0, double %a1, i32 1 ret <2 x double> %ins1 @@ -53,10 +63,10 @@ ; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; SSE41-64-NEXT: retq ; -; AVX-32-LABEL: test_buildvector_v4f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v4f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v4f32: ; AVX-64: # %bb.0: @@ -64,6 +74,11 @@ ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v4f32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; AVX2-32-NEXT: retl %ins0 = insertelement <4 x float> undef, float %a0, i32 0 %ins1 = insertelement <4 x float> %ins0, float %a1, i32 1 %ins2 = insertelement <4 x float> %ins1, float %a2, i32 2 @@ -84,10 +99,10 @@ ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; -; AVX-32-LABEL: test_buildvector_v2i64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v2i64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v2i64: ; AVX-64: # %bb.0: @@ -95,6 +110,11 @@ ; AVX-64-NEXT: vmovq %rdi, %xmm1 ; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v2i64: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; AVX2-32-NEXT: retl %ins0 = insertelement <2 x i64> undef, i64 %a0, i32 0 %ins1 = insertelement <2 x i64> %ins0, i64 %a1, i32 1 ret <2 x i64> %ins1 @@ -125,10 +145,10 @@ ; SSE41-64-NEXT: pinsrd $3, %ecx, %xmm0 ; SSE41-64-NEXT: retq ; -; AVX-32-LABEL: test_buildvector_v4i32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v4i32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v4i32: ; AVX-64: # %bb.0: @@ -137,6 +157,11 @@ ; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ; AVX-64-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 ; AVX-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v4i32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; AVX2-32-NEXT: retl %ins0 = insertelement <4 x i32> undef, i32 %f0, i32 0 %ins1 = insertelement <4 x i32> %ins0, i32 %f1, i32 1 %ins2 = insertelement <4 x i32> %ins1, i32 %f2, i32 2 @@ -535,16 +560,21 @@ ; SSE41-64-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-64-NEXT: retq ; -; AVX-32-LABEL: PR37502: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-32-NEXT: retl +; AVX1-32-LABEL: PR37502: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-32-NEXT: retl ; ; AVX-64-LABEL: PR37502: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-64-NEXT: retq +; +; AVX2-32-LABEL: PR37502: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 +; AVX2-32-NEXT: retl %i0 = insertelement <4 x float> undef, float %x, i32 0 %i1 = insertelement <4 x float> %i0, float %y, i32 1 %i2 = insertelement <4 x float> %i1, float %x, i32 2 diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll --- 
a/llvm/test/CodeGen/X86/build-vector-256.ll +++ b/llvm/test/CodeGen/X86/build-vector-256.ll @@ -5,17 +5,29 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX-64 --check-prefix=AVX2-64 define <4 x double> @test_buildvector_v4f64(double %a0, double %a1, double %a2, double %a3) { -; AVX-32-LABEL: test_buildvector_v4f64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v4f64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 +; AVX1-32-NEXT: retl ; -; AVX-64-LABEL: test_buildvector_v4f64: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: retq +; AVX1-64-LABEL: test_buildvector_v4f64: +; AVX1-64: # %bb.0: +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v4f64: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %ymm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_buildvector_v4f64: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-64-NEXT: retq %ins0 = insertelement <4 x double> undef, double %a0, i32 0 %ins1 = insertelement <4 x double> %ins0, double %a1, i32 1 %ins2 = insertelement <4 x double> %ins1, double %a2, i32 2 @@ -24,10 +36,10 @@ } define <8 x float> @test_buildvector_v8f32(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) { -; AVX-32-LABEL: test_buildvector_v8f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v8f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 +; AVX1-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v8f32: ; AVX-64: # %bb.0: @@ -39,6 +51,11 @@ ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; AVX-64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX-64-NEXT: retq +; +; AVX2-32-LABEL: test_buildvector_v8f32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %ymm0 +; AVX2-32-NEXT: retl %ins0 = insertelement <8 x float> undef, float %a0, i32 0 %ins1 = insertelement <8 x float> %ins0, float %a1, i32 1 %ins2 = insertelement <8 x float> %ins1, float %a2, i32 2 @@ -51,10 +68,10 @@ } define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) { -; AVX-32-LABEL: test_buildvector_v4i64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v4i64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: test_buildvector_v4i64: ; AVX1-64: # %bb.0: @@ -67,6 +84,11 @@ ; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: test_buildvector_v4i64: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %ymm0 +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: test_buildvector_v4i64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vmovq %rcx, %xmm0 @@ -85,10 +107,10 @@ } define <8 x i32> @test_buildvector_v8i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) { -; AVX-32-LABEL: test_buildvector_v8i32: -; 
AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: test_buildvector_v8i32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: test_buildvector_v8i32: ; AVX1-64: # %bb.0: @@ -103,6 +125,11 @@ ; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: test_buildvector_v8i32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vmovdqu {{[0-9]+}}(%esp), %ymm0 +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: test_buildvector_v8i32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vmovd %edi, %xmm0 diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll --- a/llvm/test/CodeGen/X86/build-vector-512.ll +++ b/llvm/test/CodeGen/X86/build-vector-512.ll @@ -7,18 +7,18 @@ define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7) { ; AVX-32-LABEL: test_buildvector_v8f64: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0 +; AVX-32-NEXT: vmovdqu64 {{[0-9]+}}(%esp), %zmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v8f64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX-64-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX-64-NEXT: retq %ins0 = insertelement <8 x double> undef, double %a0, i32 0 %ins1 = insertelement <8 x double> %ins0, double %a1, i32 1 @@ -34,7 +34,7 @@ define <16 x float> @test_buildvector_v16f32(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8, float %a9, float %a10, float %a11, float %a12, float %a13, float %a14, float %a15) { ; AVX-32-LABEL: test_buildvector_v16f32: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0 +; AVX-32-NEXT: vmovdqu64 {{[0-9]+}}(%esp), %zmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v16f32: @@ -79,7 +79,7 @@ define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) { ; AVX-32-LABEL: test_buildvector_v8i64: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0 +; AVX-32-NEXT: vmovdqu64 {{[0-9]+}}(%esp), %zmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v8i64: @@ -111,7 +111,7 @@ define <16 x i32> @test_buildvector_v16i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15) { ; AVX-32-LABEL: test_buildvector_v16i32: ; AVX-32: # %bb.0: -; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0 +; AVX-32-NEXT: vmovdqu64 {{[0-9]+}}(%esp), %zmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v16i32: diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ 
b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -90,10 +90,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test_negative_zero_2: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test_negative_zero_2: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_negative_zero_2: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: retq entry: %0 = extractelement <2 x double> %A, i32 0 %1 = insertelement <2 x double> undef, double %0, i32 0 @@ -810,15 +815,25 @@ ; SSE41-NEXT: movl %edx, %eax ; SSE41-NEXT: retq ; -; AVX-LABEL: PR46586: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vextractps $3, %xmm0, %ecx -; AVX-NEXT: vpextrd $3, %xmm1, %eax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divl %ecx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: retq +; AVX1-LABEL: PR46586: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vextractps $3, %xmm0, %ecx +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: divl %ecx +; AVX1-NEXT: movl %edx, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR46586: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: divl %ecx +; AVX2-NEXT: movl %edx, %eax +; AVX2-NEXT: retq %p0 = getelementptr inbounds i8, i8* %p, i64 0 %p3 = getelementptr inbounds i8, i8* %p, i64 3 %t25 = load i8, i8* %p0 diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll --- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -20,11 +20,17 @@ ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE42-NEXT: retq ; -; AVX-LABEL: _clearupper2xi64a: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper2xi64a: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper2xi64a: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %x0 = extractelement <2 x i64> %0, i32 0 %x1 = extractelement <2 x i64> %0, i32 1 %trunc0 = trunc i64 %x0 to i32 @@ -51,11 +57,17 @@ ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; SSE42-NEXT: retq ; -; AVX-LABEL: _clearupper4xi64a: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper4xi64a: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper4xi64a: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq %x0 = extractelement <4 x i64> %0, i32 0 %x1 = extractelement <4 x i64> %0, i32 1 %x2 = extractelement <4 x i64> %0, i32 2 @@ -177,10 +189,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper8xi16a: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper8xi16a: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper8xi16a: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq %x0 = extractelement <8 x i16> %0, i32 0 %x1 = extractelement <8 x i16> %0, i32 1 %x2 = extractelement <8 x i16> %0, i32 2 @@ -224,10 +241,15 @@ ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper16xi16a: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper16xi16a: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper16xi16a: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq %x0 = extractelement <16 x i16> %0, i32 0 %x1 = extractelement <16 x i16> %0, i32 1 %x2 = extractelement <16 x i16> %0, i32 2 @@ -301,10 +323,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper16xi8a: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper16xi8a: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper16xi8a: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq %x0 = extractelement <16 x i8> %0, i32 0 %x1 = extractelement <16 x i8> %0, i32 1 %x2 = extractelement <16 x i8> %0, i32 2 @@ -380,10 +407,15 @@ ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper32xi8a: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper32xi8a: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper32xi8a: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq %x0 = extractelement <32 x i8> %0, i32 0 %x1 = extractelement <32 x i8> %0, i32 1 %x2 = extractelement <32 x i8> %0, i32 2 @@ -527,11 +559,17 @@ ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE42-NEXT: retq ; -; AVX-LABEL: _clearupper2xi64b: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper2xi64b: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper2xi64b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %x32 = bitcast <2 x i64> %0 to <4 x i32> %r0 = insertelement <4 x i32> %x32, i32 zeroinitializer, i32 1 %r1 = insertelement <4 x i32> %r0, i32 zeroinitializer, i32 3 @@ -554,11 +592,17 @@ ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; SSE42-NEXT: retq ; -; AVX-LABEL: _clearupper4xi64b: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper4xi64b: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper4xi64b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq %x32 = bitcast <4 x i64> %0 to <8 x i32> %r0 = insertelement <8 x i32> %x32, i32 zeroinitializer, i32 1 %r1 = insertelement <8 x i32> %r0, i32 zeroinitializer, i32 3 @@ -638,10 +682,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper8xi16b: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper8xi16b: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper8xi16b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq %x8 = bitcast <8 x i16> %0 to <16 x i8> %r0 = insertelement <16 x i8> %x8, i8 zeroinitializer, i32 1 %r1 = insertelement <16 x i8> %r0, i8 zeroinitializer, i32 3 @@ -663,14 +712,23 @@ ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper16xi16b: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper16xi16b: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper16xi16b: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-NEXT: retq %x8 = bitcast <16 x i16> %0 to <32 x i8> %r0 = insertelement <32 x i8> %x8, i8 zeroinitializer, i32 1 %r1 = insertelement <32 x i8> %r0, i8 zeroinitializer, i32 3 @@ -800,59 +858,113 @@ ; SSE42-NEXT: popq %rbx ; SSE42-NEXT: retq ; -; AVX-LABEL: _clearupper16xi8b: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %r9 -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movq %r9, %r8 -; AVX-NEXT: shrq $56, %r8 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: movq %r9, %r10 -; AVX-NEXT: shrq $48, %r10 -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: movq %rcx, %rdx -; AVX-NEXT: shldq $24, %r9, %rdx -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: movq %r9, %r11 -; AVX-NEXT: shrq $32, %r11 -; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: movq %rcx, %rdi -; AVX-NEXT: shrq $56, %rdi -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: shrq $48, %rsi -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $40, %rax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: movq %rcx, %rbx -; AVX-NEXT: shrq $32, %rbx -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: shlq $32, %rbx -; AVX-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; 
AVX-NEXT: orq %rbx, %rcx -; AVX-NEXT: shlq $40, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: shlq $48, %rsi -; AVX-NEXT: orq %rax, %rsi -; AVX-NEXT: shlq $56, %rdi -; AVX-NEXT: orq %rsi, %rdi -; AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; AVX-NEXT: shlq $32, %r11 -; AVX-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F -; AVX-NEXT: orq %r11, %r9 -; AVX-NEXT: shlq $40, %rdx -; AVX-NEXT: orq %r9, %rdx -; AVX-NEXT: shlq $48, %r10 -; AVX-NEXT: orq %rdx, %r10 -; AVX-NEXT: shlq $56, %r8 -; AVX-NEXT: orq %r10, %r8 -; AVX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper16xi8b: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r9 +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT: movq %r9, %r8 +; AVX1-NEXT: shrq $56, %r8 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: movq %r9, %r10 +; AVX1-NEXT: shrq $48, %r10 +; AVX1-NEXT: andl $15, %r10d +; AVX1-NEXT: movq %rcx, %rdx +; AVX1-NEXT: shldq $24, %r9, %rdx +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: movq %r9, %r11 +; AVX1-NEXT: shrq $32, %r11 +; AVX1-NEXT: andl $15, %r11d +; AVX1-NEXT: movq %rcx, %rdi +; AVX1-NEXT: shrq $56, %rdi +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: movq %rcx, %rsi +; AVX1-NEXT: shrq $48, %rsi +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: shrq $40, %rax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: movq %rcx, %rbx +; AVX1-NEXT: shrq $32, %rbx +; AVX1-NEXT: andl $15, %ebx +; AVX1-NEXT: shlq $32, %rbx +; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; AVX1-NEXT: orq %rbx, %rcx +; AVX1-NEXT: shlq $40, %rax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: shlq $48, %rsi +; AVX1-NEXT: orq %rax, %rsi +; AVX1-NEXT: shlq $56, %rdi +; AVX1-NEXT: orq %rsi, %rdi +; AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shlq $32, %r11 +; AVX1-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F +; AVX1-NEXT: orq %r11, %r9 +; AVX1-NEXT: shlq $40, %rdx +; AVX1-NEXT: orq %r9, %rdx +; AVX1-NEXT: shlq $48, %r10 +; AVX1-NEXT: orq %rdx, %r10 +; AVX1-NEXT: shlq $56, %r8 +; AVX1-NEXT: orq %r10, %r8 +; AVX1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper16xi8b: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r9 +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq %r9, %r8 +; AVX2-NEXT: shrq $56, %r8 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: movq %r9, %r10 +; AVX2-NEXT: shrq $48, %r10 +; AVX2-NEXT: andl $15, %r10d +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: shldq $24, %r9, %rdx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: movq %r9, %r11 +; AVX2-NEXT: shrq $32, %r11 +; AVX2-NEXT: andl $15, %r11d +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: shrq $56, %rdi +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: movq %rcx, %rsi +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq $40, %rax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: movq %rcx, %rbx +; AVX2-NEXT: shrq $32, %rbx +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: shlq $32, %rbx +; AVX2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; AVX2-NEXT: orq %rbx, %rcx +; AVX2-NEXT: shlq $40, %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: shlq $48, %rsi +; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: shlq $56, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
AVX2-NEXT: shlq $32, %r11 +; AVX2-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: shlq $40, %rdx +; AVX2-NEXT: orq %r9, %rdx +; AVX2-NEXT: shlq $48, %r10 +; AVX2-NEXT: orq %rdx, %r10 +; AVX2-NEXT: shlq $56, %r8 +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq %x4 = bitcast <16 x i8> %0 to <32 x i4> %r0 = insertelement <32 x i4> %x4, i4 zeroinitializer, i32 1 %r1 = insertelement <32 x i4> %r0, i4 zeroinitializer, i32 3 @@ -1226,11 +1338,17 @@ ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE42-NEXT: retq ; -; AVX-LABEL: _clearupper2xi64c: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper2xi64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper2xi64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %r = and <2 x i64> , %0 ret <2 x i64> %r } @@ -1250,11 +1368,17 @@ ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; SSE42-NEXT: retq ; -; AVX-LABEL: _clearupper4xi64c: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper4xi64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper4xi64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq %r = and <4 x i64> , %0 ret <4 x i64> %r } @@ -1315,10 +1439,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper8xi16c: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper8xi16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper8xi16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq %r = and <8 x i16> , %0 ret <8 x i16> %r } @@ -1331,10 +1460,15 @@ ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper16xi16c: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper16xi16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper16xi16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq %r = and <16 x i16> , %0 ret <16 x i16> %r } @@ -1345,10 +1479,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper16xi8c: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper16xi8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper16xi8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq %r = and <16 x i8> , %0 ret <16 x i8> %r } @@ -1361,10 +1500,15 @@ ; SSE-NEXT: andps 
%xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper32xi8c: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper32xi8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper32xi8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq %r = and <32 x i8> , %0 ret <32 x i8> %r } diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll --- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll +++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll @@ -24,7 +24,7 @@ ; ; AVX512-LABEL: insert_f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: retq %1 = insertelement <2 x double> %a1, double %a0, i32 0 ret <2 x double> %1 @@ -49,7 +49,7 @@ ; ; AVX512-LABEL: insert_f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: retq %1 = insertelement <4 x float> %a1, float %a0, i32 0 ret <4 x float> %1 diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll --- a/llvm/test/CodeGen/X86/combine-abs.ll +++ b/llvm/test/CodeGen/X86/combine-abs.ll @@ -14,7 +14,7 @@ ; ; AVX-LABEL: combine_v4i32_abs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,3,2147483648] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,3,2147483648] ; AVX-NEXT: retq %1 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> , i1 false) ret <4 x i32> %1 @@ -29,7 +29,7 @@ ; ; AVX-LABEL: combine_v16i16_abs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,1,3,3,7,7,255,255,4096,4096,32767,32767,32768,32768,0] +; AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,1,3,3,7,7,255,255,4096,4096,32767,32767,32768,32768,0] ; AVX-NEXT: retq %1 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> , i1 false) ret <16 x i16> %1 @@ -168,7 +168,7 @@ ; ; AVX-LABEL: combine_v16i8_abs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = insertelement <16 x i8> undef, i8 15, i32 0 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/combine-add-ssat.ll b/llvm/test/CodeGen/X86/combine-add-ssat.ll --- a/llvm/test/CodeGen/X86/combine-add-ssat.ll +++ b/llvm/test/CodeGen/X86/combine-add-ssat.ll @@ -2,10 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 declare i32 @llvm.sadd.sat.i32 (i32, i32) declare i64 @llvm.sadd.sat.i64 (i64, i64) @@ -51,10 +51,15 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,0,256,65534,0,65280,32768,0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_constfold_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,256,65534,0,65280,32768,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constfold_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,256,65534,0,65280,32768,0] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [1,0,256,65534,0,65280,32768,0] +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -65,10 +70,15 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_constfold_undef_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constfold_undef_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_undef_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65535,65535,65534,0,65280,32768,0] +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } diff --git a/llvm/test/CodeGen/X86/combine-add-usat.ll b/llvm/test/CodeGen/X86/combine-add-usat.ll --- a/llvm/test/CodeGen/X86/combine-add-usat.ll +++ b/llvm/test/CodeGen/X86/combine-add-usat.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX2OR512 declare i32 @llvm.uadd.sat.i32 (i32, i32) declare i64 @llvm.uadd.sat.i64 (i64, i64) @@ -31,6 +31,11 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq +; +; AVX2OR512-LABEL: combine_undef_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> %a0) ret <8 x i16> %res } @@ -55,6 +60,11 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,65535,256,65535,65535,65535,2,65535] ; AVX-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [1,65535,256,65535,65535,65535,2,65535] +; 
AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -69,6 +79,11 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,2,65535] ; AVX-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_undef_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,2,65535] +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -95,6 +110,11 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq +; +; AVX2OR512-LABEL: combine_constant_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %1 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> , <8 x i16> %a0) ret <8 x i16> %1 } @@ -147,6 +167,13 @@ ; AVX-NEXT: vpsrlw $10, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq +; +; AVX2OR512-LABEL: combine_no_overflow_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpsrlw $10, %xmm0, %xmm0 +; AVX2OR512-NEXT: vpsrlw $10, %xmm1, %xmm1 +; AVX2OR512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %1 = lshr <8 x i16> %a0, %2 = lshr <8 x i16> %a1, %3 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %1, <8 x i16> %2) diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -76,7 +76,7 @@ ; ; AVX-LABEL: combine_vec_add_sub0: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: retq %1 = sub <4 x i32> %b, %a %2 = add <4 x i32> %a, %1 @@ -92,7 +92,7 @@ ; ; AVX-LABEL: combine_vec_add_sub1: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: retq %1 = sub <4 x i32> %b, %a %2 = add <4 x i32> %1, %a @@ -238,11 +238,11 @@ ; ; AVX-LABEL: combine_vec_add_uniquebits: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [61680,61680,61680,61680] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [61680,61680,61680,61680] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3855,3855,3855,3855] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %a, %2 = and <4 x i32> %b, diff --git a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll --- a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll @@ -55,7 +55,7 @@ define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: test3_x86_avx2_pblendw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 -1) ret <16 x i16> %res @@ -64,7 +64,7 @@ define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test3_x86_avx2_pblendd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 -1) ret <4 x i32> %res @@ -73,7 +73,7 @@ define <8 x i32> @test3_x86_avx2_pblendd_256(<8 x i32> 
%a0, <8 x i32> %a1) { ; CHECK-LABEL: test3_x86_avx2_pblendd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 -1) ret <8 x i32> %res diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -23,18 +23,25 @@ ; XOP-NEXT: vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v2i64_rr: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v2i64_rr: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v2i64_rr: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v2i64_rr: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v2i64_rr: @@ -62,20 +69,28 @@ ; XOP-NEXT: vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v2i64_rm: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm1 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v2i64_rm: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v2i64_rm: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v2i64_rm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v2i64_rm: @@ -105,20 +120,28 @@ ; XOP-NEXT: vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v2i64_mr: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm1 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v2i64_mr: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v2i64_mr: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 
(%rdi), %xmm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v2i64_mr: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v2i64_mr: @@ -150,22 +173,31 @@ ; XOP-NEXT: vpcmov %xmm1, (%rdi), %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v2i64_mm: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps (%rsi), %xmm1 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v2i64_mm: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps (%rsi), %xmm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v2i64_mm: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v2i64_mm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: vmovaps (%rsi), %xmm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v2i64_mm: @@ -271,18 +303,18 @@ ; ; AVX2-LABEL: bitselect_v2i64_broadcast_rrm: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vandnps %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v2i64_broadcast_rrm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX512F-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vandnps %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq (%rdi), %xmm2 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm: @@ -320,18 +352,25 @@ ; XOP-NEXT: vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_rr: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_rr: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 
+; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_rr: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_rr: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_rr: @@ -367,20 +406,28 @@ ; XOP-NEXT: vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_rm: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm1 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_rm: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_rm: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_rm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_rm: @@ -418,20 +465,28 @@ ; XOP-NEXT: vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_mr: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm1 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_mr: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_mr: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_mr: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_mr: @@ -468,22 +523,31 @@ ; XOP-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_mm: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps (%rsi), %ymm1 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; 
AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_mm: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_mm: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_mm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vmovaps (%rsi), %ymm1 -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_mm: @@ -593,20 +657,28 @@ ; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX512F-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm: @@ -660,16 +732,27 @@ ; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_rr: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] -; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_rr: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v8i64_rr: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = 
[18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpandn %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: bitselect_v8i64_rr: ; AVX512: # %bb.0: @@ -720,17 +803,29 @@ ; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_rm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vandps 32(%rdi), %ymm2, %ymm3 -; AVX-NEXT: vandps (%rdi), %ymm2, %ymm4 -; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_rm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] +; AVX1-NEXT: # ymm2 = mem[0,1,0,1] +; AVX1-NEXT: vandps 32(%rdi), %ymm2, %ymm3 +; AVX1-NEXT: vandps (%rdi), %ymm2, %ymm4 +; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v8i64_rm: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpand 32(%rdi), %ymm2, %ymm3 +; AVX2-NEXT: vpand (%rdi), %ymm2, %ymm4 +; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: bitselect_v8i64_rm: ; AVX512: # %bb.0: @@ -783,17 +878,29 @@ ; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_mr: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vandps 32(%rdi), %ymm2, %ymm3 -; AVX-NEXT: vandps (%rdi), %ymm2, %ymm4 -; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_mr: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] +; AVX1-NEXT: # ymm2 = mem[0,1,0,1] +; AVX1-NEXT: vandps 32(%rdi), %ymm2, %ymm3 +; AVX1-NEXT: vandps (%rdi), %ymm2, %ymm4 +; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v8i64_mr: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpand 32(%rdi), %ymm2, %ymm3 +; AVX2-NEXT: vpand (%rdi), %ymm2, %ymm4 +; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: bitselect_v8i64_mr: ; AVX512: # %bb.0: @@ -842,17 +949,29 @@ ; XOP-NEXT: vpcmov %ymm2, 32(%rdi), %ymm1, %ymm1 ; 
XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_mm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] -; AVX-NEXT: vandps 32(%rsi), %ymm1, %ymm2 -; AVX-NEXT: vandps (%rsi), %ymm1, %ymm0 -; AVX-NEXT: vandnps (%rdi), %ymm1, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vandnps 32(%rdi), %ymm1, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_mm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX1-NEXT: # ymm1 = mem[0,1,0,1] +; AVX1-NEXT: vandps 32(%rsi), %ymm1, %ymm2 +; AVX1-NEXT: vandps (%rsi), %ymm1, %ymm0 +; AVX1-NEXT: vandnps (%rdi), %ymm1, %ymm3 +; AVX1-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vandnps 32(%rdi), %ymm1, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v8i64_mm: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpand 32(%rsi), %ymm1, %ymm2 +; AVX2-NEXT: vpand (%rsi), %ymm1, %ymm0 +; AVX2-NEXT: vpandn (%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpandn 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: bitselect_v8i64_mm: ; AVX512: # %bb.0: @@ -978,16 +1097,27 @@ ; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm4 -; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: bitselect_v8i64_broadcast_rrm: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll --- a/llvm/test/CodeGen/X86/combine-fabs.ll +++ b/llvm/test/CodeGen/X86/combine-fabs.ll @@ -16,7 +16,7 @@ ; ; AVX-LABEL: combine_fabs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: retq %1 = call float @llvm.fabs.f32(float -2.0) ret float %1 @@ -30,7 +30,7 @@ ; ; AVX-LABEL: combine_vec_fabs_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0] ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> ) ret <4 x float> %1 @@ -45,8 +45,8 @@ ; ; 
AVX-LABEL: combine_fabs_fabs: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call float @llvm.fabs.f32(float %a) %2 = call float @llvm.fabs.f32(float %1) @@ -61,8 +61,8 @@ ; ; AVX-LABEL: combine_vec_fabs_fabs: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %1) @@ -78,8 +78,8 @@ ; ; AVX-LABEL: combine_fabs_fneg: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = fsub float -0.0, %a %2 = call float @llvm.fabs.f32(float %1) @@ -94,8 +94,8 @@ ; ; AVX-LABEL: combine_vec_fabs_fneg: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = fsub <4 x float> , %a %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %1) @@ -111,8 +111,8 @@ ; ; AVX-LABEL: combine_fabs_fcopysign: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call float @llvm.copysign.f32(float %a, float %b) %2 = call float @llvm.fabs.f32(float %1) @@ -127,8 +127,8 @@ ; ; AVX-LABEL: combine_vec_fabs_fcopysign: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %1) diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll --- a/llvm/test/CodeGen/X86/combine-fcopysign.ll +++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll @@ -16,8 +16,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_pos_constant0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -31,8 +31,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_pos_constant1: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -46,8 +46,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fabs_sgn: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: 
retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y) %2 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %1) @@ -63,8 +63,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_neg_constant0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -78,8 +78,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_neg_constant1: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -93,8 +93,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fneg_fabs_sgn: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y) %2 = fsub <4 x float> , %1 @@ -113,11 +113,11 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fabs_mag: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) %2 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %1, <4 x float> %y) @@ -135,11 +135,11 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fneg_mag: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = fsub <4 x float> , %x %2 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %1, <4 x float> %y) @@ -157,11 +157,11 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fcopysign_mag: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %z) %2 = call <4 x 
float> @llvm.copysign.v4f32(<4 x float> %1, <4 x float> %y) @@ -179,11 +179,11 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fcopysign_sgn: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %y, <4 x float> %z) %2 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %1) @@ -228,12 +228,12 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fpext_sgn: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq %1 = fpext <4 x float> %y to <4 x double> %2 = call <4 x double> @llvm.copysign.v4f64(<4 x double> %x, <4 x double> %1) diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -11,7 +11,7 @@ ; ; AVX-LABEL: combine_vec_mul_zero: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = mul <4 x i32> %x, zeroinitializer ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -123,10 +123,20 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_sdiv_zero: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_sdiv_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2ORLATER-LABEL: combine_vec_sdiv_zero: +; AVX2ORLATER: # %bb.0: +; AVX2ORLATER-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2ORLATER-NEXT: retq +; +; XOP-LABEL: combine_vec_sdiv_zero: +; XOP: # %bb.0: +; XOP-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; XOP-NEXT: retq %1 = sdiv <4 x i32> zeroinitializer, %x ret <4 x i32> %1 } @@ -154,7 +164,7 @@ ; ; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe: ; AVX2ORLATER: # %bb.0: -; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX2ORLATER-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; AVX2ORLATER-NEXT: retq ; ; XOP-LABEL: combine_vec_sdiv_dupe: diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -13,7 +13,7 @@ ; ; AVX-LABEL: combine_vec_shl_zero: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> zeroinitializer, %x ret <4 x i32> %1 @@ -71,7 +71,7 @@ ; ; AVX-LABEL: combine_vec_shl_known_zero0: ; AVX: # %bb.0: -; AVX-NEXT: vxorps 
%xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -211,7 +211,7 @@ ; ; AVX-LABEL: combine_vec_shl_shlr_zero0: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -226,7 +226,7 @@ ; ; AVX-LABEL: combine_vec_shl_shl_zero1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -273,7 +273,7 @@ ; ; AVX-LABEL: combine_vec_shl_ext_shl1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <8 x i16> %x, %2 = sext <8 x i16> %1 to <8 x i32> @@ -657,8 +657,8 @@ ; ; AVX-LABEL: combine_vec_shl_ashr0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -673,7 +673,7 @@ ; ; AVX-LABEL: combine_vec_shl_ashr1: ; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> %x, %2 = shl <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -12,7 +12,7 @@ ; ; AVX-LABEL: combine_vec_ashr_zero: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> zeroinitializer, %x ret <4 x i32> %1 @@ -304,7 +304,7 @@ ; ; AVX-LABEL: combine_vec_ashr_positive_splat: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, %2 = ashr <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -19,10 +19,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_srem_by_one: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_srem_by_one: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_srem_by_one: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i32> %x, ret <4 x i32> %1 } @@ -43,10 +48,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_srem_by_negone: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_srem_by_negone: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_srem_by_negone: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i32> %x, ret <4 x i32> %1 } @@ -115,10 +125,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_srem_zero: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_srem_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_srem_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i32> zeroinitializer, %x ret <4 x i32> %1 } @@ -139,10 +154,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_srem_dupe: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_srem_dupe: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_srem_dupe: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i32> %x, %x ret <4 x i32> %1 } @@ -161,8 +181,8 @@ ; ; AVX2-LABEL: combine_vec_srem_by_pos0: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = and <4 x i32> %x, %2 = srem <4 x i32> %1, @@ -175,10 +195,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_srem_by_pos1: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_srem_by_pos1: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_srem_by_pos1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = and <4 x i32> %x, %2 = srem <4 x i32> %1, ret <4 x i32> %2 @@ -424,10 +449,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: boolvec_srem: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: boolvec_srem: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: boolvec_srem: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %r = srem <4 x i1> %x, %y ret <4 x i1> %r } diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -12,7 +12,7 @@ ; ; AVX-LABEL: combine_vec_lshr_zero: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> zeroinitializer, %x ret <4 x i32> %1 @@ -61,7 +61,7 @@ ; ; AVX-LABEL: combine_vec_lshr_known_zero0: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, %2 = lshr <4 x i32> %1, @@ -134,7 +134,7 @@ ; ; AVX-LABEL: combine_vec_lshr_lshr_zero0: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> %x, %2 = lshr <4 x i32> %1, @@ -149,7 +149,7 @@ ; ; AVX-LABEL: combine_vec_lshr_lshr_zero1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> %x, %2 = lshr <4 x i32> %1, @@ -233,7 +233,7 @@ ; ; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero0: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i64> %x, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -270,8 +270,8 @@ ; ; AVX-LABEL: combine_vec_lshr_shl_mask0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, %2 = lshr <4 x i32> %1, @@ 
-286,7 +286,7 @@ ; ; AVX-LABEL: combine_vec_lshr_shl_mask1: ; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, %2 = lshr <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/combine-sub-ssat.ll b/llvm/test/CodeGen/X86/combine-sub-ssat.ll --- a/llvm/test/CodeGen/X86/combine-sub-ssat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-ssat.ll @@ -2,10 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 declare i32 @llvm.ssub.sat.i32 (i32, i32) declare i64 @llvm.ssub.sat.i64 (i64, i64) @@ -27,10 +27,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_undef_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_undef_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> %a0) ret <8 x i16> %res } @@ -51,10 +56,15 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,2,254,0,65534,65282,32786,2] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_constfold_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,2,254,0,65534,65282,32786,2] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constfold_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65535,2,254,0,65534,65282,32786,2] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,2,254,0,65534,65282,32786,2] +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -65,10 +75,15 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_constfold_undef_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constfold_undef_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_undef_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,65534,65282,32786,2] +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> , <8 x 
i16> ) ret <8 x i16> %res } @@ -107,10 +122,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_self_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_self_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_self_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %1 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a0) ret <8 x i16> %1 } diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -2,10 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX2OR512 declare i32 @llvm.usub.sat.i32 (i32, i32) declare i64 @llvm.usub.sat.i64 (i64, i64) @@ -27,10 +27,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_undef_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_undef_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> %a0) ret <8 x i16> %res } @@ -51,10 +56,15 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_constfold_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constfold_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,254,0,65534,0,0,0] +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -65,10 +75,15 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_constfold_undef_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constfold_undef_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] +; 
AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_constfold_undef_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,65534,0,0,0] +; AVX2OR512-NEXT: retq %res = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> ) ret <8 x i16> %res } @@ -107,10 +122,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_self_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_self_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: combine_self_v8i16: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %1 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a0) ret <8 x i16> %1 } diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -24,7 +24,7 @@ ; ; AVX-LABEL: combine_vec_sub_self: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = sub <4 x i32> %a, %a ret <4 x i32> %1 @@ -71,7 +71,7 @@ ; ; AVX-LABEL: combine_vec_sub_sub: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: retq %1 = sub <4 x i32> %a, %b %2 = sub <4 x i32> %a, %1 @@ -87,7 +87,7 @@ ; ; AVX-LABEL: combine_vec_sub_add0: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %a, %b %2 = sub <4 x i32> %1, %a diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll --- a/llvm/test/CodeGen/X86/combine-subo.ll +++ b/llvm/test/CodeGen/X86/combine-subo.ll @@ -100,7 +100,7 @@ ; ; AVX-LABEL: combine_vec_ssub_self: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call {<4 x i32>, <4 x i1>} @llvm.ssub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a0) %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0 @@ -135,7 +135,7 @@ ; ; AVX-LABEL: combine_vec_usub_self: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call {<4 x i32>, <4 x i1>} @llvm.usub.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a0) %2 = extractvalue {<4 x i32>, <4 x i1>} %1, 0 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -106,10 +106,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_udiv_zero: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_udiv_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_udiv_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: combine_vec_udiv_zero: ; XOP: # %bb.0: @@ -142,7 +147,7 @@ ; ; AVX2-LABEL: combine_vec_udiv_dupe: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; AVX2-NEXT: retq ; ; XOP-LABEL: combine_vec_udiv_dupe: diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll --- a/llvm/test/CodeGen/X86/combine-urem.ll +++ b/llvm/test/CodeGen/X86/combine-urem.ll @@ -19,10 +19,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: 
combine_vec_urem_by_one: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_urem_by_one: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_by_one: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i32> %x, ret <4 x i32> %1 } @@ -82,8 +87,8 @@ ; ; AVX2-LABEL: combine_vec_urem_by_minsigned: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = urem <4 x i32> %x, ret <4 x i32> %1 @@ -105,10 +110,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_urem_zero: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_urem_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i32> zeroinitializer, %x ret <4 x i32> %1 } @@ -129,10 +139,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_urem_dupe: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_urem_dupe: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_dupe: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i32> %x, %x ret <4 x i32> %1 } @@ -151,8 +166,8 @@ ; ; AVX2-LABEL: combine_vec_urem_by_pow2a: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = urem <4 x i32> %x, ret <4 x i32> %1 @@ -164,10 +179,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_urem_by_pow2b: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_urem_by_pow2b: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_by_pow2b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = urem <4 x i32> %x, ret <4 x i32> %1 } @@ -351,10 +371,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: boolvec_urem: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: boolvec_urem: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: boolvec_urem: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %r = urem <4 x i1> %x, %y ret <4 x i1> %r } diff --git a/llvm/test/CodeGen/X86/commute-blend-avx2.ll b/llvm/test/CodeGen/X86/commute-blend-avx2.ll --- a/llvm/test/CodeGen/X86/commute-blend-avx2.ll +++ b/llvm/test/CodeGen/X86/commute-blend-avx2.ll @@ -26,7 +26,7 @@ define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 { ; CHECK-LABEL: commute_fold_vpblendd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] ; CHECK-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %b %2 = call <4 x i32> 
@llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1) @@ -37,7 +37,7 @@ define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 { ; CHECK-LABEL: commute_fold_vpblendd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7] ; CHECK-NEXT: retq %1 = load <8 x i32>, <8 x i32>* %b %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129) @@ -48,7 +48,7 @@ define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 { ; CHECK-LABEL: commute_fold_vblendps_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3] ; CHECK-NEXT: retq %1 = load <4 x float>, <4 x float>* %b %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 5) @@ -59,7 +59,7 @@ define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 { ; CHECK-LABEL: commute_fold_vblendps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7] ; CHECK-NEXT: retq %1 = load <8 x float>, <8 x float>* %b %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7) @@ -70,7 +70,7 @@ define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 { ; CHECK-LABEL: commute_fold_vblendpd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; CHECK-NEXT: retq %1 = load <2 x double>, <2 x double>* %b %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) @@ -81,7 +81,7 @@ define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 { ; CHECK-LABEL: commute_fold_vblendpd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; CHECK-NEXT: retq %1 = load <4 x double>, <4 x double>* %b %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7) diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -13,11 +13,29 @@ ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: sitofp_v4i32_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: sitofp_v4i32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_v4i32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sitofp_v4i32_v4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_v4i32_v4f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq %s0 = sitofp <2 x i32> %x to <2 x float> %s1 = sitofp <2 x i32> %y to <2 x float> %r = shufflevector <2 x float> %s0, <2 x float> %s1, <4 x i32> @@ -73,7 +91,7 @@ ; ; 
AVX512F-LABEL: uitofp_v4i32_v4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -81,7 +99,7 @@ ; ; AVX512VL-LABEL: uitofp_v4i32_v4f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VL-NEXT: retq %s0 = uitofp <2 x i32> %x to <2 x float> @@ -97,11 +115,29 @@ ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_v4f32_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_v4f32_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_v4f32_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_v4f32_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_v4f32_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: retq %s0 = fptosi <2 x float> %x to <2 x i32> %s1 = fptosi <2 x float> %y to <2 x i32> %r = shufflevector <2 x i32> %s0, <2 x i32> %s1, <4 x i32> @@ -198,7 +234,7 @@ ; ; AVX512F-LABEL: fptoui_v4f32_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -206,7 +242,7 @@ ; ; AVX512VL-LABEL: fptoui_v4f32_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 ; AVX512VL-NEXT: retq %s0 = fptoui <2 x float> %x to <2 x i32> @@ -222,11 +258,29 @@ ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: sitofp_v4i32_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: sitofp_v4i32_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_v4i32_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sitofp_v4i32_v4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_v4i32_v4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX512VL-NEXT: retq %s0 = sitofp <2 x i32> %x to <2 x double> %s1 = sitofp <2 x i32> %y to <2 x double> %r = shufflevector <2 x double> %s0, <2 x double> %s1, <4 x i32> @@ -278,14 +332,14 @@ ; ; AVX512F-LABEL: uitofp_v4i32_v4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_v4i32_v4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 ; AVX512VL-NEXT: retq %s0 = uitofp <2 x i32> %x to <2 x double> @@ -302,13 +356,37 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_v4f64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_v4f64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_v4f64_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: fptosi_v4f64_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_v4f64_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %s0 = fptosi <2 x double> %x to <2 x i32> %s1 = fptosi <2 x double> %y to <2 x i32> %r = shufflevector <2 x i32> %s0, <2 x i32> %s1, <4 x i32> @@ -398,7 +476,7 @@ ; AVX512F-LABEL: fptoui_v4f64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -407,7 +485,7 @@ ; AVX512VL-LABEL: fptoui_v4f64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -478,7 +556,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq %s0 = uitofp <2 x i32> %x to <2 x float> %s1 = sitofp <2 x i32> %y to <2 x float> @@ -497,13 +575,37 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: sitofp_v4i32_v4f32_extra_use: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2ps %xmm1, %xmm1 -; AVX-NEXT: vmovlps %xmm1, (%rdi) -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: sitofp_v4i32_v4f32_extra_use: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX1-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX1-NEXT: vmovlps %xmm1, (%rdi) +; AVX1-NEXT: vmovlhps 
{{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_v4i32_v4f32_extra_use: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX2-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX2-NEXT: vmovlps %xmm1, (%rdi) +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sitofp_v4i32_v4f32_extra_use: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512F-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512F-NEXT: vmovlps %xmm1, (%rdi) +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sitofp_v4i32_v4f32_extra_use: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512VL-NEXT: vmovq %xmm1, (%rdi) +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq %s0 = sitofp <2 x i32> %x to <2 x float> %s1 = sitofp <2 x i32> %y to <2 x float> store <2 x float> %s1, <2 x float>* %p diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -81,20 +81,20 @@ ; ; AVX2-LABEL: catcat: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,1,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[3,3,3,3] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,1,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[3,3,3,3] +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: catcat: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm1 -; AVX512F-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> @@ -132,10 +132,10 @@ ; ; AVX2-LABEL: load_catcat: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm2 -; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 8(%rdi), %ymm1 +; AVX2-NEXT: vpbroadcastq 16(%rdi), %ymm2 +; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm3 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_catcat: @@ -163,11 +163,23 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; SSE-NEXT: retq ; -; AVX-LABEL: cat_ext_straddle: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: retq +; AVX1-LABEL: cat_ext_straddle: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: cat_ext_straddle: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: cat_ext_straddle: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} 
xmm0 = xmm0[0],mem[0] +; AVX512F-NEXT: retq %x = load <6 x i32>, <6 x i32>* %px %y = load <6 x i32>, <6 x i32>* %py %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -5,16 +5,16 @@ define float @fneg_v4f32(<4 x float> %x) nounwind { ; X64-LABEL: fneg_v4f32: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fneg_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vpbroadcastd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax ; X86-NEXT: retl @@ -26,9 +26,8 @@ define double @fneg_v4f64(<4 x double> %x) nounwind { ; X64-LABEL: fneg_v4f64: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] -; X64-NEXT: # xmm1 = mem[0,0] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -38,10 +37,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] -; X86-NEXT: # xmm1 = mem[0,0] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: vpbroadcastq {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovq %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp @@ -236,8 +234,8 @@ ; X86-LABEL: frem_v4f32: ; X86: # %bb.0: ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovd %xmm1, {{[0-9]+}}(%esp) +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: calll fmodf ; X86-NEXT: addl $8, %esp ; X86-NEXT: retl @@ -257,8 +255,8 @@ ; X86-LABEL: frem_v4f64: ; X86: # %bb.0: ; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-NEXT: vmovups %xmm0, (%esp) +; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: vmovdqu %xmm0, (%esp) ; X86-NEXT: vzeroupper ; X86-NEXT: calll fmod ; X86-NEXT: addl $16, %esp @@ -448,7 +446,7 @@ ; X86-LABEL: fsin_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: calll sinf ; X86-NEXT: popl %eax ; X86-NEXT: retl @@ -467,7 +465,7 @@ ; X86-LABEL: fsin_v4f64: ; X86: # %bb.0: ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: vmovq %xmm0, (%esp) ; X86-NEXT: vzeroupper ; X86-NEXT: calll sin ; X86-NEXT: addl $8, %esp @@ -525,16 +523,16 @@ define float @fabs_v4f32(<4 x float> %x) nounwind { ; X64-LABEL: fabs_v4f32: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fabs_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[NaN,NaN,NaN,NaN] -; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vpbroadcastd {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax ; X86-NEXT: retl @@ -546,7 +544,7 @@ define double @fabs_v4f64(<4 x double> %x) nounwind { ; X64-LABEL: fabs_v4f64: ; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -556,8 +554,8 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-NEXT: vmovq %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp @@ -791,22 +789,22 @@ define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: copysign_v4f32: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 -; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; X64-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: copysign_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 -; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vpbroadcastd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; X86-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax ; X86-NEXT: retl @@ -818,9 +816,9 @@ define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: copysign_v4f64: ; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -830,10 +828,10 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vandps {{\.LCPI.*}}, %xmm1, %xmm1 -; X86-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vmovlps %xmm0, (%esp) +; X86-NEXT: vpand {{\.LCPI.*}}, %xmm1, %xmm1 +; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovq %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll --- a/llvm/test/CodeGen/X86/extractelement-index.ll +++ b/llvm/test/CodeGen/X86/extractelement-index.ll @@ -258,10 +258,15 @@ ; SSE41-NEXT: extractps $3, %xmm0, %eax ; SSE41-NEXT: retq ; -; AVX-LABEL: extractelement_v4i32_3: -; AVX: # %bb.0: -; 
AVX-NEXT: vextractps $3, %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v4i32_3: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractps $3, %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v4i32_3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-NEXT: retq %b = extractelement <4 x i32> %a, i256 3 ret i32 %b } @@ -324,12 +329,19 @@ ; SSE41-NEXT: extractps $3, %xmm1, %eax ; SSE41-NEXT: retq ; -; AVX-LABEL: extractelement_v8i32_7: -; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vextractps $3, %xmm0, %eax -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v8i32_7: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractps $3, %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v8i32_7: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %b = extractelement <8 x i32> %a, i64 7 ret i32 %b } @@ -430,12 +442,19 @@ ; SSE-NEXT: movb -24(%rsp,%rdi), %al ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v16i8_var: -; AVX: # %bb.0: -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movb -24(%rsp,%rdi), %al -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v16i8_var: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movb -24(%rsp,%rdi), %al +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v16i8_var: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movb -24(%rsp,%rdi), %al +; AVX2-NEXT: retq %b = extractelement <16 x i8> %a, i256 %i ret i8 %b } @@ -449,19 +468,33 @@ ; SSE-NEXT: movb -40(%rsp,%rdi), %al ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v32i8_var: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: andl $31, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: movb (%rsp,%rdi), %al -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v32i8_var: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $31, %edi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: movb (%rsp,%rdi), %al +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v32i8_var: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $31, %edi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: movb (%rsp,%rdi), %al +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %b = extractelement <32 x i8> %a, i256 %i ret i8 %b } @@ -474,12 +507,19 @@ ; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v8i16_var: -; AVX: # %bb.0: -; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v8i16_var: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v8i16_var: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: 
vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; AVX2-NEXT: retq %b = extractelement <8 x i16> %a, i256 %i ret i16 %b } @@ -493,19 +533,33 @@ ; SSE-NEXT: movzwl -40(%rsp,%rdi,2), %eax ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v16i16_var: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v16i16_var: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v16i16_var: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %b = extractelement <16 x i16> %a, i256 %i ret i16 %b } @@ -518,12 +572,19 @@ ; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v4i32_var: -; AVX: # %bb.0: -; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v4i32_var: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl -24(%rsp,%rdi,4), %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v4i32_var: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl -24(%rsp,%rdi,4), %eax +; AVX2-NEXT: retq %b = extractelement <4 x i32> %a, i256 %i ret i32 %b } @@ -537,19 +598,33 @@ ; SSE-NEXT: movl -40(%rsp,%rdi,4), %eax ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v8i32_var: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: movl (%rsp,%rdi,4), %eax -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v8i32_var: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: movl (%rsp,%rdi,4), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v8i32_var: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: movl (%rsp,%rdi,4), %eax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %b = extractelement <8 x i32> %a, i256 %i ret i32 %b } @@ -562,12 +637,19 @@ ; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v2i64_var: -; AVX: # %bb.0: -; AVX-NEXT: andl $1, %edi -; 
AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v2i64_var: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movq -24(%rsp,%rdi,8), %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v2i64_var: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq -24(%rsp,%rdi,8), %rax +; AVX2-NEXT: retq %b = extractelement <2 x i64> %a, i256 %i ret i64 %b } @@ -581,19 +663,33 @@ ; SSE-NEXT: movq -40(%rsp,%rdi,8), %rax ; SSE-NEXT: retq ; -; AVX-LABEL: extractelement_v4i64_var: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: movq (%rsp,%rdi,8), %rax -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: extractelement_v4i64_var: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: movq (%rsp,%rdi,8), %rax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: extractelement_v4i64_var: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: movq (%rsp,%rdi,8), %rax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %b = extractelement <4 x i64> %a, i256 %i ret i64 %b } diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=X64,X64-SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -64,11 +64,17 @@ ; X64-SSSE3-NEXT: movsd %xmm0, (%rax) ; X64-SSSE3-NEXT: retq ; -; X64-AVX-LABEL: t3: -; X64-AVX: # %bb.0: # %bb -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: vmovsd %xmm0, (%rax) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: t3: +; X64-AVX1: # %bb.0: # %bb +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: vmovsd %xmm0, (%rax) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: t3: +; X64-AVX2: # %bb.0: # %bb +; X64-AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-AVX2-NEXT: vmovq %xmm0, (%rax) +; X64-AVX2-NEXT: retq bb: %tmp13 = load <2 x double>, <2 x double>* %a0, align 1 %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1 diff --git a/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll 
b/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
--- a/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=AVX,AVX512
 ;
 ; Verify that fast-isel doesn't select legacy SSE instructions on targets that
 ; feature AVX.
diff --git a/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll b/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
--- a/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
@@ -96,7 +96,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: je LBB2_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512-FASTISEL-NEXT: LBB2_2:
 ; AVX512-FASTISEL-NEXT: retq
 %1 = icmp eq i64 %a, %b
@@ -137,7 +137,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: jne LBB3_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512-FASTISEL-NEXT: LBB3_2:
 ; AVX512-FASTISEL-NEXT: retq
 %1 = icmp ne i64 %a, %b
@@ -178,7 +178,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: ja LBB4_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512-FASTISEL-NEXT: LBB4_2:
 ; AVX512-FASTISEL-NEXT: retq
 %1 = icmp ugt i64 %a, %b
@@ -219,7 +219,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: jae LBB5_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512-FASTISEL-NEXT: LBB5_2:
 ; AVX512-FASTISEL-NEXT: retq
 %1 = icmp uge i64 %a, %b
@@ -260,7 +260,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: jb LBB6_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512-FASTISEL-NEXT: LBB6_2:
 ; AVX512-FASTISEL-NEXT: retq
 %1 = icmp ult i64 %a, %b
@@ -301,7 +301,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: jbe LBB7_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512-FASTISEL-NEXT: LBB7_2:
 ; AVX512-FASTISEL-NEXT: retq
 %1 = icmp ule i64 %a, %b
@@ -342,7 +342,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: jg LBB8_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512-FASTISEL-NEXT: LBB8_2:
 ; AVX512-FASTISEL-NEXT: retq
 %1 = icmp sgt i64 %a, %b
@@ -383,7 +383,7 @@
 ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi
 ; AVX512-FASTISEL-NEXT: jge LBB9_2
 ; AVX512-FASTISEL-NEXT: ## %bb.1:
-; AVX512-FASTISEL-NEXT: vmovaps %xmm1,
%xmm0 +; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512-FASTISEL-NEXT: LBB9_2: ; AVX512-FASTISEL-NEXT: retq %1 = icmp sge i64 %a, %b @@ -424,7 +424,7 @@ ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi ; AVX512-FASTISEL-NEXT: jl LBB10_2 ; AVX512-FASTISEL-NEXT: ## %bb.1: -; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512-FASTISEL-NEXT: LBB10_2: ; AVX512-FASTISEL-NEXT: retq %1 = icmp slt i64 %a, %b @@ -465,7 +465,7 @@ ; AVX512-FASTISEL-NEXT: cmpq %rsi, %rdi ; AVX512-FASTISEL-NEXT: jle LBB11_2 ; AVX512-FASTISEL-NEXT: ## %bb.1: -; AVX512-FASTISEL-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-FASTISEL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512-FASTISEL-NEXT: LBB11_2: ; AVX512-FASTISEL-NEXT: retq %1 = icmp sle i64 %a, %b diff --git a/llvm/test/CodeGen/X86/fast-isel-store.ll b/llvm/test/CodeGen/X86/fast-isel-store.ll --- a/llvm/test/CodeGen/X86/fast-isel-store.ll +++ b/llvm/test/CodeGen/X86/fast-isel-store.ll @@ -119,16 +119,27 @@ ; SSE64-NEXT: movups %xmm0, (%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_4xf32: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovups %xmm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_4xf32: +; AVXONLY32: # %bb.0: +; AVXONLY32-NEXT: vmovups %xmm0, (%rdi) +; AVXONLY32-NEXT: retq ; -; AVX64-LABEL: test_store_4xf32: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovups %xmm0, (%eax) -; AVX64-NEXT: retl +; AVXONLY64-LABEL: test_store_4xf32: +; AVXONLY64: # %bb.0: +; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVXONLY64-NEXT: vmovups %xmm0, (%eax) +; AVXONLY64-NEXT: retl +; +; AVX51232-LABEL: test_store_4xf32: +; AVX51232: # %bb.0: +; AVX51232-NEXT: vmovdqu %xmm0, (%rdi) +; AVX51232-NEXT: retq +; +; AVX51264-LABEL: test_store_4xf32: +; AVX51264: # %bb.0: +; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX51264-NEXT: vmovdqu %xmm0, (%eax) +; AVX51264-NEXT: retl store <4 x float> %value, <4 x float>* %addr, align 1 ret <4 x float> %value } @@ -145,16 +156,27 @@ ; SSE64-NEXT: movaps %xmm0, (%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_4xf32_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %xmm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_4xf32_aligned: +; AVXONLY32: # %bb.0: +; AVXONLY32-NEXT: vmovaps %xmm0, (%rdi) +; AVXONLY32-NEXT: retq ; -; AVX64-LABEL: test_store_4xf32_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %xmm0, (%eax) -; AVX64-NEXT: retl +; AVXONLY64-LABEL: test_store_4xf32_aligned: +; AVXONLY64: # %bb.0: +; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVXONLY64-NEXT: vmovaps %xmm0, (%eax) +; AVXONLY64-NEXT: retl +; +; AVX51232-LABEL: test_store_4xf32_aligned: +; AVX51232: # %bb.0: +; AVX51232-NEXT: vmovdqa %xmm0, (%rdi) +; AVX51232-NEXT: retq +; +; AVX51264-LABEL: test_store_4xf32_aligned: +; AVX51264: # %bb.0: +; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX51264-NEXT: vmovdqa %xmm0, (%eax) +; AVX51264-NEXT: retl store <4 x float> %value, <4 x float>* %addr, align 16 ret <4 x float> %value } @@ -235,16 +257,27 @@ ; SSE64-NEXT: movups %xmm1, 16(%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_8xi32: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovups %ymm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_8xi32: +; AVXONLY32: # %bb.0: +; AVXONLY32-NEXT: vmovups %ymm0, (%rdi) +; AVXONLY32-NEXT: retq ; -; AVX64-LABEL: test_store_8xi32: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovups %ymm0, (%eax) -; AVX64-NEXT: retl +; AVXONLY64-LABEL: test_store_8xi32: +; AVXONLY64: # 
%bb.0: +; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVXONLY64-NEXT: vmovups %ymm0, (%eax) +; AVXONLY64-NEXT: retl +; +; AVX51232-LABEL: test_store_8xi32: +; AVX51232: # %bb.0: +; AVX51232-NEXT: vmovdqu %ymm0, (%rdi) +; AVX51232-NEXT: retq +; +; AVX51264-LABEL: test_store_8xi32: +; AVX51264: # %bb.0: +; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX51264-NEXT: vmovdqu %ymm0, (%eax) +; AVX51264-NEXT: retl store <8 x i32> %value, <8 x i32>* %addr, align 1 ret <8 x i32> %value } @@ -263,16 +296,27 @@ ; SSE64-NEXT: movaps %xmm1, 16(%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_8xi32_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %ymm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_8xi32_aligned: +; AVXONLY32: # %bb.0: +; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi) +; AVXONLY32-NEXT: retq ; -; AVX64-LABEL: test_store_8xi32_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %ymm0, (%eax) -; AVX64-NEXT: retl +; AVXONLY64-LABEL: test_store_8xi32_aligned: +; AVXONLY64: # %bb.0: +; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVXONLY64-NEXT: vmovaps %ymm0, (%eax) +; AVXONLY64-NEXT: retl +; +; AVX51232-LABEL: test_store_8xi32_aligned: +; AVX51232: # %bb.0: +; AVX51232-NEXT: vmovdqa %ymm0, (%rdi) +; AVX51232-NEXT: retq +; +; AVX51264-LABEL: test_store_8xi32_aligned: +; AVX51264: # %bb.0: +; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX51264-NEXT: vmovdqa %ymm0, (%eax) +; AVX51264-NEXT: retl store <8 x i32> %value, <8 x i32>* %addr, align 32 ret <8 x i32> %value } @@ -291,16 +335,27 @@ ; SSE64-NEXT: movups %xmm1, 16(%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_8xf32: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovups %ymm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_8xf32: +; AVXONLY32: # %bb.0: +; AVXONLY32-NEXT: vmovups %ymm0, (%rdi) +; AVXONLY32-NEXT: retq ; -; AVX64-LABEL: test_store_8xf32: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovups %ymm0, (%eax) -; AVX64-NEXT: retl +; AVXONLY64-LABEL: test_store_8xf32: +; AVXONLY64: # %bb.0: +; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVXONLY64-NEXT: vmovups %ymm0, (%eax) +; AVXONLY64-NEXT: retl +; +; AVX51232-LABEL: test_store_8xf32: +; AVX51232: # %bb.0: +; AVX51232-NEXT: vmovdqu %ymm0, (%rdi) +; AVX51232-NEXT: retq +; +; AVX51264-LABEL: test_store_8xf32: +; AVX51264: # %bb.0: +; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX51264-NEXT: vmovdqu %ymm0, (%eax) +; AVX51264-NEXT: retl store <8 x float> %value, <8 x float>* %addr, align 1 ret <8 x float> %value } @@ -319,16 +374,27 @@ ; SSE64-NEXT: movaps %xmm1, 16(%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_8xf32_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %ymm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_8xf32_aligned: +; AVXONLY32: # %bb.0: +; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi) +; AVXONLY32-NEXT: retq ; -; AVX64-LABEL: test_store_8xf32_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %ymm0, (%eax) -; AVX64-NEXT: retl +; AVXONLY64-LABEL: test_store_8xf32_aligned: +; AVXONLY64: # %bb.0: +; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVXONLY64-NEXT: vmovaps %ymm0, (%eax) +; AVXONLY64-NEXT: retl +; +; AVX51232-LABEL: test_store_8xf32_aligned: +; AVX51232: # %bb.0: +; AVX51232-NEXT: vmovdqa %ymm0, (%rdi) +; AVX51232-NEXT: retq +; +; AVX51264-LABEL: test_store_8xf32_aligned: +; AVX51264: # %bb.0: +; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX51264-NEXT: vmovdqa %ymm0, (%eax) +; AVX51264-NEXT: 
retl
 store <8 x float> %value, <8 x float>* %addr, align 32
 ret <8 x float> %value
 }
@@ -451,13 +517,13 @@
 ;
 ; AVX51232-LABEL: test_store_16xi32:
 ; AVX51232: # %bb.0:
-; AVX51232-NEXT: vmovups %zmm0, (%rdi)
+; AVX51232-NEXT: vmovdqu64 %zmm0, (%rdi)
 ; AVX51232-NEXT: retq
 ;
 ; AVX51264-LABEL: test_store_16xi32:
 ; AVX51264: # %bb.0:
 ; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT: vmovups %zmm0, (%eax)
+; AVX51264-NEXT: vmovdqu64 %zmm0, (%eax)
 ; AVX51264-NEXT: retl
 store <16 x i32> %value, <16 x i32>* %addr, align 1
 ret <16 x i32> %value
@@ -501,13 +567,13 @@
 ;
 ; AVX51232-LABEL: test_store_16xi32_aligned:
 ; AVX51232: # %bb.0:
-; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
+; AVX51232-NEXT: vmovdqa64 %zmm0, (%rdi)
 ; AVX51232-NEXT: retq
 ;
 ; AVX51264-LABEL: test_store_16xi32_aligned:
 ; AVX51264: # %bb.0:
 ; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT: vmovaps %zmm0, (%eax)
+; AVX51264-NEXT: vmovdqa64 %zmm0, (%eax)
 ; AVX51264-NEXT: retl
 store <16 x i32> %value, <16 x i32>* %addr, align 64
 ret <16 x i32> %value
@@ -551,13 +617,13 @@
 ;
 ; AVX51232-LABEL: test_store_16xf32:
 ; AVX51232: # %bb.0:
-; AVX51232-NEXT: vmovups %zmm0, (%rdi)
+; AVX51232-NEXT: vmovdqu64 %zmm0, (%rdi)
 ; AVX51232-NEXT: retq
 ;
 ; AVX51264-LABEL: test_store_16xf32:
 ; AVX51264: # %bb.0:
 ; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT: vmovups %zmm0, (%eax)
+; AVX51264-NEXT: vmovdqu64 %zmm0, (%eax)
 ; AVX51264-NEXT: retl
 store <16 x float> %value, <16 x float>* %addr, align 1
 ret <16 x float> %value
@@ -601,13 +667,13 @@
 ;
 ; AVX51232-LABEL: test_store_16xf32_aligned:
 ; AVX51232: # %bb.0:
-; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
+; AVX51232-NEXT: vmovdqa64 %zmm0, (%rdi)
 ; AVX51232-NEXT: retq
 ;
 ; AVX51264-LABEL: test_store_16xf32_aligned:
 ; AVX51264: # %bb.0:
 ; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT: vmovaps %zmm0, (%eax)
+; AVX51264-NEXT: vmovdqa64 %zmm0, (%eax)
 ; AVX51264-NEXT: retl
 store <16 x float> %value, <16 x float>* %addr, align 64
 ret <16 x float> %value
diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll
--- a/llvm/test/CodeGen/X86/fma.ll
+++ b/llvm/test/CodeGen/X86/fma.ll
@@ -277,14 +277,14 @@
 ;
 ; AVX512-LABEL: test_f32_cst:
 ; AVX512: ## %bb.0: ## %entry
-; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
+; AVX512-NEXT: vmovd {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x05,A,A,A,A]
 ; AVX512-NEXT: ## fixup A - offset: 4, value: LCPI4_0-4, kind: reloc_riprel_4byte
 ; AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero
 ; AVX512-NEXT: retq ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_f32_cst:
 ; AVX512VL: ## %bb.0: ## %entry
-; AVX512VL-NEXT: vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
+; AVX512VL-NEXT: vmovd {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x05,A,A,A,A]
 ; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI4_0-4, kind: reloc_riprel_4byte
 ; AVX512VL-NEXT: ## xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-NEXT: retq ## encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -326,21 +326,32 @@
 ; SSE-NEXT: .cfi_def_cfa_offset 16
 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: callq pow
+; SSE-NEXT: callq pow@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
 ; SSE-NEXT: retq
 ;
-;
AVX-LABEL: f6: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f6: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f6: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.pow.f64(double 42.1, double 3.0, @@ -382,21 +393,32 @@ ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movl $3, %edi -; SSE-NEXT: callq __powidf2 +; SSE-NEXT: callq __powidf2@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f7: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f7: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f7: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.powi.f64(double 42.1, i32 3, @@ -435,20 +457,30 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq sin +; SSE-NEXT: callq sin@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.sin.f64(double 42.0, metadata !"round.dynamic", @@ -486,20 +518,30 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd 
{{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq cos +; SSE-NEXT: callq cos@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f9: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f9: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f9: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.cos.f64(double 42.0, metadata !"round.dynamic", @@ -537,20 +579,30 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq exp +; SSE-NEXT: callq exp@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f10: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f10: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f10: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.exp.f64(double 42.0, metadata !"round.dynamic", @@ -588,20 +640,30 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq exp2 +; SSE-NEXT: callq exp2@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f11: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f11: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f11: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1, metadata !"round.dynamic", @@ -639,20 +701,30 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq log +; SSE-NEXT: 
callq log@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f12: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f12: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f12: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.log.f64(double 42.0, metadata !"round.dynamic", @@ -690,20 +762,30 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq log10 +; SSE-NEXT: callq log10@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f13: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f13: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f13: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.log10.f64(double 42.0, metadata !"round.dynamic", @@ -741,20 +823,30 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq log2 +; SSE-NEXT: callq log2@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f14: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f14: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f14: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.log2.f64(double 42.0, metadata !"round.dynamic", @@ -792,7 +884,7 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq rint +; SSE-NEXT: callq rint@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: 
.cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -840,7 +932,7 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq nearbyint +; SSE-NEXT: callq nearbyint@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -892,21 +984,32 @@ ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: callq fmod +; SSE-NEXT: callq fmod@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; -; AVX-LABEL: f19: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: f19: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: f19: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %rem = call double @llvm.experimental.constrained.frem.f64( double 1.000000e+00, @@ -1172,14 +1275,14 @@ ; SSE-LABEL: f20s128: ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax -; SSE-NEXT: callq __fixdfti +; SSE-NEXT: callq __fixdfti@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: retq ; ; AVX-LABEL: f20s128: ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax -; AVX-NEXT: callq __fixdfti +; AVX-NEXT: callq __fixdfti@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: retq entry: @@ -1517,14 +1620,14 @@ ; SSE-LABEL: f20u128: ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax -; SSE-NEXT: callq __fixunsdfti +; SSE-NEXT: callq __fixunsdfti@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: retq ; ; AVX-LABEL: f20u128: ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax -; AVX-NEXT: callq __fixunsdfti +; AVX-NEXT: callq __fixunsdfti@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: retq entry: @@ -1644,7 +1747,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq lrint +; SSE-NEXT: callq lrint@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1653,7 +1756,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq lrint +; AVX-NEXT: callq lrint@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -1692,7 +1795,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq lrintf +; SSE-NEXT: callq lrintf@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1701,7 +1804,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq lrintf +; AVX-NEXT: callq lrintf@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -1740,7 +1843,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq llrint +; SSE-NEXT: callq llrint@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1749,7 
+1852,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq llrint +; AVX-NEXT: callq llrint@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -1788,7 +1891,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq llrintf +; SSE-NEXT: callq llrintf@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1797,7 +1900,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq llrintf +; AVX-NEXT: callq llrintf@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -1836,7 +1939,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq lround +; SSE-NEXT: callq lround@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1845,7 +1948,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq lround +; AVX-NEXT: callq lround@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -1883,7 +1986,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq lroundf +; SSE-NEXT: callq lroundf@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1892,7 +1995,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq lroundf +; AVX-NEXT: callq lroundf@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -1930,7 +2033,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq llround +; SSE-NEXT: callq llround@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1939,7 +2042,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq llround +; AVX-NEXT: callq llround@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq @@ -1977,7 +2080,7 @@ ; SSE: # %bb.0: # %entry ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 -; SSE-NEXT: callq llroundf +; SSE-NEXT: callq llroundf@PLT ; SSE-NEXT: popq %rcx ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -1986,7 +2089,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq llroundf +; AVX-NEXT: callq llroundf@PLT ; AVX-NEXT: popq %rcx ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fp-logic-replace.ll b/llvm/test/CodeGen/X86/fp-logic-replace.ll --- a/llvm/test/CodeGen/X86/fp-logic-replace.ll +++ b/llvm/test/CodeGen/X86/fp-logic-replace.ll @@ -22,7 +22,7 @@ ; ; AVX512DQ-LABEL: FsANDPSrr: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512DQ-NEXT: retq # encoding: [0xc3] %bc1 = bitcast double %x to i64 %bc2 = bitcast double %y to i64 @@ -45,7 +45,7 @@ ; ; AVX512DQ-LABEL: FsANDNPSrr: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vandnps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x55,0xc0] +; AVX512DQ-NEXT: vpandn %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdf,0xc0] ; AVX512DQ-NEXT: retq # encoding: [0xc3] %bc1 = bitcast double %x to i64 %bc2 = bitcast double %y to i64 @@ -68,7 
+68,7 @@ ; ; AVX512DQ-LABEL: FsORPSrr: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] +; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1] ; AVX512DQ-NEXT: retq # encoding: [0xc3] %bc1 = bitcast double %x to i64 %bc2 = bitcast double %y to i64 @@ -90,7 +90,7 @@ ; ; AVX512DQ-LABEL: FsXORPSrr: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] +; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1] ; AVX512DQ-NEXT: retq # encoding: [0xc3] %bc1 = bitcast double %x to i64 %bc2 = bitcast double %y to i64 diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll @@ -246,32 +246,53 @@ ; SSE-X64-NEXT: cvttss2si %xmm0, %rax ; SSE-X64-NEXT: retq ; -; AVX-X86-LABEL: fptosi_f32toi64: -; AVX-X86: # %bb.0: -; AVX-X86-NEXT: pushl %ebp -; AVX-X86-NEXT: .cfi_def_cfa_offset 8 -; AVX-X86-NEXT: .cfi_offset %ebp, -8 -; AVX-X86-NEXT: movl %esp, %ebp -; AVX-X86-NEXT: .cfi_def_cfa_register %ebp -; AVX-X86-NEXT: andl $-8, %esp -; AVX-X86-NEXT: subl $8, %esp -; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-X86-NEXT: vmovss %xmm0, (%esp) -; AVX-X86-NEXT: flds (%esp) -; AVX-X86-NEXT: fisttpll (%esp) -; AVX-X86-NEXT: wait -; AVX-X86-NEXT: movl (%esp), %eax -; AVX-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX-X86-NEXT: movl %ebp, %esp -; AVX-X86-NEXT: popl %ebp -; AVX-X86-NEXT: .cfi_def_cfa %esp, 4 -; AVX-X86-NEXT: retl +; AVX1-X86-LABEL: fptosi_f32toi64: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset %ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-X86-NEXT: vmovss %xmm0, (%esp) +; AVX1-X86-NEXT: flds (%esp) +; AVX1-X86-NEXT: fisttpll (%esp) +; AVX1-X86-NEXT: wait +; AVX1-X86-NEXT: movl (%esp), %eax +; AVX1-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-X86-NEXT: retl ; ; AVX-X64-LABEL: fptosi_f32toi64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: vcvttss2si %xmm0, %rax ; AVX-X64-NEXT: retq ; +; AVX512-X86-LABEL: fptosi_f32toi64: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: pushl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX512-X86-NEXT: .cfi_offset %ebp, -8 +; AVX512-X86-NEXT: movl %esp, %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: andl $-8, %esp +; AVX512-X86-NEXT: subl $8, %esp +; AVX512-X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-X86-NEXT: vmovd %xmm0, (%esp) +; AVX512-X86-NEXT: flds (%esp) +; AVX512-X86-NEXT: fisttpll (%esp) +; AVX512-X86-NEXT: wait +; AVX512-X86-NEXT: movl (%esp), %eax +; AVX512-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: popl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX512-X86-NEXT: retl +; ; X87-LABEL: fptosi_f32toi64: ; X87: # %bb.0: ; X87-NEXT: pushl %ebp @@ -886,32 +907,53 @@ ; SSE-X64-NEXT: cvttsd2si %xmm0, %rax ; SSE-X64-NEXT: retq ; -; AVX-X86-LABEL: fptosi_f64toi64: -; AVX-X86: # %bb.0: -; AVX-X86-NEXT: pushl %ebp -; 
AVX-X86-NEXT: .cfi_def_cfa_offset 8 -; AVX-X86-NEXT: .cfi_offset %ebp, -8 -; AVX-X86-NEXT: movl %esp, %ebp -; AVX-X86-NEXT: .cfi_def_cfa_register %ebp -; AVX-X86-NEXT: andl $-8, %esp -; AVX-X86-NEXT: subl $8, %esp -; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-X86-NEXT: vmovsd %xmm0, (%esp) -; AVX-X86-NEXT: fldl (%esp) -; AVX-X86-NEXT: fisttpll (%esp) -; AVX-X86-NEXT: wait -; AVX-X86-NEXT: movl (%esp), %eax -; AVX-X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX-X86-NEXT: movl %ebp, %esp -; AVX-X86-NEXT: popl %ebp -; AVX-X86-NEXT: .cfi_def_cfa %esp, 4 -; AVX-X86-NEXT: retl +; AVX1-X86-LABEL: fptosi_f64toi64: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset %ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX1-X86-NEXT: fldl (%esp) +; AVX1-X86-NEXT: fisttpll (%esp) +; AVX1-X86-NEXT: wait +; AVX1-X86-NEXT: movl (%esp), %eax +; AVX1-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-X86-NEXT: retl ; ; AVX-X64-LABEL: fptosi_f64toi64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: vcvttsd2si %xmm0, %rax ; AVX-X64-NEXT: retq ; +; AVX512-X86-LABEL: fptosi_f64toi64: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: pushl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX512-X86-NEXT: .cfi_offset %ebp, -8 +; AVX512-X86-NEXT: movl %esp, %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: andl $-8, %esp +; AVX512-X86-NEXT: subl $8, %esp +; AVX512-X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-X86-NEXT: vmovq %xmm0, (%esp) +; AVX512-X86-NEXT: fldl (%esp) +; AVX512-X86-NEXT: fisttpll (%esp) +; AVX512-X86-NEXT: wait +; AVX512-X86-NEXT: movl (%esp), %eax +; AVX512-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: popl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX512-X86-NEXT: retl +; ; X87-LABEL: fptosi_f64toi64: ; X87: # %bb.0: ; X87-NEXT: pushl %ebp diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll @@ -609,31 +609,31 @@ ; SSE-X64-NEXT: .LBB9_2: ; SSE-X64-NEXT: retq ; -; AVX-X86-LABEL: uitofp_i64tof32: -; AVX-X86: # %bb.0: -; AVX-X86-NEXT: pushl %ebp -; AVX-X86-NEXT: .cfi_def_cfa_offset 8 -; AVX-X86-NEXT: .cfi_offset %ebp, -8 -; AVX-X86-NEXT: movl %esp, %ebp -; AVX-X86-NEXT: .cfi_def_cfa_register %ebp -; AVX-X86-NEXT: andl $-8, %esp -; AVX-X86-NEXT: subl $16, %esp -; AVX-X86-NEXT: movl 12(%ebp), %eax -; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-X86-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-X86-NEXT: shrl $31, %eax -; AVX-X86-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-X86-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-X86-NEXT: wait -; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-X86-NEXT: vmovss %xmm0, (%esp) -; AVX-X86-NEXT: flds (%esp) -; AVX-X86-NEXT: wait -; AVX-X86-NEXT: movl %ebp, %esp -; AVX-X86-NEXT: popl %ebp -; AVX-X86-NEXT: .cfi_def_cfa %esp, 4 -; AVX-X86-NEXT: retl +; AVX1-X86-LABEL: uitofp_i64tof32: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset 
%ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $16, %esp +; AVX1-X86-NEXT: movl 12(%ebp), %eax +; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-X86-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-X86-NEXT: shrl $31, %eax +; AVX1-X86-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-X86-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-X86-NEXT: wait +; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-X86-NEXT: vmovss %xmm0, (%esp) +; AVX1-X86-NEXT: flds (%esp) +; AVX1-X86-NEXT: wait +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-X86-NEXT: retl ; ; AVX1-X64-LABEL: uitofp_i64tof32: ; AVX1-X64: # %bb.0: @@ -651,6 +651,32 @@ ; AVX1-X64-NEXT: .LBB9_2: ; AVX1-X64-NEXT: retq ; +; AVX512-X86-LABEL: uitofp_i64tof32: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: pushl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX512-X86-NEXT: .cfi_offset %ebp, -8 +; AVX512-X86-NEXT: movl %esp, %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: andl $-8, %esp +; AVX512-X86-NEXT: subl $16, %esp +; AVX512-X86-NEXT: movl 12(%ebp), %eax +; AVX512-X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-X86-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512-X86-NEXT: shrl $31, %eax +; AVX512-X86-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512-X86-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512-X86-NEXT: wait +; AVX512-X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-X86-NEXT: vmovd %xmm0, (%esp) +; AVX512-X86-NEXT: flds (%esp) +; AVX512-X86-NEXT: wait +; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: popl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX512-X86-NEXT: retl +; ; AVX512-X64-LABEL: uitofp_i64tof32: ; AVX512-X64: # %bb.0: ; AVX512-X64-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 @@ -1296,31 +1322,31 @@ ; SSE-X64-NEXT: .LBB18_2: ; SSE-X64-NEXT: retq ; -; AVX-X86-LABEL: uitofp_i64tof64: -; AVX-X86: # %bb.0: -; AVX-X86-NEXT: pushl %ebp -; AVX-X86-NEXT: .cfi_def_cfa_offset 8 -; AVX-X86-NEXT: .cfi_offset %ebp, -8 -; AVX-X86-NEXT: movl %esp, %ebp -; AVX-X86-NEXT: .cfi_def_cfa_register %ebp -; AVX-X86-NEXT: andl $-8, %esp -; AVX-X86-NEXT: subl $24, %esp -; AVX-X86-NEXT: movl 12(%ebp), %eax -; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-X86-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-X86-NEXT: shrl $31, %eax -; AVX-X86-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-X86-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-X86-NEXT: wait -; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-X86-NEXT: vmovsd %xmm0, (%esp) -; AVX-X86-NEXT: fldl (%esp) -; AVX-X86-NEXT: wait -; AVX-X86-NEXT: movl %ebp, %esp -; AVX-X86-NEXT: popl %ebp -; AVX-X86-NEXT: .cfi_def_cfa %esp, 4 -; AVX-X86-NEXT: retl +; AVX1-X86-LABEL: uitofp_i64tof64: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset %ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $24, %esp +; AVX1-X86-NEXT: movl 12(%ebp), %eax +; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-X86-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-X86-NEXT: shrl $31, %eax +; AVX1-X86-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-X86-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-X86-NEXT: wait +; 
AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX1-X86-NEXT: fldl (%esp) +; AVX1-X86-NEXT: wait +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-X86-NEXT: retl ; ; AVX1-X64-LABEL: uitofp_i64tof64: ; AVX1-X64: # %bb.0: @@ -1338,6 +1364,32 @@ ; AVX1-X64-NEXT: .LBB18_2: ; AVX1-X64-NEXT: retq ; +; AVX512-X86-LABEL: uitofp_i64tof64: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: pushl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX512-X86-NEXT: .cfi_offset %ebp, -8 +; AVX512-X86-NEXT: movl %esp, %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: andl $-8, %esp +; AVX512-X86-NEXT: subl $24, %esp +; AVX512-X86-NEXT: movl 12(%ebp), %eax +; AVX512-X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-X86-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512-X86-NEXT: shrl $31, %eax +; AVX512-X86-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512-X86-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512-X86-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512-X86-NEXT: wait +; AVX512-X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-X86-NEXT: vmovq %xmm0, (%esp) +; AVX512-X86-NEXT: fldl (%esp) +; AVX512-X86-NEXT: wait +; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: popl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX512-X86-NEXT: retl +; ; AVX512-X64-LABEL: uitofp_i64tof64: ; AVX512-X64: # %bb.0: ; AVX512-X64-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 -O3 | FileCheck %s --check-prefix=SSE41-X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -O3 | FileCheck %s --check-prefix=SSE41-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefix=AVX-X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefix=AVX-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefix=AVX-X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefix=AVX-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=AVX-X86,AVX1-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=AVX-X64,AVX1-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX-X86,AVX512-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX-X64,AVX512-X64 declare float @llvm.experimental.constrained.ceil.f32(float, metadata) declare double @llvm.experimental.constrained.ceil.f64(double, metadata) @@ -511,30 +511,41 @@ ; SSE41-X64: # %bb.0: ; SSE41-X64-NEXT: pushq %rax ; SSE41-X64-NEXT: .cfi_def_cfa_offset 16 -; SSE41-X64-NEXT: callq roundf +; SSE41-X64-NEXT: callq roundf@PLT ; SSE41-X64-NEXT: popq %rax ; SSE41-X64-NEXT: .cfi_def_cfa_offset 8 ; SSE41-X64-NEXT: retq ; -; AVX-X86-LABEL: fround32: -; AVX-X86: # %bb.0: -; AVX-X86-NEXT: pushl %eax -; AVX-X86-NEXT: .cfi_def_cfa_offset 8 -; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-X86-NEXT: vmovss %xmm0, (%esp) -; AVX-X86-NEXT: calll roundf -; AVX-X86-NEXT: popl 
%eax -; AVX-X86-NEXT: .cfi_def_cfa_offset 4 -; AVX-X86-NEXT: retl +; AVX1-X86-LABEL: fround32: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %eax +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-X86-NEXT: vmovss %xmm0, (%esp) +; AVX1-X86-NEXT: calll roundf +; AVX1-X86-NEXT: popl %eax +; AVX1-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX1-X86-NEXT: retl ; ; AVX-X64-LABEL: fround32: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: pushq %rax ; AVX-X64-NEXT: .cfi_def_cfa_offset 16 -; AVX-X64-NEXT: callq roundf +; AVX-X64-NEXT: callq roundf@PLT ; AVX-X64-NEXT: popq %rax ; AVX-X64-NEXT: .cfi_def_cfa_offset 8 ; AVX-X64-NEXT: retq +; +; AVX512-X86-LABEL: fround32: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: pushl %eax +; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX512-X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-X86-NEXT: vmovd %xmm0, (%esp) +; AVX512-X86-NEXT: calll roundf +; AVX512-X86-NEXT: popl %eax +; AVX512-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX512-X86-NEXT: retl %res = call float @llvm.experimental.constrained.round.f32( float %f, metadata !"fpexcept.strict") #0 ret float %res @@ -556,30 +567,41 @@ ; SSE41-X64: # %bb.0: ; SSE41-X64-NEXT: pushq %rax ; SSE41-X64-NEXT: .cfi_def_cfa_offset 16 -; SSE41-X64-NEXT: callq round +; SSE41-X64-NEXT: callq round@PLT ; SSE41-X64-NEXT: popq %rax ; SSE41-X64-NEXT: .cfi_def_cfa_offset 8 ; SSE41-X64-NEXT: retq ; -; AVX-X86-LABEL: froundf64: -; AVX-X86: # %bb.0: -; AVX-X86-NEXT: subl $8, %esp -; AVX-X86-NEXT: .cfi_def_cfa_offset 12 -; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-X86-NEXT: vmovsd %xmm0, (%esp) -; AVX-X86-NEXT: calll round -; AVX-X86-NEXT: addl $8, %esp -; AVX-X86-NEXT: .cfi_def_cfa_offset 4 -; AVX-X86-NEXT: retl +; AVX1-X86-LABEL: froundf64: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 12 +; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX1-X86-NEXT: calll round +; AVX1-X86-NEXT: addl $8, %esp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX1-X86-NEXT: retl ; ; AVX-X64-LABEL: froundf64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: pushq %rax ; AVX-X64-NEXT: .cfi_def_cfa_offset 16 -; AVX-X64-NEXT: callq round +; AVX-X64-NEXT: callq round@PLT ; AVX-X64-NEXT: popq %rax ; AVX-X64-NEXT: .cfi_def_cfa_offset 8 ; AVX-X64-NEXT: retq +; +; AVX512-X86-LABEL: froundf64: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: subl $8, %esp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 12 +; AVX512-X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-X86-NEXT: vmovq %xmm0, (%esp) +; AVX512-X86-NEXT: calll round +; AVX512-X86-NEXT: addl $8, %esp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 4 +; AVX512-X86-NEXT: retl %res = call double @llvm.experimental.constrained.round.f64( double %f, metadata !"fpexcept.strict") #0 ret double %res diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar.ll b/llvm/test/CodeGen/X86/fp-strict-scalar.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE-X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX-X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 | FileCheck %s 
--check-prefixes=AVX-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX-X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX-X86,AVX1-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -O3 | FileCheck %s --check-prefixes=AVX-X64,AVX1-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX-X86,AVX512-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX-X64,AVX512-X64 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse -O3 | FileCheck %s --check-prefixes=X87 declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) @@ -626,7 +626,7 @@ ; SSE-X64-LABEL: fma_f64: ; SSE-X64: # %bb.0: ; SSE-X64-NEXT: pushq %rax -; SSE-X64-NEXT: callq fma +; SSE-X64-NEXT: callq fma@PLT ; SSE-X64-NEXT: popq %rax ; SSE-X64-NEXT: retq ; @@ -687,7 +687,7 @@ ; SSE-X64-LABEL: fma_f32: ; SSE-X64: # %bb.0: ; SSE-X64-NEXT: pushq %rax -; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: callq fmaf@PLT ; SSE-X64-NEXT: popq %rax ; SSE-X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+sse | FileCheck %s --check-prefixes=X64,X64-SSE ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck %s --check-prefixes=X64,X64-SSE -; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 ; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=-sse | FileCheck %s --check-prefixes=X86 ; Check soft floating point conversion function calls. 
@@ -19,19 +19,28 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: callq __extendsftf2 +; X64-SSE-NEXT: callq __extendsftf2@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: TestFPExtF32_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: callq __extendsftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPExtF32_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: callq __extendsftf2@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF32_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX512-NEXT: callq __extendsftf2@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq ; ; X86-LABEL: TestFPExtF32_F128: ; X86: # %bb.0: # %entry @@ -67,19 +76,28 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-SSE-NEXT: callq __extenddftf2 +; X64-SSE-NEXT: callq __extenddftf2@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: TestFPExtF64_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: callq __extenddftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPExtF64_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: callq __extenddftf2@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF64_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: callq __extenddftf2@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq ; ; X86-LABEL: TestFPExtF64_F128: ; X86: # %bb.0: # %entry @@ -117,21 +135,32 @@ ; X64-SSE-NEXT: fldt {{.*}}(%rip) ; X64-SSE-NEXT: fstpt (%rsp) ; X64-SSE-NEXT: wait -; X64-SSE-NEXT: callq __extendxftf2 +; X64-SSE-NEXT: callq __extendxftf2@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: addq $24, %rsp ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: TestFPExtF80_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: subq $24, %rsp -; X64-AVX-NEXT: fldt {{.*}}(%rip) -; X64-AVX-NEXT: fstpt (%rsp) -; X64-AVX-NEXT: wait -; X64-AVX-NEXT: callq __extendxftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: addq $24, %rsp -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPExtF80_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: subq $24, %rsp +; X64-AVX1-NEXT: fldt {{.*}}(%rip) +; X64-AVX1-NEXT: fstpt (%rsp) +; X64-AVX1-NEXT: wait +; X64-AVX1-NEXT: callq __extendxftf2@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: addq $24, %rsp +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF80_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: subq $24, %rsp +; 
X64-AVX512-NEXT: fldt {{.*}}(%rip) +; X64-AVX512-NEXT: fstpt (%rsp) +; X64-AVX512-NEXT: wait +; X64-AVX512-NEXT: callq __extendxftf2@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: addq $24, %rsp +; X64-AVX512-NEXT: retq ; ; X86-LABEL: TestFPExtF80_F128: ; X86: # %bb.0: # %entry @@ -167,19 +196,28 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __trunctfsf2 +; X64-SSE-NEXT: callq __trunctfsf2@PLT ; X64-SSE-NEXT: movss %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: TestFPTruncF128_F32: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __trunctfsf2 -; X64-AVX-NEXT: vmovss %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPTruncF128_F32: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __trunctfsf2@PLT +; X64-AVX1-NEXT: vmovss %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F32: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfsf2@PLT +; X64-AVX512-NEXT: vmovd %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq ; ; X86-LABEL: TestFPTruncF128_F32: ; X86: # %bb.0: # %entry @@ -206,19 +244,28 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __trunctfdf2 +; X64-SSE-NEXT: callq __trunctfdf2@PLT ; X64-SSE-NEXT: movsd %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: TestFPTruncF128_F64: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __trunctfdf2 -; X64-AVX-NEXT: vmovsd %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPTruncF128_F64: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __trunctfdf2@PLT +; X64-AVX1-NEXT: vmovsd %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F64: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfdf2@PLT +; X64-AVX512-NEXT: vmovq %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq ; ; X86-LABEL: TestFPTruncF128_F64: ; X86: # %bb.0: # %entry @@ -245,21 +292,31 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __trunctfxf2 +; X64-SSE-NEXT: callq __trunctfxf2@PLT ; X64-SSE-NEXT: fstpt {{.*}}(%rip) ; X64-SSE-NEXT: wait ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: TestFPTruncF128_F80: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __trunctfxf2 -; X64-AVX-NEXT: fstpt {{.*}}(%rip) -; X64-AVX-NEXT: wait -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPTruncF128_F80: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __trunctfxf2@PLT +; X64-AVX1-NEXT: fstpt {{.*}}(%rip) +; X64-AVX1-NEXT: wait +; 
X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F80: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfxf2@PLT +; X64-AVX512-NEXT: fstpt {{.*}}(%rip) +; X64-AVX512-NEXT: wait +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq ; ; X86-LABEL: TestFPTruncF128_F80: ; X86: # %bb.0: # %entry @@ -285,7 +342,7 @@ ; X64-LABEL: fptosi_i8: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixtfsi +; X64-NEXT: callq __fixtfsi@PLT ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -311,7 +368,7 @@ ; X64-LABEL: fptosi_i16: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixtfsi +; X64-NEXT: callq __fixtfsi@PLT ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -337,7 +394,7 @@ ; X64-LABEL: fptosi_i32: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixtfsi +; X64-NEXT: callq __fixtfsi@PLT ; X64-NEXT: popq %rcx ; X64-NEXT: retq ; @@ -360,7 +417,7 @@ ; X64-LABEL: fptosi_i64: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixtfdi +; X64-NEXT: callq __fixtfdi@PLT ; X64-NEXT: popq %rcx ; X64-NEXT: retq ; @@ -383,7 +440,7 @@ ; X64-LABEL: fptosi_i128: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixtfti +; X64-NEXT: callq __fixtfti@PLT ; X64-NEXT: popq %rcx ; X64-NEXT: retq ; @@ -424,7 +481,7 @@ ; X64-LABEL: fptoui_i8: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixtfsi +; X64-NEXT: callq __fixtfsi@PLT ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -450,7 +507,7 @@ ; X64-LABEL: fptoui_i16: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixtfsi +; X64-NEXT: callq __fixtfsi@PLT ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: popq %rcx ; X64-NEXT: retq @@ -476,7 +533,7 @@ ; X64-LABEL: fptoui_i32: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixunstfsi +; X64-NEXT: callq __fixunstfsi@PLT ; X64-NEXT: popq %rcx ; X64-NEXT: retq ; @@ -499,7 +556,7 @@ ; X64-LABEL: fptoui_i64: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixunstfdi +; X64-NEXT: callq __fixunstfdi@PLT ; X64-NEXT: popq %rcx ; X64-NEXT: retq ; @@ -522,7 +579,7 @@ ; X64-LABEL: fptoui_i128: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __fixunstfti +; X64-NEXT: callq __fixunstfti@PLT ; X64-NEXT: popq %rcx ; X64-NEXT: retq ; @@ -564,7 +621,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax ; X64-NEXT: movsbl %dil, %edi -; X64-NEXT: callq __floatsitf +; X64-NEXT: callq __floatsitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -604,7 +661,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax ; X64-NEXT: movswl %di, %edi -; X64-NEXT: callq __floatsitf +; X64-NEXT: callq __floatsitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -643,7 +700,7 @@ ; X64-LABEL: sitofp_i32: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __floatsitf +; X64-NEXT: callq __floatsitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -681,7 +738,7 @@ ; X64-LABEL: sitofp_i64: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __floatditf +; X64-NEXT: callq __floatditf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -720,7 +777,7 @@ ; X64-LABEL: sitofp_i128: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __floattitf +; X64-NEXT: callq 
__floattitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -762,7 +819,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax ; X64-NEXT: movzbl %dil, %edi -; X64-NEXT: callq __floatsitf +; X64-NEXT: callq __floatsitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -802,7 +859,7 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax ; X64-NEXT: movzwl %di, %edi -; X64-NEXT: callq __floatsitf +; X64-NEXT: callq __floatsitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -841,7 +898,7 @@ ; X64-LABEL: uitofp_i32: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __floatunsitf +; X64-NEXT: callq __floatunsitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -879,7 +936,7 @@ ; X64-LABEL: uitofp_i64: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __floatunditf +; X64-NEXT: callq __floatunditf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; @@ -918,7 +975,7 @@ ; X64-LABEL: uitofp_i128: ; X64: # %bb.0: # %entry ; X64-NEXT: pushq %rax -; X64-NEXT: callq __floatuntitf +; X64-NEXT: callq __floatuntitf@PLT ; X64-NEXT: popq %rax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -2,10 +2,10 @@ ; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+sse | FileCheck %s --check-prefix=X64-SSE ; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck %s --check-prefix=X64-SSE ; RUN: llc < %s -O2 -mtriple=i686-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+avx | FileCheck %s --check-prefix=X64-AVX -; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=X64-AVX -; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+avx512f | FileCheck %s --check-prefix=X64-AVX -; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=X64-AVX +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1 +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1 +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+avx512f | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512 +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512 ; Check soft floating point conversion function calls. 
@@ -26,7 +26,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: callq __extendsftf2 +; X64-SSE-NEXT: callq __extendsftf2@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -53,14 +53,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPExtF32_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: callq __extendsftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPExtF32_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: callq __extendsftf2@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF32_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX512-NEXT: callq __extendsftf2@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load float, float* @vf32, align 4 %conv = fpext float %0 to fp128 @@ -73,7 +82,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-SSE-NEXT: callq __extenddftf2 +; X64-SSE-NEXT: callq __extenddftf2@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -100,14 +109,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPExtF64_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-AVX-NEXT: callq __extenddftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPExtF64_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: callq __extenddftf2@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF64_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: callq __extenddftf2@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load double, double* @vf64, align 8 %conv = fpext double %0 to fp128 @@ -121,7 +139,7 @@ ; X64-SSE-NEXT: subq $24, %rsp ; X64-SSE-NEXT: fldt {{.*}}(%rip) ; X64-SSE-NEXT: fstpt (%rsp) -; X64-SSE-NEXT: callq __extendxftf2 +; X64-SSE-NEXT: callq __extendxftf2@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: addq $24, %rsp ; X64-SSE-NEXT: retq @@ -148,15 +166,25 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPExtF80_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: subq $24, %rsp -; X64-AVX-NEXT: fldt {{.*}}(%rip) -; X64-AVX-NEXT: fstpt (%rsp) -; X64-AVX-NEXT: callq __extendxftf2 -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: addq $24, %rsp -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPExtF80_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: subq $24, %rsp +; X64-AVX1-NEXT: fldt {{.*}}(%rip) +; X64-AVX1-NEXT: fstpt (%rsp) +; X64-AVX1-NEXT: callq __extendxftf2@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, 
{{.*}}(%rip) +; X64-AVX1-NEXT: addq $24, %rsp +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF80_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: subq $24, %rsp +; X64-AVX512-NEXT: fldt {{.*}}(%rip) +; X64-AVX512-NEXT: fstpt (%rsp) +; X64-AVX512-NEXT: callq __extendxftf2@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: addq $24, %rsp +; X64-AVX512-NEXT: retq entry: %0 = load x86_fp80, x86_fp80* @vf80, align 8 %conv = fpext x86_fp80 %0 to fp128 @@ -169,7 +197,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixtfsi +; X64-SSE-NEXT: callq __fixtfsi@PLT ; X64-SSE-NEXT: movw %ax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -187,14 +215,23 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToSIF128_I16: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixtfsi -; X64-AVX-NEXT: movw %ax, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToSIF128_I16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixtfsi@PLT +; X64-AVX1-NEXT: movw %ax, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToSIF128_I16: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixtfsi@PLT +; X64-AVX512-NEXT: movw %ax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptosi fp128 %0 to i16 @@ -207,7 +244,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixtfsi +; X64-SSE-NEXT: callq __fixtfsi@PLT ; X64-SSE-NEXT: movw %ax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -225,14 +262,23 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToUIF128_I16: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixtfsi -; X64-AVX-NEXT: movw %ax, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToUIF128_I16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixtfsi@PLT +; X64-AVX1-NEXT: movw %ax, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToUIF128_I16: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixtfsi@PLT +; X64-AVX512-NEXT: movw %ax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptoui fp128 %0 to i16 @@ -245,7 +291,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixtfsi +; X64-SSE-NEXT: callq __fixtfsi@PLT ; X64-SSE-NEXT: movl %eax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -263,14 +309,23 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToSIF128_I32: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixtfsi -; X64-AVX-NEXT: movl %eax, 
{{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToSIF128_I32: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixtfsi@PLT +; X64-AVX1-NEXT: movl %eax, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToSIF128_I32: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixtfsi@PLT +; X64-AVX512-NEXT: movl %eax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptosi fp128 %0 to i32 @@ -283,7 +338,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixunstfsi +; X64-SSE-NEXT: callq __fixunstfsi@PLT ; X64-SSE-NEXT: movl %eax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -301,14 +356,23 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToUIF128_U32: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixunstfsi -; X64-AVX-NEXT: movl %eax, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToUIF128_U32: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixunstfsi@PLT +; X64-AVX1-NEXT: movl %eax, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToUIF128_U32: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixunstfsi@PLT +; X64-AVX512-NEXT: movl %eax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptoui fp128 %0 to i32 @@ -321,7 +385,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixtfsi +; X64-SSE-NEXT: callq __fixtfsi@PLT ; X64-SSE-NEXT: cltq ; X64-SSE-NEXT: movq %rax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax @@ -342,15 +406,25 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToSIF128_I64: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixtfsi -; X64-AVX-NEXT: cltq -; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToSIF128_I64: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixtfsi@PLT +; X64-AVX1-NEXT: cltq +; X64-AVX1-NEXT: movq %rax, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToSIF128_I64: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixtfsi@PLT +; X64-AVX512-NEXT: cltq +; X64-AVX512-NEXT: movq %rax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptosi fp128 %0 to i32 @@ -364,7 +438,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixunstfsi +; X64-SSE-NEXT: callq __fixunstfsi@PLT ; X64-SSE-NEXT: movl %eax, %eax ; 
X64-SSE-NEXT: movq %rax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax @@ -384,15 +458,25 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToUIF128_U64: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixunstfsi -; X64-AVX-NEXT: movl %eax, %eax -; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToUIF128_U64: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixunstfsi@PLT +; X64-AVX1-NEXT: movl %eax, %eax +; X64-AVX1-NEXT: movq %rax, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToUIF128_U64: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixunstfsi@PLT +; X64-AVX512-NEXT: movl %eax, %eax +; X64-AVX512-NEXT: movq %rax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptoui fp128 %0 to i32 @@ -406,7 +490,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixtfti +; X64-SSE-NEXT: callq __fixtfti@PLT ; X64-SSE-NEXT: movq %rdx, vi128+{{.*}}(%rip) ; X64-SSE-NEXT: movq %rax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax @@ -436,15 +520,25 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToSIF128_I128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixtfti -; X64-AVX-NEXT: movq %rdx, vi128+{{.*}}(%rip) -; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToSIF128_I128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixtfti@PLT +; X64-AVX1-NEXT: movq %rdx, vi128+{{.*}}(%rip) +; X64-AVX1-NEXT: movq %rax, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToSIF128_I128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixtfti@PLT +; X64-AVX512-NEXT: movq %rdx, vi128+{{.*}}(%rip) +; X64-AVX512-NEXT: movq %rax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptosi fp128 %0 to i128 @@ -457,7 +551,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __fixunstfti +; X64-SSE-NEXT: callq __fixunstfti@PLT ; X64-SSE-NEXT: movq %rdx, vu128+{{.*}}(%rip) ; X64-SSE-NEXT: movq %rax, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax @@ -487,15 +581,25 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPToUIF128_U128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __fixunstfti -; X64-AVX-NEXT: movq %rdx, vu128+{{.*}}(%rip) -; X64-AVX-NEXT: movq %rax, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPToUIF128_U128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __fixunstfti@PLT +; X64-AVX1-NEXT: movq %rdx, vu128+{{.*}}(%rip) +; X64-AVX1-NEXT: movq %rax, {{.*}}(%rip) +; 
X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPToUIF128_U128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __fixunstfti@PLT +; X64-AVX512-NEXT: movq %rdx, vu128+{{.*}}(%rip) +; X64-AVX512-NEXT: movq %rax, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptoui fp128 %0 to i128 @@ -508,7 +612,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __trunctfsf2 +; X64-SSE-NEXT: callq __trunctfsf2@PLT ; X64-SSE-NEXT: movss %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -526,14 +630,23 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPTruncF128_F32: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __trunctfsf2 -; X64-AVX-NEXT: vmovss %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPTruncF128_F32: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __trunctfsf2@PLT +; X64-AVX1-NEXT: vmovss %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F32: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfsf2@PLT +; X64-AVX512-NEXT: vmovd %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptrunc fp128 %0 to float @@ -546,7 +659,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __trunctfdf2 +; X64-SSE-NEXT: callq __trunctfdf2@PLT ; X64-SSE-NEXT: movsd %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -564,14 +677,23 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPTruncF128_F64: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: callq __trunctfdf2 -; X64-AVX-NEXT: vmovsd %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPTruncF128_F64: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __trunctfdf2@PLT +; X64-AVX1-NEXT: vmovsd %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F64: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfdf2@PLT +; X64-AVX512-NEXT: vmovq %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptrunc fp128 %0 to double @@ -584,7 +706,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __trunctfxf2 +; X64-SSE-NEXT: callq __trunctfxf2@PLT ; X64-SSE-NEXT: fstpt {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -602,14 +724,23 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestFPTruncF128_F80: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 -; 
X64-AVX-NEXT: callq __trunctfxf2 -; X64-AVX-NEXT: fstpt {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestFPTruncF128_F80: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: callq __trunctfxf2@PLT +; X64-AVX1-NEXT: fstpt {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F80: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfxf2@PLT +; X64-AVX512-NEXT: fstpt {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load fp128, fp128* @vf128, align 16 %conv = fptrunc fp128 %0 to x86_fp80 @@ -622,7 +753,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movswl {{.*}}(%rip), %edi -; X64-SSE-NEXT: callq __floatsitf +; X64-SSE-NEXT: callq __floatsitf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -650,14 +781,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestSIToFPI16_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movswl {{.*}}(%rip), %edi -; X64-AVX-NEXT: callq __floatsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestSIToFPI16_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movswl {{.*}}(%rip), %edi +; X64-AVX1-NEXT: callq __floatsitf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestSIToFPI16_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movswl {{.*}}(%rip), %edi +; X64-AVX512-NEXT: callq __floatsitf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i16, i16* @vi16, align 4 %conv = sitofp i16 %0 to fp128 @@ -670,7 +810,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movzwl {{.*}}(%rip), %edi -; X64-SSE-NEXT: callq __floatsitf +; X64-SSE-NEXT: callq __floatsitf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -698,14 +838,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestSIToFPU16_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movzwl {{.*}}(%rip), %edi -; X64-AVX-NEXT: callq __floatsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestSIToFPU16_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movzwl {{.*}}(%rip), %edi +; X64-AVX1-NEXT: callq __floatsitf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestSIToFPU16_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movzwl {{.*}}(%rip), %edi +; X64-AVX512-NEXT: callq __floatsitf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i16, i16* @vi16, align 4 %conv = uitofp i16 %0 to fp128 @@ -718,7 +867,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movl {{.*}}(%rip), %edi -; X64-SSE-NEXT: callq __floatsitf +; X64-SSE-NEXT: callq __floatsitf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: 
popq %rax ; X64-SSE-NEXT: retq @@ -744,14 +893,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestSIToFPI32_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movl {{.*}}(%rip), %edi -; X64-AVX-NEXT: callq __floatsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestSIToFPI32_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movl {{.*}}(%rip), %edi +; X64-AVX1-NEXT: callq __floatsitf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestSIToFPI32_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movl {{.*}}(%rip), %edi +; X64-AVX512-NEXT: callq __floatsitf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i32, i32* @vi32, align 4 %conv = sitofp i32 %0 to fp128 @@ -764,7 +922,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movl {{.*}}(%rip), %edi -; X64-SSE-NEXT: callq __floatunsitf +; X64-SSE-NEXT: callq __floatunsitf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -790,14 +948,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestUIToFPU32_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movl {{.*}}(%rip), %edi -; X64-AVX-NEXT: callq __floatunsitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestUIToFPU32_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movl {{.*}}(%rip), %edi +; X64-AVX1-NEXT: callq __floatunsitf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestUIToFPU32_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movl {{.*}}(%rip), %edi +; X64-AVX512-NEXT: callq __floatunsitf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i32, i32* @vu32, align 4 %conv = uitofp i32 %0 to fp128 @@ -810,7 +977,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movq {{.*}}(%rip), %rdi -; X64-SSE-NEXT: callq __floatditf +; X64-SSE-NEXT: callq __floatditf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -837,14 +1004,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestSIToFPI64_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movq {{.*}}(%rip), %rdi -; X64-AVX-NEXT: callq __floatditf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestSIToFPI64_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX1-NEXT: callq __floatditf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestSIToFPI64_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX512-NEXT: callq __floatditf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i64, i64* @vi64, align 8 %conv = sitofp i64 %0 to fp128 @@ -857,7 +1033,7 @@ 
; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movq {{.*}}(%rip), %rdi -; X64-SSE-NEXT: callq __floatunditf +; X64-SSE-NEXT: callq __floatunditf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -884,14 +1060,23 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestUIToFPU64_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movq {{.*}}(%rip), %rdi -; X64-AVX-NEXT: callq __floatunditf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestUIToFPU64_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX1-NEXT: callq __floatunditf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestUIToFPU64_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX512-NEXT: callq __floatunditf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i64, i64* @vu64, align 8 %conv = uitofp i64 %0 to fp128 @@ -905,7 +1090,7 @@ ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movq {{.*}}(%rip), %rdi ; X64-SSE-NEXT: movq vi128+{{.*}}(%rip), %rsi -; X64-SSE-NEXT: callq __floattitf +; X64-SSE-NEXT: callq __floattitf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -934,15 +1119,25 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestSIToFPI128_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movq {{.*}}(%rip), %rdi -; X64-AVX-NEXT: movq vi128+{{.*}}(%rip), %rsi -; X64-AVX-NEXT: callq __floattitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestSIToFPI128_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX1-NEXT: movq vi128+{{.*}}(%rip), %rsi +; X64-AVX1-NEXT: callq __floattitf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestSIToFPI128_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX512-NEXT: movq vi128+{{.*}}(%rip), %rsi +; X64-AVX512-NEXT: callq __floattitf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i128, i128* @vi128, align 16 %conv = sitofp i128 %0 to fp128 @@ -956,7 +1151,7 @@ ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movq {{.*}}(%rip), %rdi ; X64-SSE-NEXT: movq vu128+{{.*}}(%rip), %rsi -; X64-SSE-NEXT: callq __floatuntitf +; X64-SSE-NEXT: callq __floatuntitf@PLT ; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) ; X64-SSE-NEXT: popq %rax ; X64-SSE-NEXT: retq @@ -985,15 +1180,25 @@ ; X32-NEXT: popl %esi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestUIToFPU128_F128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: movq {{.*}}(%rip), %rdi -; X64-AVX-NEXT: movq vu128+{{.*}}(%rip), %rsi -; X64-AVX-NEXT: callq __floatuntitf -; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) -; X64-AVX-NEXT: popq %rax -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestUIToFPU128_F128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX1-NEXT: movq vu128+{{.*}}(%rip), %rsi +; 
X64-AVX1-NEXT: callq __floatuntitf@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX1-NEXT: popq %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestUIToFPU128_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: movq {{.*}}(%rip), %rdi +; X64-AVX512-NEXT: movq vu128+{{.*}}(%rip), %rsi +; X64-AVX512-NEXT: callq __floatuntitf@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq entry: %0 = load i128, i128* @vu128, align 16 %conv = uitofp i128 %0 to fp128 @@ -1006,7 +1211,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: callq __gttf2 +; X64-SSE-NEXT: callq __gttf2@PLT ; X64-SSE-NEXT: xorl %ecx, %ecx ; X64-SSE-NEXT: testl %eax, %eax ; X64-SSE-NEXT: setg %cl @@ -1034,17 +1239,29 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestConst128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 -; X64-AVX-NEXT: callq __gttf2 -; X64-AVX-NEXT: xorl %ecx, %ecx -; X64-AVX-NEXT: testl %eax, %eax -; X64-AVX-NEXT: setg %cl -; X64-AVX-NEXT: movl %ecx, %eax -; X64-AVX-NEXT: popq %rcx -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestConst128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vmovaps {{.*}}(%rip), %xmm1 +; X64-AVX1-NEXT: callq __gttf2@PLT +; X64-AVX1-NEXT: xorl %ecx, %ecx +; X64-AVX1-NEXT: testl %eax, %eax +; X64-AVX1-NEXT: setg %cl +; X64-AVX1-NEXT: movl %ecx, %eax +; X64-AVX1-NEXT: popq %rcx +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestConst128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm1 +; X64-AVX512-NEXT: callq __gttf2@PLT +; X64-AVX512-NEXT: xorl %ecx, %ecx +; X64-AVX512-NEXT: testl %eax, %eax +; X64-AVX512-NEXT: setg %cl +; X64-AVX512-NEXT: movl %ecx, %eax +; X64-AVX512-NEXT: popq %rcx +; X64-AVX512-NEXT: retq entry: %cmp = fcmp ogt fp128 %v, 0xL00000000000000003FFF000000000000 %conv = zext i1 %cmp to i32 @@ -1057,7 +1274,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: xorps %xmm1, %xmm1 -; X64-SSE-NEXT: callq __gttf2 +; X64-SSE-NEXT: callq __gttf2@PLT ; X64-SSE-NEXT: xorl %ecx, %ecx ; X64-SSE-NEXT: testl %eax, %eax ; X64-SSE-NEXT: setg %cl @@ -1085,17 +1302,29 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestConst128Zero: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: callq __gttf2 -; X64-AVX-NEXT: xorl %ecx, %ecx -; X64-AVX-NEXT: testl %eax, %eax -; X64-AVX-NEXT: setg %cl -; X64-AVX-NEXT: movl %ecx, %eax -; X64-AVX-NEXT: popq %rcx -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestConst128Zero: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: callq __gttf2@PLT +; X64-AVX1-NEXT: xorl %ecx, %ecx +; X64-AVX1-NEXT: testl %eax, %eax +; X64-AVX1-NEXT: setg %cl +; X64-AVX1-NEXT: movl %ecx, %eax +; X64-AVX1-NEXT: popq %rcx +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestConst128Zero: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: callq __gttf2@PLT +; X64-AVX512-NEXT: xorl %ecx, %ecx +; X64-AVX512-NEXT: testl %eax, %eax +; X64-AVX512-NEXT: setg %cl +; X64-AVX512-NEXT: movl %ecx, %eax +; X64-AVX512-NEXT: popq %rcx +; X64-AVX512-NEXT: retq entry: %cmp = fcmp ogt fp128 %v, 
0xL00000000000000000000000000000000 %conv = zext i1 %cmp to i32 @@ -1121,7 +1350,7 @@ ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: subq $24, %rsp ; X64-SSE-NEXT: movaps %xmm0, %xmm1 -; X64-SSE-NEXT: callq __multf3 +; X64-SSE-NEXT: callq __multf3@PLT ; X64-SSE-NEXT: movaps %xmm0, (%rsp) ; X64-SSE-NEXT: movq (%rsp), %rcx ; X64-SSE-NEXT: movq %rcx, %rdx @@ -1163,20 +1392,35 @@ ; X32-NEXT: popl %edi ; X32-NEXT: retl ; -; X64-AVX-LABEL: TestBits128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: subq $24, %rsp -; X64-AVX-NEXT: vmovaps %xmm0, %xmm1 -; X64-AVX-NEXT: callq __multf3 -; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) -; X64-AVX-NEXT: movq (%rsp), %rcx -; X64-AVX-NEXT: movq %rcx, %rdx -; X64-AVX-NEXT: shrq $32, %rdx -; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: orl %ecx, %edx -; X64-AVX-NEXT: sete %al -; X64-AVX-NEXT: addq $24, %rsp -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestBits128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: subq $24, %rsp +; X64-AVX1-NEXT: vmovaps %xmm0, %xmm1 +; X64-AVX1-NEXT: callq __multf3@PLT +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsp) +; X64-AVX1-NEXT: movq (%rsp), %rcx +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $32, %rdx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: orl %ecx, %edx +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: addq $24, %rsp +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestBits128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: subq $24, %rsp +; X64-AVX512-NEXT: vmovdqa %xmm0, %xmm1 +; X64-AVX512-NEXT: callq __multf3@PLT +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; X64-AVX512-NEXT: movq (%rsp), %rcx +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $32, %rdx +; X64-AVX512-NEXT: xorl %eax, %eax +; X64-AVX512-NEXT: orl %ecx, %edx +; X64-AVX512-NEXT: sete %al +; X64-AVX512-NEXT: addq $24, %rsp +; X64-AVX512-NEXT: retq entry: %mul = fmul fp128 %ld, %ld %0 = bitcast fp128 %mul to i128 @@ -1233,14 +1477,23 @@ ; X32-NEXT: popl %edi ; X32-NEXT: retl $4 ; -; X64-AVX-LABEL: TestPair128: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: addq $3, %rsi -; X64-AVX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: adcq $0, %rdi -; X64-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestPair128: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: addq $3, %rsi +; X64-AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: adcq $0, %rdi +; X64-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestPair128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: addq $3, %rsi +; X64-AVX512-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: adcq $0, %rdi +; X64-AVX512-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; X64-AVX512-NEXT: retq entry: %conv = zext i64 %a to i128 %shl = shl nuw i128 %conv, 64 @@ -1258,10 +1511,10 @@ ; X64-SSE-NEXT: jl .LBB26_2 ; X64-SSE-NEXT: # %bb.1: # %if.then ; X64-SSE-NEXT: pushq %rax -; X64-SSE-NEXT: callq __trunctfdf2 +; X64-SSE-NEXT: callq __trunctfdf2@PLT ; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: orps {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: callq __extenddftf2 +; X64-SSE-NEXT: callq __extenddftf2@PLT ; X64-SSE-NEXT: addq $8, %rsp ; X64-SSE-NEXT: .LBB26_2: # %cleanup ; X64-SSE-NEXT: retq @@ -1316,21 +1569,36 @@ ; X32-NEXT: popl %edi ; X32-NEXT: retl $4 ; -; X64-AVX-LABEL: TestTruncCopysign: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: cmpl $50001, %edi # imm = 0xC351 -; 
X64-AVX-NEXT: jl .LBB26_2 -; X64-AVX-NEXT: # %bb.1: # %if.then -; X64-AVX-NEXT: pushq %rax -; X64-AVX-NEXT: callq __trunctfdf2 -; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [+Inf,+Inf] -; X64-AVX-NEXT: # xmm1 = mem[0,0] -; X64-AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: callq __extenddftf2 -; X64-AVX-NEXT: addq $8, %rsp -; X64-AVX-NEXT: .LBB26_2: # %cleanup -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: TestTruncCopysign: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: cmpl $50001, %edi # imm = 0xC351 +; X64-AVX1-NEXT: jl .LBB26_2 +; X64-AVX1-NEXT: # %bb.1: # %if.then +; X64-AVX1-NEXT: pushq %rax +; X64-AVX1-NEXT: callq __trunctfdf2@PLT +; X64-AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [+Inf,+Inf] +; X64-AVX1-NEXT: # xmm1 = mem[0,0] +; X64-AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: callq __extenddftf2@PLT +; X64-AVX1-NEXT: addq $8, %rsp +; X64-AVX1-NEXT: .LBB26_2: # %cleanup +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: TestTruncCopysign: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: cmpl $50001, %edi # imm = 0xC351 +; X64-AVX512-NEXT: jl .LBB26_2 +; X64-AVX512-NEXT: # %bb.1: # %if.then +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: callq __trunctfdf2@PLT +; X64-AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [+Inf,+Inf] +; X64-AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX512-NEXT: callq __extenddftf2@PLT +; X64-AVX512-NEXT: addq $8, %rsp +; X64-AVX512-NEXT: .LBB26_2: # %cleanup +; X64-AVX512-NEXT: retq entry: %cmp = icmp sgt i32 %n, 50000 br i1 %cmp, label %if.then, label %cleanup @@ -1367,15 +1635,25 @@ ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; X64-AVX-LABEL: PR34866: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi -; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi -; X64-AVX-NEXT: orq %rsi, %rdi -; X64-AVX-NEXT: sete %al -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: PR34866: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi +; X64-AVX1-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi +; X64-AVX1-NEXT: orq %rsi, %rdi +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: PR34866: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi +; X64-AVX512-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi +; X64-AVX512-NEXT: orq %rsi, %rdi +; X64-AVX512-NEXT: sete %al +; X64-AVX512-NEXT: retq %bc_mmx = bitcast fp128 0xL00000000000000000000000000000000 to i128 %cmp = icmp eq i128 %bc_mmx, %x ret i1 %cmp @@ -1402,15 +1680,25 @@ ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; X64-AVX-LABEL: PR34866_commute: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi -; X64-AVX-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi -; X64-AVX-NEXT: orq %rsi, %rdi -; X64-AVX-NEXT: sete %al -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: PR34866_commute: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi +; X64-AVX1-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi +; X64-AVX1-NEXT: orq %rsi, %rdi +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: retq +; 
+; X64-AVX512-LABEL: PR34866_commute: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi +; X64-AVX512-NEXT: xorq -{{[0-9]+}}(%rsp), %rdi +; X64-AVX512-NEXT: orq %rsi, %rdi +; X64-AVX512-NEXT: sete %al +; X64-AVX512-NEXT: retq %bc_mmx = bitcast fp128 0xL00000000000000000000000000000000 to i128 %cmp = icmp eq i128 %x, %bc_mmx ret i1 %cmp diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx2 -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-android -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+mmx,avx512vl -enable-legalize-types-checking | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; These tests were generated from simplified libm C code. 
; When compiled for the x86_64-linux-android target, @@ -62,7 +62,7 @@ ; ; AVX-LABEL: TestUnionLD1: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; AVX-NEXT: shlq $48, %rax ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx @@ -71,7 +71,7 @@ ; AVX-NEXT: orq %rax, %rdx ; AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp foo # TAILCALL entry: %0 = bitcast fp128 %s to i128 @@ -106,11 +106,11 @@ ; ; AVX-LABEL: TestUnionLD2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: retq entry: %0 = bitcast fp128 %s to i128 @@ -139,7 +139,7 @@ ; SSE-NEXT: movq %rcx, (%rsp) ; SSE-NEXT: movaps (%rsp), %xmm0 ; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 -; SSE-NEXT: callq __lttf2 +; SSE-NEXT: callq __lttf2@PLT ; SSE-NEXT: xorl %ecx, %ecx ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sets %cl @@ -151,20 +151,20 @@ ; AVX-LABEL: TestI128_1: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF ; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; AVX-NEXT: movq %rcx, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %xmm0 -; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 -; AVX-NEXT: callq __lttf2 +; AVX-NEXT: vmovdqa (%rsp), %xmm0 +; AVX-NEXT: vmovdqa {{.*}}(%rip), %xmm1 +; AVX-NEXT: callq __lttf2@PLT ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sets %cl ; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: vmovaps {{\.LCPI.*}}(%rcx), %xmm0 +; AVX-NEXT: vmovdqa {{\.LCPI.*}}(%rcx), %xmm0 ; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: retq entry: @@ -198,11 +198,11 @@ ; ; AVX-LABEL: TestI128_2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: jns .LBB3_2 ; AVX-NEXT: # %bb.1: # %entry -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: .LBB3_2: # %entry ; AVX-NEXT: retq entry: @@ -237,7 +237,7 @@ ; SSE-NEXT: jmp .LBB4_3 ; SSE-NEXT: .LBB4_2: # %if.then ; SSE-NEXT: movaps {{.*}}(%rip), %xmm1 -; SSE-NEXT: callq __multf3 +; SSE-NEXT: callq __multf3@PLT ; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF @@ -254,7 +254,7 @@ ; AVX-LABEL: TestI128_3: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000 ; AVX-NEXT: testq %rcx, %rax @@ -263,9 +263,9 @@ ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: jmp .LBB4_3 ; AVX-NEXT: .LBB4_2: # %if.then -; AVX-NEXT: vmovaps {{.*}}(%rip), %xmm1 -; AVX-NEXT: callq __multf3 -; AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa {{.*}}(%rip), %xmm1 +; AVX-NEXT: callq __multf3@PLT +; AVX-NEXT: vmovdqa %xmm0, 
{{[0-9]+}}(%rsp) ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF ; AVX-NEXT: andq {{[0-9]+}}(%rsp), %rdx @@ -274,7 +274,7 @@ ; AVX-NEXT: .LBB4_3: # %if.end ; AVX-NEXT: movq %rcx, (%rsp) ; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps (%rsp), %xmm0 +; AVX-NEXT: vmovdqa (%rsp), %xmm0 ; AVX-NEXT: addq $56, %rsp ; AVX-NEXT: retq entry: @@ -319,12 +319,12 @@ ; ; AVX-LABEL: TestI128_4: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, %xmm1 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, %xmm1 +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp __addtf3@PLT # TAILCALL entry: %0 = bitcast fp128 %x to i128 @@ -374,12 +374,12 @@ ; ; AVX-LABEL: acosl: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, %xmm1 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, %xmm1 +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: jmp __addtf3@PLT # TAILCALL entry: %0 = bitcast fp128 %x to i128 @@ -403,11 +403,11 @@ ; ; AVX-LABEL: TestComp: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: jns .LBB8_2 ; AVX-NEXT: # %bb.1: # %entry -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: .LBB8_2: # %entry ; AVX-NEXT: retq entry: @@ -426,10 +426,15 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: TestFABS_LD: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: TestFABS_LD: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: TestFABS_LD: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %call = tail call fp128 @fabsl(fp128 %x) #2 ret fp128 %call @@ -451,11 +456,11 @@ ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: callq __gttf2 +; SSE-NEXT: callq __gttf2@PLT ; SSE-NEXT: movl %eax, %ebp ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: callq __subtf3 +; SSE-NEXT: callq __subtf3@PLT ; SSE-NEXT: testl %ebp, %ebp ; SSE-NEXT: jle .LBB10_1 ; SSE-NEXT: # %bb.2: # %if.then @@ -478,41 +483,77 @@ ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX-LABEL: TestCopySign: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: callq __gttf2 -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vmovaps %xmm0, %xmm1 -; AVX-NEXT: callq __subtf3 -; AVX-NEXT: testl %ebp, %ebp -; AVX-NEXT: jle .LBB10_1 -; 
AVX-NEXT: # %bb.2: # %if.then -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: jmp .LBB10_3 -; AVX-NEXT: .LBB10_1: -; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX-NEXT: .LBB10_3: # %cleanup -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm2, (%rbx) -; AVX-NEXT: vmovaps %xmm0, 16(%rbx) -; AVX-NEXT: movq %rbx, %rax -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX2-LABEL: TestCopySign: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $40, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm1 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: callq __gttf2@PLT +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovdqa %xmm0, %xmm1 +; AVX2-NEXT: callq __subtf3@PLT +; AVX2-NEXT: testl %ebp, %ebp +; AVX2-NEXT: jle .LBB10_1 +; AVX2-NEXT: # %bb.2: # %if.then +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: jmp .LBB10_3 +; AVX2-NEXT: .LBB10_1: +; AVX2-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-NEXT: .LBB10_3: # %cleanup +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm2, (%rbx) +; AVX2-NEXT: vmovdqa %xmm0, 16(%rbx) +; AVX2-NEXT: movq %rbx, %rax +; AVX2-NEXT: addq $40, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: TestCopySign: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __gttf2@PLT +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: callq __subtf3@PLT +; AVX512-NEXT: testl %ebp, %ebp +; AVX512-NEXT: jle .LBB10_1 +; AVX512-NEXT: # %bb.2: # %if.then +; AVX512-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm2 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: jmp .LBB10_3 +; AVX512-NEXT: .LBB10_1: +; AVX512-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX512-NEXT: .LBB10_3: # %cleanup +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm2, (%rbx) +; AVX512-NEXT: vmovaps %xmm0, 16(%rbx) +; AVX512-NEXT: movq %rbx, %rax +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq entry: %z.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 0 %z.real = load fp128, fp128* %z.realp, align 16 diff --git 
a/llvm/test/CodeGen/X86/function-subtarget-features.ll b/llvm/test/CodeGen/X86/function-subtarget-features.ll --- a/llvm/test/CodeGen/X86/function-subtarget-features.ll +++ b/llvm/test/CodeGen/X86/function-subtarget-features.ll @@ -12,7 +12,7 @@ } ; CHECK: barv -; CHECK: vmovss +; CHECK: vmovd define float @_Z4testv() #1 { entry: @@ -36,7 +36,7 @@ } ; CHECK: bazv -; CHECK: vmovss +; CHECK: vmovd define <2 x i64> @foo(<2 x i64> %a) #3 { entry: diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -887,7 +887,7 @@ ; ; X64-AVX2-LABEL: fshr_v4i32_shift_by_bitwidth: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovaps %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; X64-AVX2-NEXT: retq %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> ) ret <4 x i32> %f diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll --- a/llvm/test/CodeGen/X86/i64-mem-copy.ll +++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll @@ -25,8 +25,8 @@ ; X32AVX: # %bb.0: ; X32AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32AVX-NEXT: vmovsd %xmm0, (%eax) +; X32AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X32AVX-NEXT: vmovq %xmm0, (%eax) ; X32AVX-NEXT: retl %tmp1 = load i64, i64* %y, align 8 store i64 %tmp1, i64* %x, align 8 @@ -152,15 +152,15 @@ ; X32AVX-NEXT: movl %esp, %ebp ; X32AVX-NEXT: andl $-32, %esp ; X32AVX-NEXT: subl $96, %esp -; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X32AVX-NEXT: movl 52(%ebp), %eax ; X32AVX-NEXT: andl $7, %eax ; X32AVX-NEXT: movl 48(%ebp), %ecx -; X32AVX-NEXT: vmovups 8(%ebp), %ymm1 -; X32AVX-NEXT: vmovaps %ymm1, (%esp) -; X32AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32AVX-NEXT: vmovsd %xmm0, (%ecx) +; X32AVX-NEXT: vmovdqu 8(%ebp), %ymm1 +; X32AVX-NEXT: vmovdqa %ymm1, (%esp) +; X32AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp) +; X32AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X32AVX-NEXT: vmovq %xmm0, (%ecx) ; X32AVX-NEXT: movl %ebp, %esp ; X32AVX-NEXT: popl %ebp ; X32AVX-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -32,7 +32,7 @@ ; ; X86-AVX512DQ-LABEL: mask_sitofp_2i64_2f64: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX512DQ-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX512DQ-NEXT: vcvtqq2pd %xmm0, %xmm0 ; X86-AVX512DQ-NEXT: retl ; @@ -57,7 +57,7 @@ ; ; X64-AVX512DQ-LABEL: mask_sitofp_2i64_2f64: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: vcvtqq2pd %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %and = and <2 x i64> %a, @@ -87,7 +87,7 @@ ; ; X86-AVX512DQ-LABEL: mask_uitofp_2i64_2f64: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX512DQ-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX512DQ-NEXT: vcvtqq2pd %xmm0, %xmm0 ; X86-AVX512DQ-NEXT: retl ; @@ -112,7 +112,7 @@ ; ; X64-AVX512DQ-LABEL: mask_uitofp_2i64_2f64: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX512DQ-NEXT: vcvtqq2pd 
%xmm0, %xmm0 ; X64-AVX512DQ-NEXT: retq %and = and <2 x i64> %a, @@ -147,7 +147,7 @@ ; ; X86-AVX512DQ-LABEL: mask_sitofp_4i64_4f32: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; X86-AVX512DQ-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X86-AVX512DQ-NEXT: vcvtqq2ps %ymm0, %xmm0 ; X86-AVX512DQ-NEXT: vzeroupper ; X86-AVX512DQ-NEXT: retl @@ -178,7 +178,7 @@ ; ; X64-AVX512DQ-LABEL: mask_sitofp_4i64_4f32: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: vcvtqq2ps %ymm0, %xmm0 ; X64-AVX512DQ-NEXT: vzeroupper ; X64-AVX512DQ-NEXT: retq @@ -214,7 +214,7 @@ ; ; X86-AVX512DQ-LABEL: mask_uitofp_4i64_4f32: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; X86-AVX512DQ-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X86-AVX512DQ-NEXT: vcvtqq2ps %ymm0, %xmm0 ; X86-AVX512DQ-NEXT: vzeroupper ; X86-AVX512DQ-NEXT: retl @@ -245,7 +245,7 @@ ; ; X64-AVX512DQ-LABEL: mask_uitofp_4i64_4f32: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: vcvtqq2ps %ymm0, %xmm0 ; X64-AVX512DQ-NEXT: vzeroupper ; X64-AVX512DQ-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -146,17 +146,29 @@ ; X64-SSE4-NEXT: pinsrq $0, %rdi, %xmm0 ; X64-SSE4-NEXT: retq ; -; X86-AVX-LABEL: elt0_v2i64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: elt0_v2i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-AVX1-NEXT: retl ; ; X64-AVX-LABEL: elt0_v2i64: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = ; X64-AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq +; +; X86-AVX2-LABEL: elt0_v2i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-AVX2-NEXT: retl +; +; X86-AVX512F-LABEL: elt0_v2i64: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-AVX512F-NEXT: retl %ins = insertelement <2 x i64> , i64 %x, i32 0 ret <2 x i64> %ins } @@ -225,11 +237,23 @@ ; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-AVX-NEXT: retl ; -; X64-AVX-LABEL: elt1_v2f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u> -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: elt1_v2f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u> +; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: elt1_v2f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <4.2E+1,u> +; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX2-NEXT: retq +; +; X64-AVX512F-LABEL: elt1_v2f64: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4.2E+1,u> +; X64-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX512F-NEXT: retq %ins = insertelement <2 x double> , double %x, i32 1 ret <2 x double> 
%ins } @@ -267,11 +291,11 @@ ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X64-SSE4-NEXT: retq ; -; X86-AVX-LABEL: elt7_v8i32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 -; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: elt7_v8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; X86-AVX1-NEXT: retl ; ; X64-AVX1-LABEL: elt7_v8i32: ; X64-AVX1: # %bb.0: @@ -281,6 +305,12 @@ ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; X64-AVX1-NEXT: retq ; +; X86-AVX2-LABEL: elt7_v8i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 +; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; X86-AVX2-NEXT: retl +; ; X64-AVX2-LABEL: elt7_v8i32: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovd %edi, %xmm0 @@ -288,6 +318,12 @@ ; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; X64-AVX2-NEXT: retq ; +; X86-AVX512F-LABEL: elt7_v8i32: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 +; X86-AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; X86-AVX512F-NEXT: retl +; ; X64-AVX512F-LABEL: elt7_v8i32: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovd %edi, %xmm0 @@ -330,11 +366,11 @@ ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0] ; X64-SSE4-NEXT: retq ; -; X86-AVX-LABEL: elt6_v8f32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 -; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: elt6_v8f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] +; X86-AVX1-NEXT: retl ; ; X64-AVX1-LABEL: elt6_v8f32: ; X64-AVX1: # %bb.0: @@ -343,16 +379,28 @@ ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] ; X64-AVX1-NEXT: retq ; +; X86-AVX2-LABEL: elt6_v8f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 +; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] +; X86-AVX2-NEXT: retl +; ; X64-AVX2-LABEL: elt6_v8f32: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] +; X64-AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] ; X64-AVX2-NEXT: retq ; +; X86-AVX512F-LABEL: elt6_v8f32: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 +; X86-AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] +; X86-AVX512F-NEXT: retl +; ; X64-AVX512F-LABEL: elt6_v8f32: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] +; X64-AVX512F-NEXT: vpbroadcastd %xmm0, %ymm0 +; X64-AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7] ; X64-AVX512F-NEXT: retq %ins = insertelement <8 x float> , float %x, i32 6 ret <8 x float> %ins @@ -407,11 +455,11 @@ ; ; X86-AVX2-LABEL: elt5_v8i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0] -; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X86-AVX2-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm1 -; X86-AVX2-NEXT: 
vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] +; X86-AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,0,0,0] +; X86-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X86-AVX2-NEXT: vinserti128 $1, {{\.LCPI.*}}, %ymm0, %ymm1 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: elt5_v8i64: @@ -419,17 +467,17 @@ ; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <4,u,6,7> ; X64-AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [42,1,2,3] ; X64-AVX2-NEXT: retq ; ; X86-AVX512F-LABEL: elt5_v8i64: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0] -; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; X86-AVX512F-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm1, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86-AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0] +; X86-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; X86-AVX512F-NEXT: vinserti128 $1, {{\.LCPI.*}}, %ymm1, %ymm1 +; X86-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X64-AVX512F-LABEL: elt5_v8i64: @@ -484,15 +532,15 @@ ; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> ; X86-AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: elt1_v8f64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> -; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64-AVX2-NEXT: retq ; ; X86-AVX512F-LABEL: elt1_v8f64: @@ -505,10 +553,10 @@ ; ; X64-AVX512F-LABEL: elt1_v8f64: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> -; X64-AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> -; X64-AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 +; X64-AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X64-AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X64-AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq %ins = insertelement <8 x double> , double %x, i32 1 ret <8 x double> %ins diff --git a/llvm/test/CodeGen/X86/insert-loaded-scalar.ll b/llvm/test/CodeGen/X86/insert-loaded-scalar.ll --- a/llvm/test/CodeGen/X86/insert-loaded-scalar.ll +++ 
b/llvm/test/CodeGen/X86/insert-loaded-scalar.ll @@ -53,10 +53,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_elt0_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_elt0_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_elt0_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq %x = load i32, i32* %p %ins = insertelement <4 x i32> undef, i32 %x, i32 0 ret <4 x i32> %ins @@ -68,10 +73,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_elt0_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_elt0_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_elt0_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq %x = load i64, i64* %p %ins = insertelement <2 x i64> undef, i64 %x, i32 0 ret <2 x i64> %ins @@ -83,10 +93,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_elt0_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_elt0_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_elt0_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq %x = load float, float* %p %ins = insertelement <4 x float> undef, float %x, i32 0 ret <4 x float> %ins @@ -98,10 +113,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_elt0_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_elt0_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_elt0_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq %x = load double, double* %p %ins = insertelement <2 x double> undef, double %x, i32 0 ret <2 x double> %ins @@ -162,10 +182,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_eltc_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_eltc_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_eltc_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: retq %x = load i32, i32* %p %ins = insertelement <4 x i32> undef, i32 %x, i32 2 ret <4 x i32> %ins @@ -178,10 +203,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_eltc_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_eltc_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_eltc_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq %x = load i64, i64* %p %ins = insertelement <2 x i64> undef, i64 %x, i32 1 ret <2 x i64> %ins @@ -194,10 +224,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_eltc_v4f32: 
-; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_eltc_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_eltc_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: retq %x = load float, float* %p %ins = insertelement <4 x float> undef, float %x, i32 3 ret <4 x float> %ins @@ -209,10 +244,15 @@ ; SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_eltc_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_eltc_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_eltc_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq %x = load double, double* %p %ins = insertelement <2 x double> undef, double %x, i32 1 ret <2 x double> %ins @@ -268,10 +308,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_elt0_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_elt0_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_elt0_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq %x = load i32, i32* %p %ins = insertelement <8 x i32> undef, i32 %x, i32 0 ret <8 x i32> %ins @@ -283,10 +328,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_elt0_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_elt0_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_elt0_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq %x = load i64, i64* %p %ins = insertelement <4 x i64> undef, i64 %x, i32 0 ret <4 x i64> %ins @@ -298,10 +348,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_elt0_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_elt0_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_elt0_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq %x = load float, float* %p %ins = insertelement <8 x float> undef, float %x, i32 0 ret <8 x float> %ins @@ -313,10 +368,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_elt0_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_elt0_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_elt0_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq %x = load double, double* %p %ins = insertelement <4 x double> undef, double %x, i32 0 ret <4 x double> %ins @@ -379,10 +439,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_eltc_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_eltc_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: 
vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_eltc_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq %x = load i32, i32* %p %ins = insertelement <8 x i32> undef, i32 %x, i32 7 ret <8 x i32> %ins @@ -395,10 +460,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_eltc_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_eltc_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_eltc_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq %x = load i64, i64* %p %ins = insertelement <4 x i64> undef, i64 %x, i32 3 ret <4 x i64> %ins @@ -411,10 +481,15 @@ ; SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] ; SSE-NEXT: retq ; -; AVX-LABEL: load32_ins_eltc_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load32_ins_eltc_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load32_ins_eltc_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq %x = load float, float* %p %ins = insertelement <8 x float> undef, float %x, i32 5 ret <8 x float> %ins @@ -426,10 +501,15 @@ ; SSE-NEXT: movddup {{.*#+}} xmm1 = mem[0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: load64_ins_eltc_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load64_ins_eltc_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load64_ins_eltc_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq %x = load double, double* %p %ins = insertelement <4 x double> undef, double %x, i32 3 ret <4 x double> %ins diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll --- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -72,8 +72,8 @@ define <8 x i64> @insert_subvector_into_undef(i32 %x0, i32 %x1) nounwind { ; X86_AVX256-LABEL: insert_subvector_into_undef: ; X86_AVX256: # %bb.0: -; X86_AVX256-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 -; X86_AVX256-NEXT: vmovaps %ymm0, %ymm1 +; X86_AVX256-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 +; X86_AVX256-NEXT: vmovdqa %ymm0, %ymm1 ; X86_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_into_undef: @@ -86,7 +86,7 @@ ; ; X86_AVX512-LABEL: insert_subvector_into_undef: ; X86_AVX512: # %bb.0: -; X86_AVX512-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 +; X86_AVX512-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm0 ; X86_AVX512-NEXT: retl ; ; X64_AVX512-LABEL: insert_subvector_into_undef: diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -171,12 +171,12 @@ ; ; AVX2-LABEL: arg_f32_v4f32_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: arg_f32_v4f32_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512-NEXT: retq %ins = insertelement <4 x float> undef, float %x, i32 %y ret <4 x float> %ins @@ -193,10 +193,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: 
arg_f64_v2f64_undef: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: arg_f64_v2f64_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_f64_v2f64_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_f64_v2f64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512-NEXT: retq %ins = insertelement <2 x double> undef, double %x, i32 %y ret <2 x double> %ins } @@ -279,10 +289,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: load_i32_v4i32_undef: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_i32_v4i32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i32_v4i32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i32_v4i32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX512-NEXT: retq %x = load i32, i32* %p %ins = insertelement <4 x i32> undef, i32 %x, i32 %y ret <4 x i32> %ins @@ -295,10 +315,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: load_i64_v2i64_undef: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: load_i64_v2i64_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i64_v2i64_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i64_v2i64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX512-NEXT: retq %x = load i64, i64* %p %ins = insertelement <2 x i64> undef, i64 %x, i32 %y ret <2 x i64> %ins @@ -311,10 +341,20 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: load_f32_v4f32_undef: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_f32_v4f32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f32_v4f32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f32_v4f32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX512-NEXT: retq %x = load float, float* %p %ins = insertelement <4 x float> undef, float %x, i32 %y ret <4 x float> %ins @@ -332,10 +372,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: load_f64_v2f64_undef: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: load_f64_v2f64_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f64_v2f64_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f64_v2f64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX512-NEXT: retq %x = load double, double* %p %ins = insertelement <2 x double> undef, double %x, i32 %y ret <2 x double> %ins @@ -497,12 +547,12 @@ ; ; AVX2-LABEL: arg_f32_v8f32_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: arg_f32_v8f32_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512-NEXT: 
retq %ins = insertelement <8 x float> undef, float %x, i32 %y ret <8 x float> %ins @@ -526,12 +576,12 @@ ; ; AVX2-LABEL: arg_f64_v4f64_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: arg_f64_v4f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: retq %ins = insertelement <4 x double> undef, double %x, i32 %y ret <4 x double> %ins @@ -616,10 +666,20 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i32_v8i32_undef: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_i32_v8i32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i32_v8i32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i32_v8i32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX512-NEXT: retq %x = load i32, i32* %p %ins = insertelement <8 x i32> undef, i32 %x, i32 %y ret <8 x i32> %ins @@ -636,10 +696,20 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i64_v4i64_undef: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_i64_v4i64_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i64_v4i64_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i64_v4i64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq %x = load i64, i64* %p %ins = insertelement <4 x i64> undef, i64 %x, i32 %y ret <4 x i64> %ins @@ -656,10 +726,20 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_f32_v8f32_undef: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_f32_v8f32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f32_v8f32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f32_v8f32_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX512-NEXT: retq %x = load float, float* %p %ins = insertelement <8 x float> undef, float %x, i32 %y ret <8 x float> %ins @@ -676,10 +756,20 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_f64_v4f64_undef: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_f64_v4f64_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f64_v4f64_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f64_v4f64_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq %x = load double, double* %p %ins = insertelement <4 x double> undef, double %x, i32 %y ret <4 x double> %ins @@ -699,14 +789,32 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_i8_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movb %dil, -24(%rsp,%rsi) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: arg_i8_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: 
def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: movb %dil, -24(%rsp,%rsi) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i8_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: movb %dil, -24(%rsp,%rsi) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i8_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $15, %esi +; AVX512-NEXT: movb %dil, -24(%rsp,%rsi) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %ins = insertelement <16 x i8> %v, i8 %x, i32 %y ret <16 x i8> %ins } @@ -721,14 +829,32 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_i16_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: movw %di, -24(%rsp,%rsi,2) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: arg_i16_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: movw %di, -24(%rsp,%rsi,2) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i16_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: movw %di, -24(%rsp,%rsi,2) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i16_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $7, %esi +; AVX512-NEXT: movw %di, -24(%rsp,%rsi,2) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %ins = insertelement <8 x i16> %v, i16 %x, i32 %y ret <8 x i16> %ins } @@ -743,14 +869,32 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_i32_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: movl %edi, -24(%rsp,%rsi,4) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: arg_i32_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: movl %edi, -24(%rsp,%rsi,4) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i32_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: movl %edi, -24(%rsp,%rsi,4) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i32_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: movl %edi, -24(%rsp,%rsi,4) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %ins = insertelement <4 x i32> %v, i32 %x, i32 %y ret <4 x i32> %ins } @@ -765,14 +909,32 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; 
AVX-LABEL: arg_i64_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $1, %esi -; AVX-NEXT: movq %rdi, -24(%rsp,%rsi,8) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: arg_i64_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: movq %rdi, -24(%rsp,%rsi,8) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i64_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: movq %rdi, -24(%rsp,%rsi,8) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i64_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $1, %esi +; AVX512-NEXT: movq %rdi, -24(%rsp,%rsi,8) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %ins = insertelement <2 x i64> %v, i64 %x, i32 %y ret <2 x i64> %ins } @@ -787,14 +949,32 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_f32_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovss %xmm1, -24(%rsp,%rdi,4) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: arg_f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: vmovss %xmm1, -24(%rsp,%rdi,4) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: vmovd %xmm1, -24(%rsp,%rdi,4) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $edi killed $edi def $rdi +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $3, %edi +; AVX512-NEXT: vmovd %xmm1, -24(%rsp,%rdi,4) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %ins = insertelement <4 x float> %v, float %x, i32 %y ret <4 x float> %ins } @@ -809,14 +989,32 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_f64_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovsd %xmm1, -24(%rsp,%rdi,8) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: arg_f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vmovsd %xmm1, -24(%rsp,%rdi,8) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vmovq %xmm1, -24(%rsp,%rdi,8) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def 
$edi killed $edi def $rdi +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $1, %edi +; AVX512-NEXT: vmovq %xmm1, -24(%rsp,%rdi,8) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %ins = insertelement <2 x double> %v, double %x, i32 %y ret <2 x double> %ins } @@ -832,15 +1030,35 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i8_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movb (%rdi), %al -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movb %al, -24(%rsp,%rsi) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_i8_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: movb (%rdi), %al +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: movb %al, -24(%rsp,%rsi) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i8_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movb (%rdi), %al +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: movb %al, -24(%rsp,%rsi) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i8_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movb (%rdi), %al +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $15, %esi +; AVX512-NEXT: movb %al, -24(%rsp,%rsi) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %x = load i8, i8* %p %ins = insertelement <16 x i8> %v, i8 %x, i32 %y ret <16 x i8> %ins @@ -857,15 +1075,35 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i16_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: movw %ax, -24(%rsp,%rsi,2) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_i16_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: movw %ax, -24(%rsp,%rsi,2) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i16_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: movw %ax, -24(%rsp,%rsi,2) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i16_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $7, %esi +; AVX512-NEXT: movw %ax, -24(%rsp,%rsi,2) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %x = load i16, i16* %p %ins = insertelement <8 x i16> %v, i16 %x, i32 %y ret <8 x i16> %ins @@ -882,15 +1120,35 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i32_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: movl %eax, -24(%rsp,%rsi,4) -; AVX-NEXT: vmovaps 
-{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_i32_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: movl %eax, -24(%rsp,%rsi,4) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i32_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: movl %eax, -24(%rsp,%rsi,4) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i32_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: movl %eax, -24(%rsp,%rsi,4) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %x = load i32, i32* %p %ins = insertelement <4 x i32> %v, i32 %x, i32 %y ret <4 x i32> %ins @@ -907,15 +1165,35 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i64_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $1, %esi -; AVX-NEXT: movq %rax, -24(%rsp,%rsi,8) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_i64_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: movq %rax, -24(%rsp,%rsi,8) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i64_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: movq %rax, -24(%rsp,%rsi,8) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i64_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $1, %esi +; AVX512-NEXT: movq %rax, -24(%rsp,%rsi,8) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %x = load i64, i64* %p %ins = insertelement <2 x i64> %v, i64 %x, i32 %y ret <2 x i64> %ins @@ -932,15 +1210,35 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_f32_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: vmovss %xmm1, -24(%rsp,%rsi,4) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovss %xmm1, -24(%rsp,%rsi,4) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; 
AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovd %xmm1, -24(%rsp,%rsi,4) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: vmovd %xmm1, -24(%rsp,%rsi,4) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %x = load float, float* %p %ins = insertelement <4 x float> %v, float %x, i32 %y ret <4 x float> %ins @@ -957,15 +1255,35 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_f64_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $1, %esi -; AVX-NEXT: vmovsd %xmm1, -24(%rsp,%rsi,8) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: vmovsd %xmm1, -24(%rsp,%rsi,8) +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vmovq %xmm1, -24(%rsp,%rsi,8) +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: andl $1, %esi +; AVX512-NEXT: vmovq %xmm1, -24(%rsp,%rsi,8) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; AVX512-NEXT: retq %x = load double, double* %p %ins = insertelement <2 x double> %v, double %x, i32 %y ret <2 x double> %ins @@ -983,20 +1301,50 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_i8_v32i8: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $31, %esi -; AVX-NEXT: movb %dil, (%rsp,%rsi) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: arg_i8_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $31, %esi +; AVX1-NEXT: movb %dil, (%rsp,%rsi) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i8_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $31, %esi +; AVX2-NEXT: movb %dil, (%rsp,%rsi) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i8_v32i8: +; AVX512: # %bb.0: +; 
AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $31, %esi +; AVX512-NEXT: movb %dil, (%rsp,%rsi) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %ins = insertelement <32 x i8> %v, i8 %x, i32 %y ret <32 x i8> %ins } @@ -1013,20 +1361,50 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_i16_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movw %di, (%rsp,%rsi,2) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: arg_i16_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: movw %di, (%rsp,%rsi,2) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i16_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: movw %di, (%rsp,%rsi,2) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i16_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $15, %esi +; AVX512-NEXT: movw %di, (%rsp,%rsi,2) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %ins = insertelement <16 x i16> %v, i16 %x, i32 %y ret <16 x i16> %ins } @@ -1043,20 +1421,50 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_i32_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: movl %edi, (%rsp,%rsi,4) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: arg_i32_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: movl %edi, (%rsp,%rsi,4) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i32_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl 
$7, %esi +; AVX2-NEXT: movl %edi, (%rsp,%rsi,4) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i32_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $7, %esi +; AVX512-NEXT: movl %edi, (%rsp,%rsi,4) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %ins = insertelement <8 x i32> %v, i32 %x, i32 %y ret <8 x i32> %ins } @@ -1073,20 +1481,50 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_i64_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: movq %rdi, (%rsp,%rsi,8) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: arg_i64_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: movq %rdi, (%rsp,%rsi,8) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_i64_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: movq %rdi, (%rsp,%rsi,8) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_i64_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: movq %rdi, (%rsp,%rsi,8) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %ins = insertelement <4 x i64> %v, i64 %x, i32 %y ret <4 x i64> %ins } @@ -1103,20 +1541,50 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_f32_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovss %xmm1, (%rsp,%rdi,4) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: arg_f32_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: vmovss %xmm1, (%rsp,%rdi,4) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_f32_v8f32: +; AVX2: # %bb.0: +; 
AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: vmovd %xmm1, (%rsp,%rdi,4) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $edi killed $edi def $rdi +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $7, %edi +; AVX512-NEXT: vmovd %xmm1, (%rsp,%rdi,4) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %ins = insertelement <8 x float> %v, float %x, i32 %y ret <8 x float> %ins } @@ -1133,20 +1601,50 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: arg_f64_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovsd %xmm1, (%rsp,%rdi,8) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: arg_f64_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: vmovsd %xmm1, (%rsp,%rdi,8) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: arg_f64_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: vmovq %xmm1, (%rsp,%rdi,8) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: arg_f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $edi killed $edi def $rdi +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $3, %edi +; AVX512-NEXT: vmovq %xmm1, (%rsp,%rdi,8) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %ins = insertelement <4 x double> %v, double %x, i32 %y ret <4 x double> %ins } @@ -1164,21 +1662,53 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i8_v32i8: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movb (%rdi), %al -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $31, %esi -; AVX-NEXT: movb %al, (%rsp,%rsi) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: load_i8_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi 
+; AVX1-NEXT: movb (%rdi), %al +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $31, %esi +; AVX1-NEXT: movb %al, (%rsp,%rsi) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i8_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movb (%rdi), %al +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $31, %esi +; AVX2-NEXT: movb %al, (%rsp,%rsi) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i8_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movb (%rdi), %al +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $31, %esi +; AVX512-NEXT: movb %al, (%rsp,%rsi) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = load i8, i8* %p %ins = insertelement <32 x i8> %v, i8 %x, i32 %y ret <32 x i8> %ins @@ -1197,21 +1727,53 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i16_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movw %ax, (%rsp,%rsi,2) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: load_i16_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: movw %ax, (%rsp,%rsi,2) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i16_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: movw %ax, (%rsp,%rsi,2) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i16_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $15, %esi +; AVX512-NEXT: movw %ax, (%rsp,%rsi,2) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = load i16, i16* %p %ins = insertelement <16 x i16> %v, i16 %x, i32 %y ret <16 x i16> %ins @@ -1230,21 +1792,53 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i32_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: 
subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: movl %eax, (%rsp,%rsi,4) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: load_i32_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: movl %eax, (%rsp,%rsi,4) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i32_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: movl %eax, (%rsp,%rsi,4) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i32_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $7, %esi +; AVX512-NEXT: movl %eax, (%rsp,%rsi,4) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = load i32, i32* %p %ins = insertelement <8 x i32> %v, i32 %x, i32 %y ret <8 x i32> %ins @@ -1263,21 +1857,53 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_i64_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: movq %rax, (%rsp,%rsi,8) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: load_i64_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: movq %rax, (%rsp,%rsi,8) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_i64_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: movq %rax, (%rsp,%rsi,8) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_i64_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: vmovdqa 
%ymm0, (%rsp) +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: movq %rax, (%rsp,%rsi,8) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = load i64, i64* %p %ins = insertelement <4 x i64> %v, i64 %x, i32 %y ret <4 x i64> %ins @@ -1296,21 +1922,53 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_f32_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: vmovss %xmm1, (%rsp,%rsi,4) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: load_f32_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: vmovss %xmm1, (%rsp,%rsi,4) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f32_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: vmovd %xmm1, (%rsp,%rsi,4) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $7, %esi +; AVX512-NEXT: vmovd %xmm1, (%rsp,%rsi,4) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = load float, float* %p %ins = insertelement <8 x float> %v, float %x, i32 %y ret <8 x float> %ins @@ -1329,21 +1987,53 @@ ; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_f64_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: vmovsd %xmm1, (%rsp,%rsi,8) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: load_f64_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovsd %xmm1, (%rsp,%rsi,8) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_f64_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: 
pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovq %xmm1, (%rsp,%rsi,8) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: # kill: def $esi killed $esi def $rsi +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: vmovq %xmm1, (%rsp,%rsi,8) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = load double, double* %p %ins = insertelement <4 x double> %v, double %x, i32 %y ret <4 x double> %ins diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -32,11 +32,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v2f64_z1: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v2f64_z1: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v2f64_z1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = insertelement <2 x double> %a, double 0.0, i32 0 ret <2 x double> %1 } @@ -70,11 +76,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v4f64_0zz3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v4f64_0zz3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v4f64_0zz3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-NEXT: retq %1 = insertelement <4 x double> %a, double 0.0, i32 1 %2 = insertelement <4 x double> %1, double 0.0, i32 2 ret <4 x double> %2 @@ -105,11 +117,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v2i64_z1: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v2i64_z1: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v2i64_z1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = insertelement <2 x i64> %a, i64 0, i32 0 ret <2 x i64> %1 } @@ -139,11 +157,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v4i64_01z3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v4i64_01z3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v4i64_01z3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq %1 = insertelement <4 x i64> %a, i64 0, i32 2 ret <4 x i64> %1 } @@ -176,11 +200,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v4f32_01z3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v4f32_01z3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v4f32_01z3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq %1 = insertelement <4 x float> %a, float 0.0, i32 2 ret <4 x float> %1 } @@ -217,11 +247,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v8f32_z12345z7: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v8f32_z12345z7: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v8f32_z12345z7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: retq %1 = insertelement <8 x float> %a, float 0.0, i32 0 %2 = insertelement <8 x float> %1, float 0.0, i32 6 ret <8 x float> %2 @@ -255,11 +291,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v4i32_01z3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v4i32_01z3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v4i32_01z3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq %1 = insertelement <4 x i32> %a, i32 0, i32 2 ret <4 x i32> %1 } @@ -299,11 +341,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v8i32_z12345z7: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] -; AVX-NEXT: retq +; AVX1-LABEL: insert_v8i32_z12345z7: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v8i32_z12345z7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: retq %1 = insertelement <8 x i32> %a, i32 0, i32 0 %2 = insertelement <8 x i32> %1, i32 0, i32 6 ret <8 x i32> %2 @@ 
-379,10 +427,15 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_v16i16_z12345z789ABCDEz: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v16i16_z12345z789ABCDEz: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq %1 = insertelement <16 x i16> %a, i16 0, i32 0 %2 = insertelement <16 x i16> %1, i16 0, i32 6 %3 = insertelement <16 x i16> %2, i16 0, i32 15 @@ -428,7 +481,7 @@ ; ; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: retq %1 = insertelement <16 x i8> %a, i8 0, i32 0 %2 = insertelement <16 x i8> %1, i8 0, i32 15 @@ -584,13 +637,21 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: PR41512_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: PR41512_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR41512_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq %ins1 = insertelement <8 x float> zeroinitializer, float %x, i32 0 %ins2 = insertelement <8 x float> zeroinitializer, float %y, i32 0 %r = shufflevector <8 x float> %ins1, <8 x float> %ins2, <8 x i32> @@ -605,12 +666,19 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: PR41512_loads: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: PR41512_loads: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR41512_loads: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq %x = load i32, i32* %p1 %y = load i32, i32* %p2 %ins1 = insertelement <4 x i32> , i32 %x, i32 0 diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll --- a/llvm/test/CodeGen/X86/insertps-combine.ll +++ b/llvm/test/CodeGen/X86/insertps-combine.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s 
--check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX2OR512 define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) { ; SSE-LABEL: shuffle_v4f32_0z27: @@ -148,11 +148,17 @@ ; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; SSE-NEXT: retq ; -; AVX-LABEL: insertps_undef_input1: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: insertps_undef_input1: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: insertps_undef_input1: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2OR512-NEXT: retq %res0 = fadd <4 x float> %a1, %res1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %res0, i8 21) %res2 = shufflevector <4 x float> %res1, <4 x float> zeroinitializer, <4 x i32> @@ -237,10 +243,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: consecutive_load_insertps_04zz: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: consecutive_load_insertps_04zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: consecutive_load_insertps_04zz: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: retq %p0 = getelementptr inbounds float, float* %p, i64 1 %p1 = getelementptr inbounds float, float* %p, i64 2 %s0 = load float, float* %p0 @@ -257,10 +268,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: extract_zero_insertps_z0z7: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: extract_zero_insertps_z0z7: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: extract_zero_insertps_z0z7: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2OR512-NEXT: retq %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21) %ext = extractelement <4 x float> %res, i32 0 ret float %ext diff --git a/llvm/test/CodeGen/X86/keylocker-intrinsics.ll b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll --- a/llvm/test/CodeGen/X86/keylocker-intrinsics.ll +++ b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll @@ -60,12 +60,12 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: encodekey128 %eax, %eax -; X32-NEXT: vmovaps %xmm0, (%ebp) -; X32-NEXT: vmovaps %xmm1, (%ebx) -; X32-NEXT: vmovaps %xmm2, (%edi) -; X32-NEXT: vmovaps %xmm4, (%esi) -; X32-NEXT: vmovaps %xmm5, (%edx) -; X32-NEXT: vmovaps %xmm6, (%ecx) +; X32-NEXT: vmovdqa %xmm0, (%ebp) +; X32-NEXT: vmovdqa %xmm1, (%ebx) +; X32-NEXT: vmovdqa %xmm2, (%edi) +; X32-NEXT: vmovdqa %xmm4, (%esi) +; X32-NEXT: vmovdqa %xmm5, (%edx) +; X32-NEXT: vmovdqa %xmm6, (%ecx) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -116,12 +116,12 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X32-NEXT: encodekey256 %eax, %eax -; X32-NEXT: vmovaps %xmm0, (%ebp) -; X32-NEXT: vmovaps %xmm1, (%ebx) -; X32-NEXT: vmovaps %xmm2, (%edi) -; X32-NEXT: vmovaps %xmm3, (%esi) -; X32-NEXT: vmovaps %xmm4, (%edx) -; X32-NEXT: vmovaps %xmm5, (%ecx) +; X32-NEXT: vmovdqa %xmm0, (%ebp) +; X32-NEXT: vmovdqa %xmm1, (%ebx) +; X32-NEXT: vmovdqa %xmm2, (%edi) +; X32-NEXT: vmovdqa %xmm3, (%esi) +; X32-NEXT: vmovdqa %xmm4, (%edx) +; X32-NEXT: vmovdqa %xmm5, (%ecx) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -159,7 +159,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: aesenc128kl (%eax), %xmm0 ; X32-NEXT: sete %al -; X32-NEXT: vmovaps %xmm0, (%ecx) +; X32-NEXT: vmovdqa %xmm0, (%ecx) ; X32-NEXT: retl entry: %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %data, i8* %h) @@ -183,7 +183,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: aesdec128kl (%eax), %xmm0 ; X32-NEXT: sete %al -; X32-NEXT: vmovaps %xmm0, (%ecx) +; X32-NEXT: vmovdqa %xmm0, (%ecx) ; X32-NEXT: retl entry: %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %data, i8* %h) @@ -207,7 +207,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: aesenc256kl (%eax), %xmm0 ; X32-NEXT: sete %al -; X32-NEXT: vmovaps %xmm0, (%ecx) +; X32-NEXT: vmovdqa %xmm0, (%ecx) ; X32-NEXT: retl entry: %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %data, i8* %h) @@ -231,7 +231,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: aesdec256kl (%eax), %xmm0 ; X32-NEXT: sete %al -; X32-NEXT: vmovaps %xmm0, (%ecx) +; X32-NEXT: vmovdqa %xmm0, (%ecx) ; X32-NEXT: retl entry: %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %data, i8* %h) @@ -267,29 +267,29 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp -; X32-NEXT: vmovaps 24(%ebp), %xmm3 -; X32-NEXT: vmovaps 40(%ebp), %xmm4 -; X32-NEXT: vmovaps 56(%ebp), %xmm5 -; X32-NEXT: vmovaps 72(%ebp), %xmm6 -; X32-NEXT: vmovaps 88(%ebp), %xmm7 +; X32-NEXT: vmovdqa 24(%ebp), %xmm3 +; X32-NEXT: vmovdqa 40(%ebp), %xmm4 +; X32-NEXT: vmovdqa 56(%ebp), %xmm5 +; X32-NEXT: vmovdqa 72(%ebp), %xmm6 +; X32-NEXT: vmovdqa 88(%ebp), %xmm7 ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: aesencwide128kl (%eax) ; X32-NEXT: movl 104(%ebp), %eax -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovdqa %xmm0, (%eax) ; X32-NEXT: movl 108(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 112(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 116(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 120(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 124(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 128(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 132(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: sete %al ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -342,29 +342,29 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp -; X32-NEXT: vmovaps 24(%ebp), %xmm3 -; X32-NEXT: vmovaps 40(%ebp), %xmm4 -; X32-NEXT: vmovaps 56(%ebp), %xmm5 -; X32-NEXT: vmovaps 72(%ebp), %xmm6 -; X32-NEXT: vmovaps 88(%ebp), %xmm7 +; X32-NEXT: vmovdqa 24(%ebp), %xmm3 +; X32-NEXT: vmovdqa 40(%ebp), %xmm4 +; X32-NEXT: vmovdqa 56(%ebp), %xmm5 
+; X32-NEXT: vmovdqa 72(%ebp), %xmm6 +; X32-NEXT: vmovdqa 88(%ebp), %xmm7 ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: aesdecwide128kl (%eax) ; X32-NEXT: movl 104(%ebp), %eax -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovdqa %xmm0, (%eax) ; X32-NEXT: movl 108(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 112(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 116(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 120(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 124(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 128(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 132(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: sete %al ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -417,29 +417,29 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp -; X32-NEXT: vmovaps 24(%ebp), %xmm3 -; X32-NEXT: vmovaps 40(%ebp), %xmm4 -; X32-NEXT: vmovaps 56(%ebp), %xmm5 -; X32-NEXT: vmovaps 72(%ebp), %xmm6 -; X32-NEXT: vmovaps 88(%ebp), %xmm7 +; X32-NEXT: vmovdqa 24(%ebp), %xmm3 +; X32-NEXT: vmovdqa 40(%ebp), %xmm4 +; X32-NEXT: vmovdqa 56(%ebp), %xmm5 +; X32-NEXT: vmovdqa 72(%ebp), %xmm6 +; X32-NEXT: vmovdqa 88(%ebp), %xmm7 ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: aesencwide256kl (%eax) ; X32-NEXT: movl 104(%ebp), %eax -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovdqa %xmm0, (%eax) ; X32-NEXT: movl 108(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 112(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 116(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 120(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 124(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 128(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 132(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: sete %al ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -492,29 +492,29 @@ ; X32-NEXT: movl %esp, %ebp ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp -; X32-NEXT: vmovaps 24(%ebp), %xmm3 -; X32-NEXT: vmovaps 40(%ebp), %xmm4 -; X32-NEXT: vmovaps 56(%ebp), %xmm5 -; X32-NEXT: vmovaps 72(%ebp), %xmm6 -; X32-NEXT: vmovaps 88(%ebp), %xmm7 +; X32-NEXT: vmovdqa 24(%ebp), %xmm3 +; X32-NEXT: vmovdqa 40(%ebp), %xmm4 +; X32-NEXT: vmovdqa 56(%ebp), %xmm5 +; X32-NEXT: vmovdqa 72(%ebp), %xmm6 +; X32-NEXT: vmovdqa 88(%ebp), %xmm7 ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: aesdecwide256kl (%eax) ; X32-NEXT: movl 104(%ebp), %eax -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovdqa %xmm0, (%eax) ; X32-NEXT: movl 108(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 112(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 116(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 120(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 124(%ebp), 
%eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 128(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 132(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: sete %al ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -558,7 +558,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: aesenc256kl foo, %xmm0 ; X32-NEXT: sete %al -; X32-NEXT: vmovaps %xmm0, (%ecx) +; X32-NEXT: vmovdqa %xmm0, (%ecx) ; X32-NEXT: retl entry: %h = bitcast [64 x i8]* @foo to i8* @@ -593,27 +593,27 @@ ; X32-NEXT: andl $-16, %esp ; X32-NEXT: subl $16, %esp ; X32-NEXT: movl 88(%ebp), %eax -; X32-NEXT: vmovaps 8(%ebp), %xmm3 -; X32-NEXT: vmovaps 24(%ebp), %xmm4 -; X32-NEXT: vmovaps 40(%ebp), %xmm5 -; X32-NEXT: vmovaps 56(%ebp), %xmm6 -; X32-NEXT: vmovaps 72(%ebp), %xmm7 +; X32-NEXT: vmovdqa 8(%ebp), %xmm3 +; X32-NEXT: vmovdqa 24(%ebp), %xmm4 +; X32-NEXT: vmovdqa 40(%ebp), %xmm5 +; X32-NEXT: vmovdqa 56(%ebp), %xmm6 +; X32-NEXT: vmovdqa 72(%ebp), %xmm7 ; X32-NEXT: aesdecwide256kl foo -; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vmovdqa %xmm0, (%eax) ; X32-NEXT: movl 92(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 96(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 100(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 104(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 108(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 112(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: movl 116(%ebp), %eax -; X32-NEXT: vmovaps %xmm1, (%eax) +; X32-NEXT: vmovdqa %xmm1, (%eax) ; X32-NEXT: sete %al ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -322,11 +322,23 @@ } define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { -; CHECK-LABEL: signbits_ashr_concat_ashr_extract_sitofp: -; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] -; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X86: # %bb.0: +; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X86-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-AVX1-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X64-AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X64-AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> %3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/llrint-conv.ll b/llvm/test/CodeGen/X86/llrint-conv.ll --- a/llvm/test/CodeGen/X86/llrint-conv.ll +++ b/llvm/test/CodeGen/X86/llrint-conv.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | 
FileCheck %s --check-prefixes=X86,X86-NOSSE ; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=avx | FileCheck %s --check-prefixes=X86,X86-AVX -; RUN: llc < %s -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86,X86-AVX +; RUN: llc < %s -mtriple=i686-unknown -mattr=avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1 +; RUN: llc < %s -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=X64,X64-SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 define i64 @testmsxs(float %x) { ; X86-NOSSE-LABEL: testmsxs: @@ -46,25 +46,45 @@ ; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: testmsxs: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %ebp -; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %ebp, -8 -; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: .cfi_def_cfa_register %ebp -; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $8, %esp -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vmovss %xmm0, (%esp) -; X86-AVX-NEXT: flds (%esp) -; X86-AVX-NEXT: fistpll (%esp) -; X86-AVX-NEXT: movl (%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl %ebp, %esp -; X86-AVX-NEXT: popl %ebp -; X86-AVX-NEXT: .cfi_def_cfa %esp, 4 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: testmsxs: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %ebp, -8 +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp +; X86-AVX1-NEXT: andl $-8, %esp +; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss %xmm0, (%esp) +; X86-AVX1-NEXT: flds (%esp) +; X86-AVX1-NEXT: fistpll (%esp) +; X86-AVX1-NEXT: movl (%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: testmsxs: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: pushl %ebp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX512-NEXT: .cfi_offset %ebp, -8 +; X86-AVX512-NEXT: movl %esp, %ebp +; X86-AVX512-NEXT: .cfi_def_cfa_register %ebp +; X86-AVX512-NEXT: andl $-8, %esp +; X86-AVX512-NEXT: subl $8, %esp +; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovd %xmm0, (%esp) +; X86-AVX512-NEXT: flds (%esp) +; X86-AVX512-NEXT: fistpll (%esp) +; X86-AVX512-NEXT: movl (%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX512-NEXT: movl %ebp, %esp +; X86-AVX512-NEXT: popl %ebp +; X86-AVX512-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: testmsxs: ; X64-SSE: # %bb.0: # %entry @@ -119,25 +139,45 @@ ; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: testmsxd: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %ebp -; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: .cfi_offset %ebp, 
-8 -; X86-AVX-NEXT: movl %esp, %ebp -; X86-AVX-NEXT: .cfi_def_cfa_register %ebp -; X86-AVX-NEXT: andl $-8, %esp -; X86-AVX-NEXT: subl $8, %esp -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovsd %xmm0, (%esp) -; X86-AVX-NEXT: fldl (%esp) -; X86-AVX-NEXT: fistpll (%esp) -; X86-AVX-NEXT: movl (%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl %ebp, %esp -; X86-AVX-NEXT: popl %ebp -; X86-AVX-NEXT: .cfi_def_cfa %esp, 4 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: testmsxd: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %ebp, -8 +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp +; X86-AVX1-NEXT: andl $-8, %esp +; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX1-NEXT: fldl (%esp) +; X86-AVX1-NEXT: fistpll (%esp) +; X86-AVX1-NEXT: movl (%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: testmsxd: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: pushl %ebp +; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX512-NEXT: .cfi_offset %ebp, -8 +; X86-AVX512-NEXT: movl %esp, %ebp +; X86-AVX512-NEXT: .cfi_def_cfa_register %ebp +; X86-AVX512-NEXT: andl $-8, %esp +; X86-AVX512-NEXT: subl $8, %esp +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq %xmm0, (%esp) +; X86-AVX512-NEXT: fldl (%esp) +; X86-AVX512-NEXT: fistpll (%esp) +; X86-AVX512-NEXT: movl (%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX512-NEXT: movl %ebp, %esp +; X86-AVX512-NEXT: popl %ebp +; X86-AVX512-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: testmsxd: ; X64-SSE: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -15,10 +15,15 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_float4_float3: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_float4_float3: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_float4_float3: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2 @@ -64,10 +69,15 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_float8_float3: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_float8_float3: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_float8_float3: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2 @@ -113,10 +123,15 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_float4_float3_as_float2_float: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: 
load_float4_float3_as_float2_float: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_float4_float3_as_float2_float: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq %2 = bitcast <4 x float>* %0 to <2 x float>* %3 = load <2 x float>, <2 x float>* %2, align 4 %4 = extractelement <2 x float> %3, i32 0 @@ -162,10 +177,15 @@ ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: load_float4_float3_trunc: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_float4_float3_trunc: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_float4_float3_trunc: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* %3 = load i64, i64* %2, align 16 %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2 diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefix=SSE ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512dq,+avx512bw | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512 define <4 x i32> @add_op1_constant(i32* %p) nounwind { ; SSE-LABEL: add_op1_constant: @@ -665,18 +665,27 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: callq fmodf +; SSE-NEXT: callq fmodf@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; -; AVX-LABEL: frem_op1_constant: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rax -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmodf -; AVX-NEXT: popq %rax -; AVX-NEXT: retq +; AVX1-LABEL: frem_op1_constant: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmodf@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: frem_op1_constant: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: pushq %rax +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: callq fmodf@PLT +; AVX2OR512-NEXT: popq %rax +; AVX2OR512-NEXT: retq %x = load float, float* %p %b = frem float %x, 42.0 %r = insertelement <4 x float> undef, float %b, i32 0 @@ -689,18 +698,27 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: callq fmod +; SSE-NEXT: callq fmod@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; -; AVX-LABEL: frem_op0_constant: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rax -; AVX-NEXT: vmovsd 
{{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: popq %rax -; AVX-NEXT: retq +; AVX1-LABEL: frem_op0_constant: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: frem_op0_constant: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: pushq %rax +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: callq fmod@PLT +; AVX2OR512-NEXT: popq %rax +; AVX2OR512-NEXT: retq %x = load double, double* %p %b = frem double 42.0, %x %r = insertelement <2 x double> undef, double %b, i32 0 diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -646,27 +646,49 @@ ; SSE42-NEXT: extractps $1, %xmm0, (%rdi) ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: compressstore_v2f32_v2i32: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1OR2-NEXT: vmovmskpd %xmm1, %eax -; AVX1OR2-NEXT: testb $1, %al -; AVX1OR2-NEXT: jne LBB2_1 -; AVX1OR2-NEXT: ## %bb.2: ## %else -; AVX1OR2-NEXT: testb $2, %al -; AVX1OR2-NEXT: jne LBB2_3 -; AVX1OR2-NEXT: LBB2_4: ## %else2 -; AVX1OR2-NEXT: retq -; AVX1OR2-NEXT: LBB2_1: ## %cond.store -; AVX1OR2-NEXT: vmovss %xmm0, (%rdi) -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $2, %al -; AVX1OR2-NEXT: je LBB2_4 -; AVX1OR2-NEXT: LBB2_3: ## %cond.store1 -; AVX1OR2-NEXT: vextractps $1, %xmm0, (%rdi) -; AVX1OR2-NEXT: retq +; AVX1-LABEL: compressstore_v2f32_v2i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne LBB2_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB2_3 +; AVX1-NEXT: LBB2_4: ## %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: LBB2_1: ## %cond.store +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: addq $4, %rdi +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je LBB2_4 +; AVX1-NEXT: LBB2_3: ## %cond.store1 +; AVX1-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: compressstore_v2f32_v2i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne LBB2_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB2_3 +; AVX2-NEXT: LBB2_4: ## %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: LBB2_1: ## %cond.store +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je LBB2_4 +; AVX2-NEXT: LBB2_3: ## %cond.store1 +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: compressstore_v2f32_v2i32: ; AVX512F: ## %bb.0: @@ -777,41 +799,77 @@ ; SSE42-NEXT: extractps $3, %xmm0, (%rdi) ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: compressstore_v4f32_v4i1: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1OR2-NEXT: vmovmskps %xmm1, %eax -; AVX1OR2-NEXT: testb $1, %al -; AVX1OR2-NEXT: jne LBB3_1 -; AVX1OR2-NEXT: ## %bb.2: ## %else -; AVX1OR2-NEXT: testb $2, %al -; 
AVX1OR2-NEXT: jne LBB3_3 -; AVX1OR2-NEXT: LBB3_4: ## %else2 -; AVX1OR2-NEXT: testb $4, %al -; AVX1OR2-NEXT: jne LBB3_5 -; AVX1OR2-NEXT: LBB3_6: ## %else5 -; AVX1OR2-NEXT: testb $8, %al -; AVX1OR2-NEXT: jne LBB3_7 -; AVX1OR2-NEXT: LBB3_8: ## %else8 -; AVX1OR2-NEXT: retq -; AVX1OR2-NEXT: LBB3_1: ## %cond.store -; AVX1OR2-NEXT: vmovss %xmm0, (%rdi) -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $2, %al -; AVX1OR2-NEXT: je LBB3_4 -; AVX1OR2-NEXT: LBB3_3: ## %cond.store1 -; AVX1OR2-NEXT: vextractps $1, %xmm0, (%rdi) -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $4, %al -; AVX1OR2-NEXT: je LBB3_6 -; AVX1OR2-NEXT: LBB3_5: ## %cond.store4 -; AVX1OR2-NEXT: vextractps $2, %xmm0, (%rdi) -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $8, %al -; AVX1OR2-NEXT: je LBB3_8 -; AVX1OR2-NEXT: LBB3_7: ## %cond.store7 -; AVX1OR2-NEXT: vextractps $3, %xmm0, (%rdi) -; AVX1OR2-NEXT: retq +; AVX1-LABEL: compressstore_v4f32_v4i1: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne LBB3_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB3_3 +; AVX1-NEXT: LBB3_4: ## %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne LBB3_5 +; AVX1-NEXT: LBB3_6: ## %else5 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne LBB3_7 +; AVX1-NEXT: LBB3_8: ## %else8 +; AVX1-NEXT: retq +; AVX1-NEXT: LBB3_1: ## %cond.store +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: addq $4, %rdi +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je LBB3_4 +; AVX1-NEXT: LBB3_3: ## %cond.store1 +; AVX1-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX1-NEXT: addq $4, %rdi +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je LBB3_6 +; AVX1-NEXT: LBB3_5: ## %cond.store4 +; AVX1-NEXT: vextractps $2, %xmm0, (%rdi) +; AVX1-NEXT: addq $4, %rdi +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je LBB3_8 +; AVX1-NEXT: LBB3_7: ## %cond.store7 +; AVX1-NEXT: vextractps $3, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: compressstore_v4f32_v4i1: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne LBB3_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB3_3 +; AVX2-NEXT: LBB3_4: ## %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne LBB3_5 +; AVX2-NEXT: LBB3_6: ## %else5 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne LBB3_7 +; AVX2-NEXT: LBB3_8: ## %else8 +; AVX2-NEXT: retq +; AVX2-NEXT: LBB3_1: ## %cond.store +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je LBB3_4 +; AVX2-NEXT: LBB3_3: ## %cond.store1 +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: je LBB3_6 +; AVX2-NEXT: LBB3_5: ## %cond.store4 +; AVX2-NEXT: vpextrd $2, %xmm0, (%rdi) +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: je LBB3_8 +; AVX2-NEXT: LBB3_7: ## %cond.store7 +; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: compressstore_v4f32_v4i1: ; AVX512F: ## %bb.0: @@ -1225,14 +1283,14 @@ ; ; AVX2-LABEL: compressstore_v16f32_const: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovups %ymm0, (%rdi) -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,4] -; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovups %xmm0, 32(%rdi) -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vextractps $1, %xmm0, 48(%rdi) -; AVX2-NEXT: vextractps $2, %xmm0, 52(%rdi) -; AVX2-NEXT: vextractps $3, %xmm0, 56(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 
(%rdi) +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,4] +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %xmm0, 32(%rdi) +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrd $1, %xmm0, 48(%rdi) +; AVX2-NEXT: vpextrd $2, %xmm0, 52(%rdi) +; AVX2-NEXT: vpextrd $3, %xmm0, 56(%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2181,11 +2239,11 @@ ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB6_8 ; AVX2-NEXT: LBB6_7: ## %cond.store7 -; AVX2-NEXT: vextractps $3, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_8: ## %else8 ; AVX2-NEXT: testb $16, %al -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: jne LBB6_9 ; AVX2-NEXT: ## %bb.10: ## %else11 ; AVX2-NEXT: testb $32, %al @@ -2209,11 +2267,11 @@ ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX2-NEXT: je LBB6_24 ; AVX2-NEXT: LBB6_23: ## %cond.store31 -; AVX2-NEXT: vextractps $3, %xmm1, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm1, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_24: ## %else32 ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-NEXT: jne LBB6_25 ; AVX2-NEXT: ## %bb.26: ## %else35 ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 @@ -2237,11 +2295,11 @@ ; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 ; AVX2-NEXT: je LBB6_40 ; AVX2-NEXT: LBB6_39: ## %cond.store55 -; AVX2-NEXT: vextractps $3, %xmm2, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm2, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_40: ## %else56 ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX2-NEXT: jne LBB6_41 ; AVX2-NEXT: ## %bb.42: ## %else59 ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 @@ -2265,11 +2323,11 @@ ; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; AVX2-NEXT: je LBB6_56 ; AVX2-NEXT: LBB6_55: ## %cond.store79 -; AVX2-NEXT: vextractps $3, %xmm3, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm3, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: LBB6_56: ## %else80 ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-NEXT: jne LBB6_57 ; AVX2-NEXT: ## %bb.58: ## %else83 ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 @@ -2284,146 +2342,146 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; AVX2-NEXT: LBB6_1: ## %cond.store -; AVX2-NEXT: vmovss %xmm0, (%rdi) +; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je LBB6_4 ; AVX2-NEXT: LBB6_3: ## %cond.store1 -; AVX2-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB6_6 ; AVX2-NEXT: LBB6_5: ## %cond.store4 -; AVX2-NEXT: vextractps $2, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $2, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: jne LBB6_7 ; AVX2-NEXT: jmp LBB6_8 ; AVX2-NEXT: LBB6_9: ## %cond.store10 -; AVX2-NEXT: vmovss %xmm0, (%rdi) +; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je LBB6_12 ; AVX2-NEXT: LBB6_11: ## %cond.store13 -; AVX2-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB6_14 ; AVX2-NEXT: LBB6_13: ## %cond.store16 -; AVX2-NEXT: vextractps $2, %xmm0, (%rdi) +; AVX2-NEXT: 
vpextrd $2, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB6_16 ; AVX2-NEXT: LBB6_15: ## %cond.store19 -; AVX2-NEXT: vextractps $3, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB6_18 ; AVX2-NEXT: LBB6_17: ## %cond.store22 -; AVX2-NEXT: vmovss %xmm1, (%rdi) +; AVX2-NEXT: vmovd %xmm1, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $512, %eax ## imm = 0x200 ; AVX2-NEXT: je LBB6_20 ; AVX2-NEXT: LBB6_19: ## %cond.store25 -; AVX2-NEXT: vextractps $1, %xmm1, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm1, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX2-NEXT: je LBB6_22 ; AVX2-NEXT: LBB6_21: ## %cond.store28 -; AVX2-NEXT: vextractps $2, %xmm1, (%rdi) +; AVX2-NEXT: vpextrd $2, %xmm1, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX2-NEXT: jne LBB6_23 ; AVX2-NEXT: jmp LBB6_24 ; AVX2-NEXT: LBB6_25: ## %cond.store34 -; AVX2-NEXT: vmovss %xmm0, (%rdi) +; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX2-NEXT: je LBB6_28 ; AVX2-NEXT: LBB6_27: ## %cond.store37 -; AVX2-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB6_30 ; AVX2-NEXT: LBB6_29: ## %cond.store40 -; AVX2-NEXT: vextractps $2, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $2, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB6_32 ; AVX2-NEXT: LBB6_31: ## %cond.store43 -; AVX2-NEXT: vextractps $3, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX2-NEXT: je LBB6_34 ; AVX2-NEXT: LBB6_33: ## %cond.store46 -; AVX2-NEXT: vmovss %xmm2, (%rdi) +; AVX2-NEXT: vmovd %xmm2, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 ; AVX2-NEXT: je LBB6_36 ; AVX2-NEXT: LBB6_35: ## %cond.store49 -; AVX2-NEXT: vextractps $1, %xmm2, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm2, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 ; AVX2-NEXT: je LBB6_38 ; AVX2-NEXT: LBB6_37: ## %cond.store52 -; AVX2-NEXT: vextractps $2, %xmm2, (%rdi) +; AVX2-NEXT: vpextrd $2, %xmm2, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 ; AVX2-NEXT: jne LBB6_39 ; AVX2-NEXT: jmp LBB6_40 ; AVX2-NEXT: LBB6_41: ## %cond.store58 -; AVX2-NEXT: vmovss %xmm0, (%rdi) +; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX2-NEXT: je LBB6_44 ; AVX2-NEXT: LBB6_43: ## %cond.store61 -; AVX2-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX2-NEXT: je LBB6_46 ; AVX2-NEXT: LBB6_45: ## %cond.store64 -; AVX2-NEXT: vextractps $2, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $2, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX2-NEXT: je LBB6_48 ; AVX2-NEXT: LBB6_47: ## %cond.store67 -; AVX2-NEXT: vextractps $3, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX2-NEXT: je LBB6_50 ; AVX2-NEXT: LBB6_49: ## %cond.store70 -; AVX2-NEXT: vmovss %xmm3, (%rdi) +; AVX2-NEXT: vmovd %xmm3, (%rdi) ; AVX2-NEXT: 
addq $4, %rdi ; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; AVX2-NEXT: je LBB6_52 ; AVX2-NEXT: LBB6_51: ## %cond.store73 -; AVX2-NEXT: vextractps $1, %xmm3, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm3, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; AVX2-NEXT: je LBB6_54 ; AVX2-NEXT: LBB6_53: ## %cond.store76 -; AVX2-NEXT: vextractps $2, %xmm3, (%rdi) +; AVX2-NEXT: vpextrd $2, %xmm3, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; AVX2-NEXT: jne LBB6_55 ; AVX2-NEXT: jmp LBB6_56 ; AVX2-NEXT: LBB6_57: ## %cond.store82 -; AVX2-NEXT: vmovss %xmm0, (%rdi) +; AVX2-NEXT: vmovd %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX2-NEXT: je LBB6_60 ; AVX2-NEXT: LBB6_59: ## %cond.store85 -; AVX2-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX2-NEXT: je LBB6_62 ; AVX2-NEXT: LBB6_61: ## %cond.store88 -; AVX2-NEXT: vextractps $2, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $2, %xmm0, (%rdi) ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX2-NEXT: je LBB6_64 ; AVX2-NEXT: LBB6_63: ## %cond.store91 -; AVX2-NEXT: vextractps $3, %xmm0, (%rdi) +; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3115,42 +3173,79 @@ ; SSE42-NEXT: extractps $3, %xmm0, (%rdi) ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: compressstore_v4i32_v4i32: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1OR2-NEXT: vmovmskps %xmm1, %eax -; AVX1OR2-NEXT: testb $1, %al -; AVX1OR2-NEXT: jne LBB10_1 -; AVX1OR2-NEXT: ## %bb.2: ## %else -; AVX1OR2-NEXT: testb $2, %al -; AVX1OR2-NEXT: jne LBB10_3 -; AVX1OR2-NEXT: LBB10_4: ## %else2 -; AVX1OR2-NEXT: testb $4, %al -; AVX1OR2-NEXT: jne LBB10_5 -; AVX1OR2-NEXT: LBB10_6: ## %else5 -; AVX1OR2-NEXT: testb $8, %al -; AVX1OR2-NEXT: jne LBB10_7 -; AVX1OR2-NEXT: LBB10_8: ## %else8 -; AVX1OR2-NEXT: retq -; AVX1OR2-NEXT: LBB10_1: ## %cond.store -; AVX1OR2-NEXT: vmovss %xmm0, (%rdi) -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $2, %al -; AVX1OR2-NEXT: je LBB10_4 -; AVX1OR2-NEXT: LBB10_3: ## %cond.store1 -; AVX1OR2-NEXT: vextractps $1, %xmm0, (%rdi) -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $4, %al -; AVX1OR2-NEXT: je LBB10_6 -; AVX1OR2-NEXT: LBB10_5: ## %cond.store4 -; AVX1OR2-NEXT: vextractps $2, %xmm0, (%rdi) -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $8, %al -; AVX1OR2-NEXT: je LBB10_8 -; AVX1OR2-NEXT: LBB10_7: ## %cond.store7 -; AVX1OR2-NEXT: vextractps $3, %xmm0, (%rdi) -; AVX1OR2-NEXT: retq +; AVX1-LABEL: compressstore_v4i32_v4i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskps %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne LBB10_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB10_3 +; AVX1-NEXT: LBB10_4: ## %else2 +; AVX1-NEXT: testb $4, %al +; AVX1-NEXT: jne LBB10_5 +; AVX1-NEXT: LBB10_6: ## %else5 +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: jne LBB10_7 +; AVX1-NEXT: LBB10_8: ## %else8 +; AVX1-NEXT: retq +; AVX1-NEXT: LBB10_1: ## %cond.store +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: addq $4, %rdi +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je LBB10_4 +; AVX1-NEXT: LBB10_3: ## %cond.store1 +; AVX1-NEXT: vextractps $1, %xmm0, (%rdi) +; AVX1-NEXT: addq $4, %rdi +; 
AVX1-NEXT: testb $4, %al +; AVX1-NEXT: je LBB10_6 +; AVX1-NEXT: LBB10_5: ## %cond.store4 +; AVX1-NEXT: vextractps $2, %xmm0, (%rdi) +; AVX1-NEXT: addq $4, %rdi +; AVX1-NEXT: testb $8, %al +; AVX1-NEXT: je LBB10_8 +; AVX1-NEXT: LBB10_7: ## %cond.store7 +; AVX1-NEXT: vextractps $3, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: compressstore_v4i32_v4i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskps %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne LBB10_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB10_3 +; AVX2-NEXT: LBB10_4: ## %else2 +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: jne LBB10_5 +; AVX2-NEXT: LBB10_6: ## %else5 +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: jne LBB10_7 +; AVX2-NEXT: LBB10_8: ## %else8 +; AVX2-NEXT: retq +; AVX2-NEXT: LBB10_1: ## %cond.store +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je LBB10_4 +; AVX2-NEXT: LBB10_3: ## %cond.store1 +; AVX2-NEXT: vpextrd $1, %xmm0, (%rdi) +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $4, %al +; AVX2-NEXT: je LBB10_6 +; AVX2-NEXT: LBB10_5: ## %cond.store4 +; AVX2-NEXT: vpextrd $2, %xmm0, (%rdi) +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $8, %al +; AVX2-NEXT: je LBB10_8 +; AVX2-NEXT: LBB10_7: ## %cond.store7 +; AVX2-NEXT: vpextrd $3, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: compressstore_v4i32_v4i32: ; AVX512F: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -247,28 +247,28 @@ ; AVX2-NEXT: LBB1_8: ## %else10 ; AVX2-NEXT: retq ; AVX2-NEXT: LBB1_1: ## %cond.load -; AVX2-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX2-NEXT: vmovq (%rdi), %xmm1 ## xmm1 = mem[0],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je LBB1_4 ; AVX2-NEXT: LBB1_3: ## %cond.load1 -; AVX2-NEXT: vmovhpd (%rdi), %xmm0, %xmm1 ## xmm1 = xmm0[0],mem[0] -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX2-NEXT: vmovhps (%rdi), %xmm0, %xmm1 ## xmm1 = xmm0[0,1],mem[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB1_6 ; AVX2-NEXT: LBB1_5: ## %cond.load5 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovlps (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0,1],xmm1[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB1_8 ; AVX2-NEXT: LBB1_7: ## %cond.load9 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovhpd (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovhps (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0,1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v4f64_v4i64: @@ -1164,28 +1164,51 @@ ; SSE42-NEXT: insertps $16, (%rdi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3] ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: expandload_v2f32_v2i1: -; AVX1OR2: ## %bb.0: 
-; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1OR2-NEXT: vmovmskpd %xmm1, %eax -; AVX1OR2-NEXT: testb $1, %al -; AVX1OR2-NEXT: jne LBB4_1 -; AVX1OR2-NEXT: ## %bb.2: ## %else -; AVX1OR2-NEXT: testb $2, %al -; AVX1OR2-NEXT: jne LBB4_3 -; AVX1OR2-NEXT: LBB4_4: ## %else2 -; AVX1OR2-NEXT: retq -; AVX1OR2-NEXT: LBB4_1: ## %cond.load -; AVX1OR2-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX1OR2-NEXT: addq $4, %rdi -; AVX1OR2-NEXT: testb $2, %al -; AVX1OR2-NEXT: je LBB4_4 -; AVX1OR2-NEXT: LBB4_3: ## %cond.load1 -; AVX1OR2-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: expandload_v2f32_v2i1: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne LBB4_1 +; AVX1-NEXT: ## %bb.2: ## %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne LBB4_3 +; AVX1-NEXT: LBB4_4: ## %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: LBB4_1: ## %cond.load +; AVX1-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: addq $4, %rdi +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je LBB4_4 +; AVX1-NEXT: LBB4_3: ## %cond.load1 +; AVX1-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: expandload_v2f32_v2i1: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne LBB4_1 +; AVX2-NEXT: ## %bb.2: ## %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne LBB4_3 +; AVX2-NEXT: LBB4_4: ## %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: LBB4_1: ## %cond.load +; AVX2-NEXT: vmovd (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: addq $4, %rdi +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je LBB4_4 +; AVX2-NEXT: LBB4_3: ## %cond.load1 +; AVX2-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v2f32_v2i1: ; AVX512F: ## %bb.0: @@ -1299,16 +1322,27 @@ ; SSE42-NEXT: blendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: expandload_v16f32_const: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovsd 44(%rdi), %xmm0 ## xmm0 = mem[0],zero -; AVX1OR2-NEXT: vinsertps $32, 52(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX1OR2-NEXT: vmovsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero -; AVX1OR2-NEXT: vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: expandload_v16f32_const: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovsd 44(%rdi), %xmm0 ## xmm0 = mem[0],zero +; AVX1-NEXT: vinsertps $32, 52(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: vmovsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero +; AVX1-NEXT: vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vblendps 
{{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: expandload_v16f32_const: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovsd 44(%rdi), %xmm0 ## xmm0 = mem[0],zero +; AVX2-NEXT: vinsertps $32, 52(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX2-NEXT: vmovsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero +; AVX2-NEXT: vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v16f32_const: ; AVX512F: ## %bb.0: @@ -1354,13 +1388,21 @@ ; SSE42-NEXT: movups 44(%rdi), %xmm3 ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: expandload_v16f32_const_undef: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovsd 32(%rdi), %xmm0 ## xmm0 = mem[0],zero -; AVX1OR2-NEXT: vinsertps $32, 40(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX1OR2-NEXT: vinsertf128 $1, 44(%rdi), %ymm0, %ymm1 -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: expandload_v16f32_const_undef: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovsd 32(%rdi), %xmm0 ## xmm0 = mem[0],zero +; AVX1-NEXT: vinsertps $32, 40(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: vinsertf128 $1, 44(%rdi), %ymm0, %ymm1 +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: expandload_v16f32_const_undef: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovsd 32(%rdi), %xmm0 ## xmm0 = mem[0],zero +; AVX2-NEXT: vinsertps $32, 40(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX2-NEXT: vinsertf128 $1, 44(%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v16f32_const_undef: ; AVX512F: ## %bb.0: @@ -2515,10 +2557,10 @@ ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB8_10 ; AVX2-NEXT: LBB8_9: ## %cond.load13 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je LBB8_12 @@ -2568,10 +2610,10 @@ ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB8_26 ; AVX2-NEXT: LBB8_25: ## %cond.load45 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX2-NEXT: je LBB8_28 @@ -2621,10 +2663,10 @@ ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX2-NEXT: je LBB8_42 ; AVX2-NEXT: LBB8_41: ## %cond.load77 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; AVX2-NEXT: 
vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX2-NEXT: je LBB8_44 @@ -2674,10 +2716,10 @@ ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX2-NEXT: je LBB8_58 ; AVX2-NEXT: LBB8_57: ## %cond.load109 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX2-NEXT: je LBB8_60 diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -103,8 +103,8 @@ ; AVX2-NEXT: je .LBB0_2 ; AVX2-NEXT: # %bb.1: # %cond.load ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX2-NEXT: .LBB0_2: # %else ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB0_4 @@ -262,8 +262,8 @@ ; AVX2-NEXT: je .LBB1_2 ; AVX2-NEXT: # %bb.1: # %cond.load ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX2-NEXT: .LBB1_2: # %else ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB1_4 @@ -419,8 +419,8 @@ ; AVX2-NEXT: je .LBB2_2 ; AVX2-NEXT: # %bb.1: # %cond.load ; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; AVX2-NEXT: .LBB2_2: # %else ; AVX2-NEXT: testb $2, %al ; AVX2-NEXT: je .LBB2_4 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -152,12 +152,12 @@ ; WIDEN_AVX2-NEXT: retq ; WIDEN_AVX2-NEXT: .LBB3_1: # %cond.store ; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx -; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rcx) +; WIDEN_AVX2-NEXT: vmovd %xmm0, (%rcx) ; WIDEN_AVX2-NEXT: testb $2, %al ; WIDEN_AVX2-NEXT: je .LBB3_4 ; WIDEN_AVX2-NEXT: .LBB3_3: # %cond.store1 ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; WIDEN_AVX2-NEXT: vpextrd $1, %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) ret void @@ -235,12 +235,12 @@ ; WIDEN_AVX2-NEXT: retq ; WIDEN_AVX2-NEXT: .LBB5_1: # %cond.store ; WIDEN_AVX2-NEXT: vmovq %xmm1, %rcx -; WIDEN_AVX2-NEXT: vmovss %xmm0, (%rcx) +; WIDEN_AVX2-NEXT: vmovd %xmm0, (%rcx) ; WIDEN_AVX2-NEXT: testb $2, %al ; WIDEN_AVX2-NEXT: je .LBB5_4 ; WIDEN_AVX2-NEXT: .LBB5_3: # %cond.store1 ; WIDEN_AVX2-NEXT: vpextrq 
$1, %xmm1, %rax -; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; WIDEN_AVX2-NEXT: vpextrd $1, %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq %gep = getelementptr i32, i32 *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask) diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -22,14 +22,32 @@ ; SSE-NEXT: LBB0_2: ## %else ; SSE-NEXT: retq ; -; AVX-LABEL: load_v1f64_v1i64: -; AVX: ## %bb.0: -; AVX-NEXT: testq %rdi, %rdi -; AVX-NEXT: jne LBB0_2 -; AVX-NEXT: ## %bb.1: ## %cond.load -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: LBB0_2: ## %else -; AVX-NEXT: retq +; AVX1-LABEL: load_v1f64_v1i64: +; AVX1: ## %bb.0: +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: jne LBB0_2 +; AVX1-NEXT: ## %bb.1: ## %cond.load +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: LBB0_2: ## %else +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_v1f64_v1i64: +; AVX2: ## %bb.0: +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: jne LBB0_2 +; AVX2-NEXT: ## %bb.1: ## %cond.load +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: LBB0_2: ## %else +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_v1f64_v1i64: +; AVX512: ## %bb.0: +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: jne LBB0_2 +; AVX512-NEXT: ## %bb.1: ## %cond.load +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: LBB0_2: ## %else +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: load_v1f64_v1i64: ; X86-AVX512: ## %bb.0: @@ -40,12 +58,12 @@ ; X86-AVX512-NEXT: jne LBB0_1 ; X86-AVX512-NEXT: ## %bb.2: ## %cond.load ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512-NEXT: jmp LBB0_3 ; X86-AVX512-NEXT: LBB0_1: -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512-NEXT: LBB0_3: ## %else -; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512-NEXT: vmovq %xmm0, (%esp) ; X86-AVX512-NEXT: fldl (%esp) ; X86-AVX512-NEXT: addl $12, %esp ; X86-AVX512-NEXT: retl @@ -6334,10 +6352,15 @@ ; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: mload_constmask_v4f32: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: mload_constmask_v4f32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: mload_constmask_v4f32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4f32: ; AVX512F: ## %bb.0: @@ -6380,15 +6403,25 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: mload_constmask_v4f32_all: -; AVX: ## %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mload_constmask_v4f32_all: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: mload_constmask_v4f32_all: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mload_constmask_v4f32_all: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: mload_constmask_v4f32_all: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovups (%eax), %xmm0 +; X86-AVX512-NEXT: vmovdqu (%eax), 
%xmm0 ; X86-AVX512-NEXT: retl %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float>undef) ret <4 x float> %res @@ -6701,10 +6734,15 @@ ; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1 ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: mload_constmask_v8i32: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: mload_constmask_v8i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: mload_constmask_v8i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v8i32: ; AVX512F: ## %bb.0: @@ -6754,10 +6792,15 @@ ; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm1 ; SSE42-NEXT: retq ; -; AVX1OR2-LABEL: mload_constmask_v4i64: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: mload_constmask_v4i64: +; AVX1: ## %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: mload_constmask_v4i64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v4i64: ; AVX512F: ## %bb.0: @@ -6803,11 +6846,17 @@ ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: mload_constmask_v8f64: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: mload_constmask_v8f64: +; AVX1: ## %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: mload_constmask_v8f64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v8f64: ; AVX512F: ## %bb.0: @@ -6865,24 +6914,36 @@ ; SSE-NEXT: movaps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: mload_constmask_v16f64_allones_split: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0] -; AVX1OR2-NEXT: ## ymm0 = mem[0,1,0,1] -; AVX1OR2-NEXT: vmaskmovpd 64(%rdi), %ymm0, %ymm1 -; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] -; AVX1OR2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm0 -; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 -; AVX1OR2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: mload_constmask_v16f64_allones_split: +; AVX1: ## %bb.0: +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0] +; AVX1-NEXT: ## ymm0 = mem[0,1,0,1] +; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm0, %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm0 +; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: mload_constmask_v16f64_allones_split: +; AVX2: ## %bb.0: +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0] +; AVX2-NEXT: ## ymm0 = mem[0,1,0,1] 
+; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v16f64_allones_split: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: movb $85, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovupd 64(%rdi), %zmm1 {%k1} -; AVX512F-NEXT: vmovups (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: mload_constmask_v16f64_allones_split: @@ -6890,7 +6951,7 @@ ; AVX512VLDQ-NEXT: movb $85, %al ; AVX512VLDQ-NEXT: kmovw %eax, %k1 ; AVX512VLDQ-NEXT: vmovupd 64(%rdi), %zmm1 {%k1} -; AVX512VLDQ-NEXT: vmovups (%rdi), %zmm0 +; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: mload_constmask_v16f64_allones_split: @@ -6898,7 +6959,7 @@ ; AVX512VLBW-NEXT: movb $85, %al ; AVX512VLBW-NEXT: kmovd %eax, %k1 ; AVX512VLBW-NEXT: vmovupd 64(%rdi), %zmm1 {%k1} -; AVX512VLBW-NEXT: vmovups (%rdi), %zmm0 +; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512VLBW-NEXT: retq ; ; X86-AVX512-LABEL: mload_constmask_v16f64_allones_split: @@ -6907,7 +6968,7 @@ ; X86-AVX512-NEXT: movb $85, %cl ; X86-AVX512-NEXT: kmovd %ecx, %k1 ; X86-AVX512-NEXT: vmovupd 64(%eax), %zmm1 {%k1} -; X86-AVX512-NEXT: vmovups (%eax), %zmm0 +; X86-AVX512-NEXT: vmovdqu64 (%eax), %zmm0 ; X86-AVX512-NEXT: retl %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %addr, i32 4, <16 x i1> , <16 x double> %dst) ret <16 x double> %res diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -22,23 +22,41 @@ ; SSE-NEXT: LBB0_2: ## %else ; SSE-NEXT: retq ; -; AVX-LABEL: store_v1f64_v1i64: -; AVX: ## %bb.0: -; AVX-NEXT: testq %rdi, %rdi -; AVX-NEXT: jns LBB0_2 -; AVX-NEXT: ## %bb.1: ## %cond.store -; AVX-NEXT: vmovsd %xmm0, (%rsi) -; AVX-NEXT: LBB0_2: ## %else -; AVX-NEXT: retq +; AVX1-LABEL: store_v1f64_v1i64: +; AVX1: ## %bb.0: +; AVX1-NEXT: testq %rdi, %rdi +; AVX1-NEXT: jns LBB0_2 +; AVX1-NEXT: ## %bb.1: ## %cond.store +; AVX1-NEXT: vmovsd %xmm0, (%rsi) +; AVX1-NEXT: LBB0_2: ## %else +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_v1f64_v1i64: +; AVX2: ## %bb.0: +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: jns LBB0_2 +; AVX2-NEXT: ## %bb.1: ## %cond.store +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: LBB0_2: ## %else +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_v1f64_v1i64: +; AVX512: ## %bb.0: +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: jns LBB0_2 +; AVX512-NEXT: ## %bb.1: ## %cond.store +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: LBB0_2: ## %else +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: store_v1f64_v1i64: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: jns LBB0_2 ; X86-AVX512-NEXT: ## %bb.1: ## %cond.store -; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovsd %xmm0, (%eax) +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) ; X86-AVX512-NEXT: LBB0_2: ## %else ; X86-AVX512-NEXT: retl %mask = icmp slt <1 x i64> %trigger, zeroinitializer @@ -4643,15 +4661,25 @@ ; SSE-NEXT: movups %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: mstore_constmask_v4i32_v4i32: -; AVX: ## %bb.0: -; 
AVX-NEXT: vmovups %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: mstore_constmask_v4i32_v4i32: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovups %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: mstore_constmask_v4i32_v4i32: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqu %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: mstore_constmask_v4i32_v4i32: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovdqu %xmm1, (%rdi) +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: mstore_constmask_v4i32_v4i32: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovups %xmm1, (%eax) +; X86-AVX512-NEXT: vmovdqu %xmm1, (%eax) ; X86-AVX512-NEXT: retl %mask = icmp eq <4 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) @@ -4729,8 +4757,8 @@ ; AVX2-NEXT: vpmaskmovq %ymm5, %ymm0, 32(%rdi) ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,0,18446744073709551615] ; AVX2-NEXT: vpmaskmovq %ymm4, %ymm0, (%rdi) -; AVX2-NEXT: vmovups %ymm7, 96(%rdi) -; AVX2-NEXT: vmovups %ymm6, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm7, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm6, 64(%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4739,7 +4767,7 @@ ; AVX512F-NEXT: movb $-37, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 %zmm2, (%rdi) {%k1} -; AVX512F-NEXT: vmovups %zmm3, 64(%rdi) +; AVX512F-NEXT: vmovdqu64 %zmm3, 64(%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4748,7 +4776,7 @@ ; AVX512VLDQ-NEXT: movb $-37, %al ; AVX512VLDQ-NEXT: kmovw %eax, %k1 ; AVX512VLDQ-NEXT: vmovdqu64 %zmm2, (%rdi) {%k1} -; AVX512VLDQ-NEXT: vmovups %zmm3, 64(%rdi) +; AVX512VLDQ-NEXT: vmovdqu64 %zmm3, 64(%rdi) ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq ; @@ -4757,7 +4785,7 @@ ; AVX512VLBW-NEXT: movb $-37, %al ; AVX512VLBW-NEXT: kmovd %eax, %k1 ; AVX512VLBW-NEXT: vmovdqu64 %zmm2, (%rdi) {%k1} -; AVX512VLBW-NEXT: vmovups %zmm3, 64(%rdi) +; AVX512VLBW-NEXT: vmovdqu64 %zmm3, 64(%rdi) ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; @@ -4767,7 +4795,7 @@ ; X86-AVX512-NEXT: movb $-37, %cl ; X86-AVX512-NEXT: kmovd %ecx, %k1 ; X86-AVX512-NEXT: vmovdqu64 %zmm2, (%eax) {%k1} -; X86-AVX512-NEXT: vmovups %zmm3, 64(%eax) +; X86-AVX512-NEXT: vmovdqu64 %zmm3, 64(%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl %mask = icmp eq <16 x i64> %trigger, zeroinitializer @@ -4783,15 +4811,25 @@ ; SSE-NEXT: movss %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: one_mask_bit_set1: -; AVX: ## %bb.0: -; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: one_mask_bit_set1: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: one_mask_bit_set1: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set1: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovd %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: one_mask_bit_set1: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vmovss %xmm0, (%eax) +; X86-AVX512-NEXT: vmovd %xmm0, (%eax) ; X86-AVX512-NEXT: retl call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void @@ -4811,10 +4849,20 @@ ; SSE4-NEXT: extractps $2, %xmm0, 8(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: one_mask_bit_set2: -; AVX: ## %bb.0: -; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: one_mask_bit_set2: +; AVX1: ## %bb.0: +; AVX1-NEXT: vextractps $2, %xmm0, 8(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
one_mask_bit_set2: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set2: +; AVX512: ## %bb.0: +; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi) +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: one_mask_bit_set2: ; X86-AVX512: ## %bb.0: @@ -4833,18 +4881,32 @@ ; SSE-NEXT: movlps %xmm1, 16(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: one_mask_bit_set3: -; AVX: ## %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovlps %xmm0, 16(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: one_mask_bit_set3: +; AVX1: ## %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovlps %xmm0, 16(%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: one_mask_bit_set3: +; AVX2: ## %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, 16(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: one_mask_bit_set3: +; AVX512: ## %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 16(%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: one_mask_bit_set3: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX512-NEXT: vmovlps %xmm0, 16(%eax) +; X86-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; X86-AVX512-NEXT: vmovq %xmm0, 16(%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>) @@ -4885,25 +4947,32 @@ ; SSE-NEXT: movlps %xmm3, 48(%rdi) ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: one_mask_bit_set5: -; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1OR2-NEXT: vmovlps %xmm0, 48(%rdi) -; AVX1OR2-NEXT: vzeroupper -; AVX1OR2-NEXT: retq +; AVX1-LABEL: one_mask_bit_set5: +; AVX1: ## %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: one_mask_bit_set5: +; AVX2: ## %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, 48(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: one_mask_bit_set5: ; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 48(%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; X86-AVX512-LABEL: one_mask_bit_set5: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax) +; X86-AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; X86-AVX512-NEXT: vmovq %xmm0, 48(%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) @@ -4945,8 +5014,8 @@ ; ; AVX512-LABEL: one_mask_bit_set6: ; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 48(%rdi) ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512-NEXT: vpextrq $1, %xmm0, 88(%rdi) ; AVX512-NEXT: vzeroupper @@ -4955,11 +5024,11 @@ ; X86-AVX512-LABEL: one_mask_bit_set6: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax) -; X86-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; 
X86-AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-AVX512-NEXT: vmovlps %xmm0, 88(%eax) +; X86-AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; X86-AVX512-NEXT: vmovq %xmm0, 48(%eax) +; X86-AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; X86-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X86-AVX512-NEXT: vmovq %xmm0, 88(%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %val, <16 x i64>* %addr, i32 4, <16 x i1>) diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll --- a/llvm/test/CodeGen/X86/memcpy.ll +++ b/llvm/test/CodeGen/X86/memcpy.ll @@ -106,32 +106,32 @@ ; ; LINUX-SKL-LABEL: test3: ; LINUX-SKL: # %bb.0: # %entry -; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKL-NEXT: vzeroupper ; LINUX-SKL-NEXT: retq ; ; LINUX-SKX-LABEL: test3: ; LINUX-SKX: # %bb.0: # %entry -; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKX-NEXT: vzeroupper ; LINUX-SKX-NEXT: retq ; ; LINUX-KNL-LABEL: test3: ; LINUX-KNL: # %bb.0: # %entry -; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 -; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) +; LINUX-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test3: ; LINUX-AVX512BW: # %bb.0: # %entry -; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 -; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-AVX512BW-NEXT: vzeroupper ; LINUX-AVX512BW-NEXT: retq entry: @@ -140,11 +140,6 @@ } define void @test3_pgso(i8* nocapture %A, i8* nocapture %B) nounwind noredzone !prof !14 { -; LINUX-LABEL: test3_pgso: -; LINUX: # %bb.0: # %entry -; LINUX-NEXT: movl $64, %edx -; LINUX-NEXT: jmp memcpy@PLT # TAILCALL -; ; DARWIN-LABEL: test3_pgso: ; DARWIN: ## %bb.0: ## %entry ; DARWIN-NEXT: movq 56(%rsi), %rax @@ -164,6 +159,42 @@ ; DARWIN-NEXT: movq %rcx, 8(%rdi) ; DARWIN-NEXT: movq %rax, (%rdi) ; DARWIN-NEXT: retq +; +; LINUX-LABEL: test3_pgso: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: movl $64, %edx +; LINUX-NEXT: jmp memcpy@PLT # TAILCALL +; +; LINUX-SKL-LABEL: test3_pgso: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovdqu %ymm0, (%rdi) +; LINUX-SKL-NEXT: vzeroupper +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test3_pgso: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovdqu %ymm0, (%rdi) +; LINUX-SKX-NEXT: vzeroupper +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test3_pgso: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovdqu64 %zmm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test3_pgso: +; LINUX-AVX512BW: # %bb.0: # %entry +; 
LINUX-AVX512BW-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vzeroupper +; LINUX-AVX512BW-NEXT: retq entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) ret void @@ -184,32 +215,32 @@ ; ; LINUX-SKL-LABEL: test3_minsize: ; LINUX-SKL: # %bb.0: -; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKL-NEXT: vzeroupper ; LINUX-SKL-NEXT: retq ; ; LINUX-SKX-LABEL: test3_minsize: ; LINUX-SKX: # %bb.0: -; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKX-NEXT: vzeroupper ; LINUX-SKX-NEXT: retq ; ; LINUX-KNL-LABEL: test3_minsize: ; LINUX-KNL: # %bb.0: -; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 -; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) +; LINUX-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test3_minsize: ; LINUX-AVX512BW: # %bb.0: -; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 -; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-AVX512BW-NEXT: vzeroupper ; LINUX-AVX512BW-NEXT: retq tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) @@ -231,32 +262,32 @@ ; ; LINUX-SKL-LABEL: test3_minsize_optsize: ; LINUX-SKL: # %bb.0: -; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKL-NEXT: vzeroupper ; LINUX-SKL-NEXT: retq ; ; LINUX-SKX-LABEL: test3_minsize_optsize: ; LINUX-SKX: # %bb.0: -; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKX-NEXT: vzeroupper ; LINUX-SKX-NEXT: retq ; ; LINUX-KNL-LABEL: test3_minsize_optsize: ; LINUX-KNL: # %bb.0: -; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 -; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) +; LINUX-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test3_minsize_optsize: ; LINUX-AVX512BW: # %bb.0: -; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 -; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-AVX512BW-NEXT: vzeroupper ; LINUX-AVX512BW-NEXT: retq tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) @@ -307,32 +338,32 @@ ; ; LINUX-SKL-LABEL: test4: ; LINUX-SKL: # %bb.0: # %entry -; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKL-NEXT: vmovups 32(%rsi), 
%ymm1 -; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKL-NEXT: vzeroupper ; LINUX-SKL-NEXT: retq ; ; LINUX-SKX-LABEL: test4: ; LINUX-SKX: # %bb.0: # %entry -; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 -; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 -; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) -; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vmovdqu (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovdqu 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovdqu %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovdqu %ymm0, (%rdi) ; LINUX-SKX-NEXT: vzeroupper ; LINUX-SKX-NEXT: retq ; ; LINUX-KNL-LABEL: test4: ; LINUX-KNL: # %bb.0: # %entry -; LINUX-KNL-NEXT: vmovups (%rsi), %zmm0 -; LINUX-KNL-NEXT: vmovups %zmm0, (%rdi) +; LINUX-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-KNL-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test4: ; LINUX-AVX512BW: # %bb.0: # %entry -; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 -; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vmovdqu64 (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; LINUX-AVX512BW-NEXT: vzeroupper ; LINUX-AVX512BW-NEXT: retq entry: @@ -362,26 +393,26 @@ ; ; LINUX-SKL-LABEL: test5: ; LINUX-SKL: # %bb.0: # %entry -; LINUX-SKL-NEXT: vmovups {{.*}}(%rip), %xmm0 -; LINUX-SKL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKL-NEXT: vmovdqu {{.*}}(%rip), %xmm0 +; LINUX-SKL-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-SKL-NEXT: retq ; ; LINUX-SKX-LABEL: test5: ; LINUX-SKX: # %bb.0: # %entry -; LINUX-SKX-NEXT: vmovups {{.*}}(%rip), %xmm0 -; LINUX-SKX-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKX-NEXT: vmovdqu {{.*}}(%rip), %xmm0 +; LINUX-SKX-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-SKX-NEXT: retq ; ; LINUX-KNL-LABEL: test5: ; LINUX-KNL: # %bb.0: # %entry -; LINUX-KNL-NEXT: vmovups {{.*}}(%rip), %xmm0 -; LINUX-KNL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-KNL-NEXT: vmovdqu {{.*}}(%rip), %xmm0 +; LINUX-KNL-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: test5: ; LINUX-AVX512BW: # %bb.0: # %entry -; LINUX-AVX512BW-NEXT: vmovups {{.*}}(%rip), %xmm0 -; LINUX-AVX512BW-NEXT: vmovups %xmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vmovdqu {{.*}}(%rip), %xmm0 +; LINUX-AVX512BW-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-AVX512BW-NEXT: retq entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str, i64 0, i64 0), i64 16, i1 false) @@ -460,32 +491,32 @@ ; LINUX-SKL: # %bb.0: ; LINUX-SKL-NEXT: movb 16(%rsi), %al ; LINUX-SKL-NEXT: movb %al, 16(%rdi) -; LINUX-SKL-NEXT: vmovups (%rsi), %xmm0 -; LINUX-SKL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKL-NEXT: vmovdqu (%rsi), %xmm0 +; LINUX-SKL-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-SKL-NEXT: retq ; ; LINUX-SKX-LABEL: PR15348: ; LINUX-SKX: # %bb.0: ; LINUX-SKX-NEXT: movb 16(%rsi), %al ; LINUX-SKX-NEXT: movb %al, 16(%rdi) -; LINUX-SKX-NEXT: vmovups (%rsi), %xmm0 -; LINUX-SKX-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKX-NEXT: vmovdqu (%rsi), %xmm0 +; LINUX-SKX-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-SKX-NEXT: retq ; ; LINUX-KNL-LABEL: PR15348: ; LINUX-KNL: # %bb.0: ; LINUX-KNL-NEXT: movb 16(%rsi), %al ; LINUX-KNL-NEXT: movb %al, 16(%rdi) -; LINUX-KNL-NEXT: vmovups (%rsi), %xmm0 -; LINUX-KNL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-KNL-NEXT: vmovdqu (%rsi), %xmm0 +; LINUX-KNL-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: PR15348: ; LINUX-AVX512BW: # 
%bb.0: ; LINUX-AVX512BW-NEXT: movb 16(%rsi), %al ; LINUX-AVX512BW-NEXT: movb %al, 16(%rdi) -; LINUX-AVX512BW-NEXT: vmovups (%rsi), %xmm0 -; LINUX-AVX512BW-NEXT: vmovups %xmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vmovdqu (%rsi), %xmm0 +; LINUX-AVX512BW-NEXT: vmovdqu %xmm0, (%rdi) ; LINUX-AVX512BW-NEXT: retq call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 17, i1 false) ret void @@ -512,26 +543,26 @@ ; ; LINUX-SKL-LABEL: addrspace256: ; LINUX-SKL: # %bb.0: -; LINUX-SKL-NEXT: vmovups %gs:(%rsi), %xmm0 -; LINUX-SKL-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-SKL-NEXT: vmovdqu %gs:(%rsi), %xmm0 +; LINUX-SKL-NEXT: vmovdqu %xmm0, %gs:(%rdi) ; LINUX-SKL-NEXT: retq ; ; LINUX-SKX-LABEL: addrspace256: ; LINUX-SKX: # %bb.0: -; LINUX-SKX-NEXT: vmovups %gs:(%rsi), %xmm0 -; LINUX-SKX-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-SKX-NEXT: vmovdqu %gs:(%rsi), %xmm0 +; LINUX-SKX-NEXT: vmovdqu %xmm0, %gs:(%rdi) ; LINUX-SKX-NEXT: retq ; ; LINUX-KNL-LABEL: addrspace256: ; LINUX-KNL: # %bb.0: -; LINUX-KNL-NEXT: vmovups %gs:(%rsi), %xmm0 -; LINUX-KNL-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-KNL-NEXT: vmovdqu %gs:(%rsi), %xmm0 +; LINUX-KNL-NEXT: vmovdqu %xmm0, %gs:(%rdi) ; LINUX-KNL-NEXT: retq ; ; LINUX-AVX512BW-LABEL: addrspace256: ; LINUX-AVX512BW: # %bb.0: -; LINUX-AVX512BW-NEXT: vmovups %gs:(%rsi), %xmm0 -; LINUX-AVX512BW-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-AVX512BW-NEXT: vmovdqu %gs:(%rsi), %xmm0 +; LINUX-AVX512BW-NEXT: vmovdqu %xmm0, %gs:(%rdi) ; LINUX-AVX512BW-NEXT: retq tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false) ret void diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -26,11 +26,23 @@ ; SSE2FAST-NEXT: movups %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; -; AVX-LABEL: memset_16_nonzero_bytes: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX-NEXT: vmovups %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: memset_16_nonzero_bytes: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_16_nonzero_bytes: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_16_nonzero_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512-NEXT: vmovdqu %xmm0, (%rdi) +; AVX512-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1) ret void } @@ -52,12 +64,26 @@ ; SSE2FAST-NEXT: movups %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; -; AVX-LABEL: memset_32_nonzero_bytes: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: memset_32_nonzero_bytes: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: memset_32_nonzero_bytes: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = 
[42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_32_nonzero_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1) ret void } @@ -95,23 +121,23 @@ ; ; AVX2-LABEL: memset_64_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: memset_64_nonzero_bytes: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] -; AVX512F-NEXT: vmovups %zmm0, (%rdi) +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: memset_64_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1) @@ -166,27 +192,27 @@ ; ; AVX2-LABEL: memset_128_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm0, 64(%rdi) -; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: memset_128_nonzero_bytes: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] -; AVX512F-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512F-NEXT: vmovups %zmm0, (%rdi) +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = 
[707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] +; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: memset_128_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1) @@ -200,7 +226,7 @@ ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: movl $256, %edx # imm = 0x100 ; SSE-NEXT: movl $42, %esi -; SSE-NEXT: callq memset +; SSE-NEXT: callq memset@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq @@ -242,35 +268,35 @@ ; ; AVX2-LABEL: memset_256_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; AVX2-NEXT: vmovups %ymm0, 224(%rdi) -; AVX2-NEXT: vmovups %ymm0, 192(%rdi) -; AVX2-NEXT: vmovups %ymm0, 160(%rdi) -; AVX2-NEXT: vmovups %ymm0, 128(%rdi) -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm0, 64(%rdi) -; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: memset_256_nonzero_bytes: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] -; AVX512F-NEXT: vmovups %zmm0, 192(%rdi) -; AVX512F-NEXT: vmovups %zmm0, 128(%rdi) -; AVX512F-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512F-NEXT: vmovups %zmm0, (%rdi) +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] +; AVX512F-NEXT: vmovdqu64 %zmm0, 192(%rdi) +; AVX512F-NEXT: vmovdqu64 %zmm0, 128(%rdi) +; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: memset_256_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] -; 
AVX512BW-NEXT: vmovups %zmm0, 192(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vmovdqu64 %zmm0, 192(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, 128(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1) diff --git a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll --- a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll +++ b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll @@ -5,8 +5,8 @@ ; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium2 | FileCheck %s --check-prefix=NOSSE ; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=pentium3 | FileCheck %s --check-prefix=SSE ; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=yonah | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=corei7-avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core-avx2 | FileCheck %s --check-prefixes=AVX,AVX2 define void @test1(i32 %t) nounwind { ; NOSSE-LABEL: test1: @@ -56,28 +56,51 @@ ; SSE-NEXT: popl %ebp ; SSE-NEXT: retl ; -; AVX-LABEL: test1: -; AVX: # %bb.0: -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: pushl %esi -; AVX-NEXT: andl $-32, %esp -; AVX-NEXT: subl $64, %esp -; AVX-NEXT: movl %esp, %esi -; AVX-NEXT: movl 8(%ebp), %eax -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %ymm0, (%esi) -; AVX-NEXT: addl $3, %eax -; AVX-NEXT: andl $-4, %eax -; AVX-NEXT: calll __alloca -; AVX-NEXT: movl %esp, %eax -; AVX-NEXT: pushl %eax -; AVX-NEXT: vzeroupper -; AVX-NEXT: calll _dummy -; AVX-NEXT: leal -4(%ebp), %esp -; AVX-NEXT: popl %esi -; AVX-NEXT: popl %ebp -; AVX-NEXT: retl +; AVX1-LABEL: test1: +; AVX1: # %bb.0: +; AVX1-NEXT: pushl %ebp +; AVX1-NEXT: movl %esp, %ebp +; AVX1-NEXT: pushl %esi +; AVX1-NEXT: andl $-32, %esp +; AVX1-NEXT: subl $64, %esp +; AVX1-NEXT: movl %esp, %esi +; AVX1-NEXT: movl 8(%ebp), %eax +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %ymm0, (%esi) +; AVX1-NEXT: addl $3, %eax +; AVX1-NEXT: andl $-4, %eax +; AVX1-NEXT: calll __alloca +; AVX1-NEXT: movl %esp, %eax +; AVX1-NEXT: pushl %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: calll _dummy +; AVX1-NEXT: leal -4(%ebp), %esp +; AVX1-NEXT: popl %esi +; AVX1-NEXT: popl %ebp +; AVX1-NEXT: retl +; +; AVX2-LABEL: test1: +; AVX2: # %bb.0: +; AVX2-NEXT: pushl %ebp +; AVX2-NEXT: movl %esp, %ebp +; AVX2-NEXT: pushl %esi +; AVX2-NEXT: andl $-32, %esp +; AVX2-NEXT: subl $64, %esp +; AVX2-NEXT: movl %esp, %esi +; AVX2-NEXT: movl 8(%ebp), %eax +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %ymm0, (%esi) +; AVX2-NEXT: addl $3, %eax +; AVX2-NEXT: andl $-4, %eax +; AVX2-NEXT: calll __alloca +; AVX2-NEXT: movl %esp, %eax +; AVX2-NEXT: pushl %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: calll _dummy +; AVX2-NEXT: leal -4(%ebp), %esp +; AVX2-NEXT: popl %esi +; AVX2-NEXT: popl %ebp +; AVX2-NEXT: retl 
%tmp1210 = alloca i8, i32 32, align 4 call void @llvm.memset.p0i8.i64(i8* align 4 %tmp1210, i8 0, i64 32, i1 false) %x = alloca i8, i32 %t @@ -128,27 +151,49 @@ ; SSE-NEXT: popl %ebp ; SSE-NEXT: retl ; -; AVX-LABEL: test2: -; AVX: # %bb.0: -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: pushl %esi -; AVX-NEXT: andl $-16, %esp -; AVX-NEXT: subl $32, %esp -; AVX-NEXT: movl %esp, %esi -; AVX-NEXT: movl 8(%ebp), %eax -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%esi) -; AVX-NEXT: addl $3, %eax -; AVX-NEXT: andl $-4, %eax -; AVX-NEXT: calll __alloca -; AVX-NEXT: movl %esp, %eax -; AVX-NEXT: pushl %eax -; AVX-NEXT: calll _dummy -; AVX-NEXT: leal -4(%ebp), %esp -; AVX-NEXT: popl %esi -; AVX-NEXT: popl %ebp -; AVX-NEXT: retl +; AVX1-LABEL: test2: +; AVX1: # %bb.0: +; AVX1-NEXT: pushl %ebp +; AVX1-NEXT: movl %esp, %ebp +; AVX1-NEXT: pushl %esi +; AVX1-NEXT: andl $-16, %esp +; AVX1-NEXT: subl $32, %esp +; AVX1-NEXT: movl %esp, %esi +; AVX1-NEXT: movl 8(%ebp), %eax +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%esi) +; AVX1-NEXT: addl $3, %eax +; AVX1-NEXT: andl $-4, %eax +; AVX1-NEXT: calll __alloca +; AVX1-NEXT: movl %esp, %eax +; AVX1-NEXT: pushl %eax +; AVX1-NEXT: calll _dummy +; AVX1-NEXT: leal -4(%ebp), %esp +; AVX1-NEXT: popl %esi +; AVX1-NEXT: popl %ebp +; AVX1-NEXT: retl +; +; AVX2-LABEL: test2: +; AVX2: # %bb.0: +; AVX2-NEXT: pushl %ebp +; AVX2-NEXT: movl %esp, %ebp +; AVX2-NEXT: pushl %esi +; AVX2-NEXT: andl $-16, %esp +; AVX2-NEXT: subl $32, %esp +; AVX2-NEXT: movl %esp, %esi +; AVX2-NEXT: movl 8(%ebp), %eax +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%esi) +; AVX2-NEXT: addl $3, %eax +; AVX2-NEXT: andl $-4, %eax +; AVX2-NEXT: calll __alloca +; AVX2-NEXT: movl %esp, %eax +; AVX2-NEXT: pushl %eax +; AVX2-NEXT: calll _dummy +; AVX2-NEXT: leal -4(%ebp), %esp +; AVX2-NEXT: popl %esi +; AVX2-NEXT: popl %ebp +; AVX2-NEXT: retl %tmp1210 = alloca i8, i32 16, align 4 call void @llvm.memset.p0i8.i64(i8* align 4 %tmp1210, i8 0, i64 16, i1 false) %x = alloca i8, i32 %t diff --git a/llvm/test/CodeGen/X86/memset-zero.ll b/llvm/test/CodeGen/X86/memset-zero.ll --- a/llvm/test/CodeGen/X86/memset-zero.ll +++ b/llvm/test/CodeGen/X86/memset-zero.ll @@ -356,14 +356,14 @@ ; ; SKYLAKE-LABEL: memset_16: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqu %xmm0, (%rdi) ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_16: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqu %xmm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 16, i1 false) @@ -404,15 +404,15 @@ ; ; SKYLAKE-LABEL: memset_17: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqu %xmm0, (%rdi) ; SKYLAKE-NEXT: movb $0, 16(%rdi) ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_17: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqu %xmm0, (%rdi) ; KNL-NEXT: movb $0, 16(%rdi) ; KNL-NEXT: retq entry: @@ -454,15 +454,15 @@ ; ; SKYLAKE-LABEL: memset_19: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) +; SKYLAKE-NEXT: 
vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqu %xmm0, (%rdi) ; SKYLAKE-NEXT: movl $0, 15(%rdi) ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_19: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqu %xmm0, (%rdi) ; KNL-NEXT: movl $0, 15(%rdi) ; KNL-NEXT: retq entry: @@ -508,16 +508,16 @@ ; ; SKYLAKE-LABEL: memset_31: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovups %xmm0, 15(%rdi) -; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqu %xmm0, 15(%rdi) +; SKYLAKE-NEXT: vmovdqu %xmm0, (%rdi) ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_31: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %xmm0, 15(%rdi) -; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqu %xmm0, 15(%rdi) +; KNL-NEXT: vmovdqu %xmm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 31, i1 false) @@ -562,15 +562,15 @@ ; ; SKYLAKE-LABEL: memset_32: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovups %ymm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqu %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_32: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %ymm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqu %ymm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 32, i1 false) @@ -614,15 +614,15 @@ ; ; SKYLAKE-LABEL: memset_32_align32: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovaps %ymm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqa %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_32_align32: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovaps %ymm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqa %ymm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 false) @@ -672,16 +672,16 @@ ; ; SKYLAKE-LABEL: memset_35: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovups %ymm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqu %ymm0, (%rdi) ; SKYLAKE-NEXT: movl $0, 31(%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_35: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %ymm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqu %ymm0, (%rdi) ; KNL-NEXT: movl $0, 31(%rdi) ; KNL-NEXT: retq entry: @@ -743,16 +743,16 @@ ; ; SKYLAKE-LABEL: memset_64: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovups %ymm0, 32(%rdi) -; SKYLAKE-NEXT: vmovups %ymm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqu %ymm0, 32(%rdi) +; SKYLAKE-NEXT: vmovdqu %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_64: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovups %zmm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqu64 %zmm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false) @@ -809,16 +809,16 @@ ; ; 
SKYLAKE-LABEL: memset_64_align64: ; SKYLAKE: # %bb.0: # %entry -; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SKYLAKE-NEXT: vmovaps %ymm0, 32(%rdi) -; SKYLAKE-NEXT: vmovaps %ymm0, (%rdi) +; SKYLAKE-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovdqa %ymm0, 32(%rdi) +; SKYLAKE-NEXT: vmovdqa %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq ; ; KNL-LABEL: memset_64_align64: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovaps %zmm0, (%rdi) +; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovdqa64 %zmm0, (%rdi) ; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 false) diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX2OR512 ; ; 32-bit SSE tests to make sure we do reasonable things. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1 @@ -15,10 +15,15 @@ ; SSE-NEXT: movups 16(%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: merge_2f64_f64_23: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 16(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_2f64_f64_23: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 16(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_2f64_f64_23: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu 16(%rdi), %xmm0 +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_2f64_f64_23: ; X86-SSE1: # %bb.0: @@ -48,10 +53,15 @@ ; SSE-NEXT: movups 8(%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: merge_2i64_i64_12: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 8(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_2i64_i64_12: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 8(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_2i64_i64_12: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu 8(%rdi), %xmm0 +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_2i64_i64_12: ; X86-SSE1: # %bb.0: @@ -97,10 +107,15 @@ ; SSE-NEXT: movups 8(%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4f32_f32_2345: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 8(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f32_f32_2345: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 8(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4f32_f32_2345: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu 8(%rdi), %xmm0 +; AVX2OR512-NEXT: retq ; ; X86-SSE-LABEL: merge_4f32_f32_2345: ; X86-SSE: # %bb.0: @@ -128,10 +143,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4f32_f32_3zuu: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f32_f32_3zuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4f32_f32_3zuu: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: retq ; ; X86-SSE-LABEL: merge_4f32_f32_3zuu: ; X86-SSE: # %bb.0: @@ -151,10 +171,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4f32_f32_34uu: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f32_f32_34uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4f32_f32_34uu: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4f32_f32_34uu: ; X86-SSE1: # %bb.0: @@ -193,11 +218,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: merge_4f32_f32_34z6: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3] -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f32_f32_34z6: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4f32_f32_34z6: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2OR512-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3] +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4f32_f32_34z6: ; X86-SSE1: # %bb.0: @@ -233,10 +264,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4f32_f32_45zz: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = 
mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f32_f32_45zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4f32_f32_45zz: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4f32_f32_45zz: ; X86-SSE1: # %bb.0: @@ -361,10 +397,15 @@ ; SSE-NEXT: movups 8(%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_23u5: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 8(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_23u5: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 8(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_23u5: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu 8(%rdi), %xmm0 +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_23u5: ; X86-SSE1: # %bb.0: @@ -407,11 +448,17 @@ ; SSE-NEXT: incl 8(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_23u5_inc2: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 8(%rdi), %xmm0 -; AVX-NEXT: incl 8(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_23u5_inc2: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 8(%rdi), %xmm0 +; AVX1-NEXT: incl 8(%rdi) +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_23u5_inc2: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu 8(%rdi), %xmm0 +; AVX2OR512-NEXT: incl 8(%rdi) +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc2: ; X86-SSE1: # %bb.0: @@ -464,11 +511,17 @@ ; SSE-NEXT: incl 12(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_23u5_inc3: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 8(%rdi), %xmm0 -; AVX-NEXT: incl 12(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_23u5_inc3: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 8(%rdi), %xmm0 +; AVX1-NEXT: incl 12(%rdi) +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_23u5_inc3: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu 8(%rdi), %xmm0 +; AVX2OR512-NEXT: incl 12(%rdi) +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_23u5_inc3: ; X86-SSE1: # %bb.0: @@ -520,10 +573,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_3zuu: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_3zuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_3zuu: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_3zuu: ; X86-SSE1: # %bb.0: @@ -552,10 +610,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_34uu: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_34uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_34uu: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_34uu: ; X86-SSE1: # %bb.0: @@ -587,10 +650,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_45zz: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_45zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_45zz: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = 
mem[0],zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_45zz: ; X86-SSE1: # %bb.0: @@ -625,11 +693,17 @@ ; SSE-NEXT: incl 16(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_45zz_inc4: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: incl 16(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_45zz_inc4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: incl 16(%rdi) +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_45zz_inc4: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: incl 16(%rdi) +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc4: ; X86-SSE1: # %bb.0: @@ -679,11 +753,17 @@ ; SSE-NEXT: incl 20(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_45zz_inc5: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: incl 20(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_45zz_inc5: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: incl 20(%rdi) +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_45zz_inc5: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: incl 20(%rdi) +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_45zz_inc5: ; X86-SSE1: # %bb.0: @@ -732,10 +812,15 @@ ; SSE-NEXT: movups 4(%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: merge_8i16_i16_23u567u9: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 4(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_8i16_i16_23u567u9: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 4(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_8i16_i16_23u567u9: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu 4(%rdi), %xmm0 +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_8i16_i16_23u567u9: ; X86-SSE1: # %bb.0: @@ -793,10 +878,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_8i16_i16_34uuuuuu: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_8i16_i16_34uuuuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_8i16_i16_34uuuuuu: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_8i16_i16_34uuuuuu: ; X86-SSE1: # %bb.0: @@ -826,10 +916,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_8i16_i16_45u7zzzz: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_8i16_i16_45u7zzzz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_8i16_i16_45u7zzzz: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_8i16_i16_45u7zzzz: ; X86-SSE1: # %bb.0: @@ -870,10 +965,15 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_16i8_i8_01u3456789ABCDuF: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_16i8_i8_01u3456789ABCDuF: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF: ; X86-SSE1: # %bb.0: @@ -969,10 +1069,15 @@ ; SSE-NEXT: movss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz: ; X86-SSE1: # %bb.0: @@ -1015,10 +1120,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: ; X86-SSE1: # %bb.0: @@ -1068,11 +1178,17 @@ ; SSE-NEXT: movaps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: merge_4i32_i32_combine: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovaps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i32_i32_combine: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovaps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_4i32_i32_combine: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_4i32_i32_combine: ; X86-SSE1: # %bb.0: @@ -1113,12 +1229,19 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: merge_2i64_i64_12_volatile: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: merge_2i64_i64_12_volatile: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: merge_2i64_i64_12_volatile: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: merge_2i64_i64_12_volatile: ; X86-SSE1: # %bb.0: @@ -1265,10 +1388,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: load_i32_zext_i128_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: load_i32_zext_i128_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512-LABEL: load_i32_zext_i128_v4i32: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512-NEXT: retq ; ; X86-SSE1-LABEL: load_i32_zext_i128_v4i32: ; X86-SSE1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ 
b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -7,10 +7,20 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86-AVX define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4f64_2f64_23: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 32(%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f64_2f64_23: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 32(%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4f64_2f64_23: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4f64_2f64_23: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4f64_2f64_23: ; X86-AVX: # %bb.0: @@ -26,10 +36,20 @@ } define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4f64_2f64_2z: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f64_2f64_2z: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4f64_2f64_2z: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4f64_2f64_2z: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4f64_2f64_2z: ; X86-AVX: # %bb.0: @@ -43,10 +63,20 @@ } define <4 x double> @merge_4f64_f64_2345(double* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4f64_f64_2345: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 16(%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f64_f64_2345: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 16(%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4f64_f64_2345: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu 16(%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4f64_f64_2345: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu 16(%rdi), %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4f64_f64_2345: ; X86-AVX: # %bb.0: @@ -69,10 +99,20 @@ } define <4 x double> @merge_4f64_f64_3zuu(double* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4f64_f64_3zuu: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f64_f64_3zuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4f64_f64_3zuu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4f64_f64_3zuu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4f64_f64_3zuu: ; X86-AVX: # %bb.0: @@ -87,10 +127,20 @@ } define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4f64_f64_34uu: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 24(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f64_f64_34uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 24(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4f64_f64_34uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu 24(%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4f64_f64_34uu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu 24(%rdi), %xmm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4f64_f64_34uu: ; X86-AVX: # %bb.0: @@ -107,10 +157,20 @@ } define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4f64_f64_45zz: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 32(%rdi), %xmm0 -; AVX-NEXT: retq +; 
AVX1-LABEL: merge_4f64_f64_45zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 32(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4f64_f64_45zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu 32(%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4f64_f64_45zz: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4f64_f64_45zz: ; X86-AVX: # %bb.0: @@ -127,11 +187,23 @@ } define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4f64_f64_34z6: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] -; AVX-NEXT: retq +; AVX1-LABEL: merge_4f64_f64_34z6: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4f64_f64_34z6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4f64_f64_34z6: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4f64_f64_34z6: ; X86-AVX: # %bb.0: @@ -153,10 +225,20 @@ } define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4i64_2i64_3z: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i64_2i64_3z: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4i64_2i64_3z: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4i64_2i64_3z: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4i64_2i64_3z: ; X86-AVX: # %bb.0: @@ -170,10 +252,20 @@ } define <4 x i64> @merge_4i64_i64_1234(i64* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4i64_i64_1234: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 8(%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i64_i64_1234: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 8(%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4i64_i64_1234: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu 8(%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4i64_i64_1234: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu 8(%rdi), %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4i64_i64_1234: ; X86-AVX: # %bb.0: @@ -196,10 +288,20 @@ } define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4i64_i64_1zzu: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i64_i64_1zzu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4i64_i64_1zzu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4i64_i64_1zzu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4i64_i64_1zzu: ; X86-AVX: # %bb.0: @@ -215,10 +317,20 @@ } define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_4i64_i64_23zz: -; AVX: # %bb.0: -; AVX-NEXT: vmovups 16(%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_4i64_i64_23zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups 16(%rdi), %xmm0 +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: merge_4i64_i64_23zz: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu 16(%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_4i64_i64_23zz: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_4i64_i64_23zz: ; X86-AVX: # %bb.0: @@ -264,11 +376,23 @@ } define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_8f32_4f32_z2: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_8f32_4f32_z2: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8f32_4f32_z2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8f32_4f32_z2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8f32_4f32_z2: ; X86-AVX: # %bb.0: @@ -283,10 +407,20 @@ } define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_8f32_f32_12zzuuzz: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_8f32_f32_12zzuuzz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8f32_f32_12zzuuzz: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8f32_f32_12zzuuzz: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8f32_f32_12zzuuzz: ; X86-AVX: # %bb.0: @@ -307,11 +441,23 @@ } define <8 x float> @merge_8f32_f32_1u3u5zu8(float* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_8f32_f32_1u3u5zu8: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] -; AVX-NEXT: retq +; AVX1-LABEL: merge_8f32_f32_1u3u5zu8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8f32_f32_1u3u5zu8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8f32_f32_1u3u5zu8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8f32_f32_1u3u5zu8: ; X86-AVX: # %bb.0: @@ -336,11 +482,23 @@ } define <8 x i32> @merge_8i32_4i32_z3(<4 x i32>* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_8i32_4i32_z3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_8i32_4i32_z3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8i32_4i32_z3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8i32_4i32_z3: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, 
48(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8i32_4i32_z3: ; X86-AVX: # %bb.0: @@ -355,12 +513,26 @@ } define <8 x i32> @merge_8i32_i32_56zz9uzz(i32* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_8i32_i32_56zz9uzz: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_8i32_i32_56zz9uzz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8i32_i32_56zz9uzz: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8i32_i32_56zz9uzz: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8i32_i32_56zz9uzz: ; X86-AVX: # %bb.0: @@ -386,11 +558,23 @@ } define <8 x i32> @merge_8i32_i32_1u3u5zu8(i32* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_8i32_i32_1u3u5zu8: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] -; AVX-NEXT: retq +; AVX1-LABEL: merge_8i32_i32_1u3u5zu8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8i32_i32_1u3u5zu8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8i32_i32_1u3u5zu8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7] +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8i32_i32_1u3u5zu8: ; X86-AVX: # %bb.0: @@ -415,10 +599,20 @@ } define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz: ; X86-AVX: # %bb.0: @@ -439,10 +633,20 @@ } define <16 x i16> @merge_16i16_i16_45u7uuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu: +; AVX512F: # 
%bb.0: +; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu: ; X86-AVX: # %bb.0: @@ -462,10 +666,20 @@ } define <16 x i16> @merge_16i16_i16_0uu3uuuuuuuuCuEF(i16* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF: ; X86-AVX: # %bb.0: @@ -491,11 +705,23 @@ } define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(i16* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF: ; X86-AVX: # %bb.0: @@ -525,10 +751,20 @@ } define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: ; X86-AVX: # %bb.0: @@ -548,10 +784,20 @@ } define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu: ; X86-AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll 
b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -8,17 +8,17 @@ define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_12u4: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 16(%rdi), %ymm0 -; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vmovdqu 16(%rdi), %ymm0 +; ALL-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm1 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_12u4: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 16(%eax), %ymm0 -; X86-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovdqu 16(%eax), %ymm0 +; X86-AVX512F-NEXT: vinserti128 $1, 64(%eax), %ymm0, %ymm1 +; X86-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 @@ -35,19 +35,19 @@ define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_23z5: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 32(%rdi), %ymm0 -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vmovdqu 32(%rdi), %ymm0 +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vinserti128 $1, 80(%rdi), %ymm1, %ymm1 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_23z5: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 32(%eax), %ymm0 -; X86-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovdqu 32(%eax), %ymm0 +; X86-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX512F-NEXT: vinserti128 $1, 80(%eax), %ymm1, %ymm1 +; X86-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3 @@ -64,15 +64,15 @@ define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_4f64_z2: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0 +; ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vinserti64x4 $1, 64(%rdi), %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_4f64_z2: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86-AVX512F-NEXT: vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0 +; X86-AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX512F-NEXT: vinserti64x4 $1, 64(%eax), %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2 %val1 = load <4 x double>, <4 x double>* %ptr1 @@ -83,13 +83,13 @@ define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_f64_23uuuuu9: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 16(%rdi), %zmm0 +; ALL-NEXT: vmovdqu64 
16(%rdi), %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_f64_23uuuuu9: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 16(%eax), %zmm0 +; X86-AVX512F-NEXT: vmovdqu64 16(%eax), %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds double, double* %ptr, i64 2 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3 @@ -106,13 +106,13 @@ define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_f64_12zzuuzz: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 8(%rdi), %xmm0 +; ALL-NEXT: vmovdqu 8(%rdi), %xmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_f64_12zzuuzz: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 8(%eax), %xmm0 +; X86-AVX512F-NEXT: vmovdqu 8(%eax), %xmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds double, double* %ptr, i64 1 %ptr1 = getelementptr inbounds double, double* %ptr, i64 2 @@ -159,15 +159,15 @@ define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8i64_4i64_z3: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0 +; ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vinserti64x4 $1, 96(%rdi), %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8i64_4i64_z3: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86-AVX512F-NEXT: vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0 +; X86-AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX512F-NEXT: vinserti64x4 $1, 96(%eax), %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3 %val1 = load <4 x i64>, <4 x i64>* %ptr1 @@ -178,17 +178,17 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8i64_i64_56zz9uzz: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 40(%rdi), %xmm0 -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vmovdqu 40(%rdi), %xmm0 +; ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8i64_i64_56zz9uzz: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 40(%eax), %xmm0 -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovdqu 40(%eax), %xmm0 +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5 %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6 @@ -238,13 +238,13 @@ define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz: ; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds float, float* %ptr, i64 8 %ptr1 = getelementptr inbounds float, float* %ptr, i64 9 @@ -262,13 +262,13 @@ define <16 x float> 
@merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 16(%rdi), %xmm0 +; ALL-NEXT: vmovdqu 16(%rdi), %xmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 16(%eax), %xmm0 +; X86-AVX512F-NEXT: vmovdqu 16(%eax), %xmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds float, float* %ptr, i64 4 %ptr1 = getelementptr inbounds float, float* %ptr, i64 5 @@ -285,13 +285,13 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF: ; ALL: # %bb.0: -; ALL-NEXT: vmovups (%rdi), %zmm0 +; ALL-NEXT: vmovdqu64 (%rdi), %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups (%eax), %zmm0 +; X86-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds float, float* %ptr, i64 0 %ptr3 = getelementptr inbounds float, float* %ptr, i64 3 @@ -352,13 +352,13 @@ define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz: ; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2 @@ -376,13 +376,13 @@ define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 8(%rdi), %xmm0 +; ALL-NEXT: vmovdqu 8(%rdi), %xmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 8(%eax), %xmm0 +; X86-AVX512F-NEXT: vmovdqu 8(%eax), %xmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2 %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3 @@ -399,13 +399,13 @@ define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF: ; ALL: # %bb.0: -; ALL-NEXT: vmovups (%rdi), %zmm0 +; ALL-NEXT: vmovdqu64 (%rdi), %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups (%eax), %zmm0 +; X86-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3 @@ -462,13 +462,13 @@ define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: ; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: ; X86-AVX512F: # %bb.0: ; 
X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2 @@ -487,13 +487,13 @@ define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: ; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5 @@ -510,13 +510,13 @@ define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: ; ALL: # %bb.0: -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3 @@ -535,13 +535,13 @@ define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2 @@ -566,13 +566,13 @@ define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; ALL: # %bb.0: -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll --- 
a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -107,13 +107,21 @@ ; X64-SSE41-NEXT: movaps %xmm1, 16(%rsi) ; X64-SSE41-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align32_mix_ntload: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX1-NEXT: vmovaps %xmm1, 16(%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align32_mix_ntload: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; X64-AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX2-NEXT: vmovdqa %xmm1, 16(%rsi) +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0 @@ -145,13 +153,21 @@ ; X64-SSE-NEXT: movaps %xmm1, 16(%rsi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, 16(%rsi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align32_mix_ntstore: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovntps %xmm0, (%rsi) +; X64-AVX1-NEXT: vmovaps %xmm1, 16(%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align32_mix_ntstore: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; X64-AVX2-NEXT: vmovntdq %xmm0, (%rsi) +; X64-AVX2-NEXT: vmovdqa %xmm1, 16(%rsi) +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 32 @@ -239,13 +255,21 @@ ; X64-SSE-NEXT: movntps %xmm1, 16(%rsi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) -; X64-AVX-NEXT: vmovntps %xmm1, 16(%rsi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align16_ntstore: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovntps %xmm0, (%rsi) +; X64-AVX1-NEXT: vmovntps %xmm1, 16(%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align16_ntstore: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; X64-AVX2-NEXT: vmovntdq %xmm0, (%rsi) +; X64-AVX2-NEXT: vmovntdq %xmm1, 16(%rsi) +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 16 @@ -278,12 +302,19 @@ ; X64-SSE-NEXT: movups %xmm1, 16(%rsi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align1_ntload: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align1_ntload: 
+; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, (%rsi) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align1_ntload: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -440,8 +440,8 @@ ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: vpmovm2d %k0, %ymm1 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-NEXT: vmovdqa %ymm1, (%rdi) +; CHECK-NEXT: vmovdqa %ymm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -456,7 +456,7 @@ ; CHECK-NEXT: vpmovw2m %ymm0, %k0 ; CHECK-NEXT: vpmovm2d %k0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -478,10 +478,10 @@ ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi) +; CHECK-NEXT: vmovdqa %ymm3, (%rdi) +; CHECK-NEXT: vmovdqa %ymm0, 96(%rdi) +; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -498,8 +498,8 @@ ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vmovdqa64 %zmm0, 64(%rdi) +; CHECK-NEXT: vmovdqa64 %zmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -519,8 +519,8 @@ ; CHECK-NEXT: vpmovm2d %k0, %ymm1 ; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmovaps %ymm1, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 32(%rdi) +; CHECK-NEXT: vmovdqa %ymm1, (%rdi) +; CHECK-NEXT: vmovdqa %ymm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -536,7 +536,7 @@ ; CHECK-NEXT: vpmovm2d %k0, %zmm0 ; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer @@ -560,10 +560,10 @@ ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, 32(%rdi) -; CHECK-NEXT: vmovaps %ymm3, (%rdi) -; CHECK-NEXT: vmovaps %ymm0, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi) +; CHECK-NEXT: vmovdqa %ymm3, (%rdi) +; CHECK-NEXT: vmovdqa %ymm0, 96(%rdi) +; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, 
zeroinitializer @@ -581,8 +581,8 @@ ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vmovdqa64 %zmm0, 64(%rdi) +; CHECK-NEXT: vmovdqa64 %zmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %mask = icmp slt <16 x i16> %a, zeroinitializer diff --git a/llvm/test/CodeGen/X86/movddup-load-fold.ll b/llvm/test/CodeGen/X86/movddup-load-fold.ll --- a/llvm/test/CodeGen/X86/movddup-load-fold.ll +++ b/llvm/test/CodeGen/X86/movddup-load-fold.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=i686-- -mattr=+avx512vl | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2OR512 ; Test an isel pattern for a splatted VZLOAD. @@ -12,10 +12,15 @@ ; SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; SSE-NEXT: retl ; -; AVX-LABEL: movddup_load_fold: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retl +; AVX1-LABEL: movddup_load_fold: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retl +; +; AVX2OR512-LABEL: movddup_load_fold: +; AVX2OR512: # %bb.0: +; AVX2OR512-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 +; AVX2OR512-NEXT: retl %i0 = insertelement <4 x float> zeroinitializer, float %x, i32 0 %i1 = insertelement <4 x float> %i0, float %y, i32 1 %dup = shufflevector <4 x float> %i1, <4 x float> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/musttail-fastcall.ll b/llvm/test/CodeGen/X86/musttail-fastcall.ll --- a/llvm/test/CodeGen/X86/musttail-fastcall.ll +++ b/llvm/test/CodeGen/X86/musttail-fastcall.ll @@ -69,12 +69,12 @@ ; AVX-DAG: vmovups %ymm4, {{.*}} ; AVX-DAG: vmovups %ymm5, {{.*}} -; AVX512-DAG: vmovups %zmm0, {{.*}} -; AVX512-DAG: vmovups %zmm1, {{.*}} -; AVX512-DAG: vmovups %zmm2, {{.*}} -; AVX512-DAG: vmovups %zmm3, {{.*}} -; AVX512-DAG: vmovups %zmm4, {{.*}} -; AVX512-DAG: vmovups %zmm5, {{.*}} +; AVX512-DAG: vmovdqu64 %zmm0, {{.*}} +; AVX512-DAG: vmovdqu64 %zmm1, {{.*}} +; AVX512-DAG: vmovdqu64 %zmm2, {{.*}} +; AVX512-DAG: vmovdqu64 %zmm3, {{.*}} +; AVX512-DAG: vmovdqu64 %zmm4, {{.*}} +; AVX512-DAG: vmovdqu64 %zmm5, {{.*}} ; CHECK: calll _puts @@ -92,12 +92,12 @@ ; AVX-DAG: vmovups {{.*}}, %ymm4 ; AVX-DAG: vmovups {{.*}}, %ymm5 -; AVX512-DAG: vmovups {{.*}}, %zmm0 -; AVX512-DAG: vmovups {{.*}}, %zmm1 -; AVX512-DAG: vmovups {{.*}}, %zmm2 -; AVX512-DAG: vmovups {{.*}}, %zmm3 -; AVX512-DAG: vmovups {{.*}}, %zmm4 -; AVX512-DAG: vmovups {{.*}}, %zmm5 +; AVX512-DAG: vmovdqu64 {{.*}}, %zmm0 +; AVX512-DAG: vmovdqu64 {{.*}}, %zmm1 +; AVX512-DAG: vmovdqu64 {{.*}}, %zmm2 +; AVX512-DAG: vmovdqu64 {{.*}}, %zmm3 +; AVX512-DAG: vmovdqu64 {{.*}}, %zmm4 +; AVX512-DAG: vmovdqu64 {{.*}}, %zmm5 ; CHECK-DAG: movl {{.*}}, %ecx ; CHECK-DAG: movl {{.*}}, %edx @@ -142,19 +142,19 @@ ; AVX-DAG: vmovups %ymm4, {{.*}} ; AVX-DAG: vmovups %ymm5, {{.*}} -; AVX512F-DAG: vmovups %zmm0, {{.*}} -; AVX512F-DAG: vmovups %zmm1, {{.*}} -; AVX512F-DAG: vmovups %zmm2, 
{{.*}} -; AVX512F-DAG: vmovups %zmm3, {{.*}} -; AVX512F-DAG: vmovups %zmm4, {{.*}} -; AVX512F-DAG: vmovups %zmm5, {{.*}} +; AVX512F-DAG: vmovdqu64 %zmm0, {{.*}} +; AVX512F-DAG: vmovdqu64 %zmm1, {{.*}} +; AVX512F-DAG: vmovdqu64 %zmm2, {{.*}} +; AVX512F-DAG: vmovdqu64 %zmm3, {{.*}} +; AVX512F-DAG: vmovdqu64 %zmm4, {{.*}} +; AVX512F-DAG: vmovdqu64 %zmm5, {{.*}} -; AVX512VL-DAG: vmovups %ymm0, {{.*}} -; AVX512VL-DAG: vmovups %ymm1, {{.*}} -; AVX512VL-DAG: vmovups %ymm2, {{.*}} -; AVX512VL-DAG: vmovups %ymm3, {{.*}} -; AVX512VL-DAG: vmovups %ymm4, {{.*}} -; AVX512VL-DAG: vmovups %ymm5, {{.*}} +; AVX512VL-DAG: vmovdqu %ymm0, {{.*}} +; AVX512VL-DAG: vmovdqu %ymm1, {{.*}} +; AVX512VL-DAG: vmovdqu %ymm2, {{.*}} +; AVX512VL-DAG: vmovdqu %ymm3, {{.*}} +; AVX512VL-DAG: vmovdqu %ymm4, {{.*}} +; AVX512VL-DAG: vmovdqu %ymm5, {{.*}} ; CHECK: calll _puts @@ -172,19 +172,19 @@ ; AVX-DAG: vmovups {{.*}}, %ymm4 ; AVX-DAG: vmovups {{.*}}, %ymm5 -; AVX512F-DAG: vmovups {{.*}}, %zmm0 -; AVX512F-DAG: vmovups {{.*}}, %zmm1 -; AVX512F-DAG: vmovups {{.*}}, %zmm2 -; AVX512F-DAG: vmovups {{.*}}, %zmm3 -; AVX512F-DAG: vmovups {{.*}}, %zmm4 -; AVX512F-DAG: vmovups {{.*}}, %zmm5 - -; AVX512VL-DAG: vmovups {{.*}}, %ymm0 -; AVX512VL-DAG: vmovups {{.*}}, %ymm1 -; AVX512VL-DAG: vmovups {{.*}}, %ymm2 -; AVX512VL-DAG: vmovups {{.*}}, %ymm3 -; AVX512VL-DAG: vmovups {{.*}}, %ymm4 -; AVX512VL-DAG: vmovups {{.*}}, %ymm5 +; AVX512F-DAG: vmovdqu64 {{.*}}, %zmm0 +; AVX512F-DAG: vmovdqu64 {{.*}}, %zmm1 +; AVX512F-DAG: vmovdqu64 {{.*}}, %zmm2 +; AVX512F-DAG: vmovdqu64 {{.*}}, %zmm3 +; AVX512F-DAG: vmovdqu64 {{.*}}, %zmm4 +; AVX512F-DAG: vmovdqu64 {{.*}}, %zmm5 + +; AVX512VL-DAG: vmovdqu {{.*}}, %ymm0 +; AVX512VL-DAG: vmovdqu {{.*}}, %ymm1 +; AVX512VL-DAG: vmovdqu {{.*}}, %ymm2 +; AVX512VL-DAG: vmovdqu {{.*}}, %ymm3 +; AVX512VL-DAG: vmovdqu {{.*}}, %ymm4 +; AVX512VL-DAG: vmovdqu {{.*}}, %ymm5 ; CHECK-DAG: movl {{.*}}, %ecx ; CHECK-DAG: movl {{.*}}, %edx diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll --- a/llvm/test/CodeGen/X86/nontemporal-2.ll +++ b/llvm/test/CodeGen/X86/nontemporal-2.ll @@ -108,16 +108,22 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v4f32: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1 ret void @@ -130,16 +136,22 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v4i32: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: 
vmovntps %xmm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1 store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1 @@ -153,16 +165,22 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v2f64: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1 ret void @@ -175,16 +193,22 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v2i64: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1 ret void @@ -197,16 +221,22 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v8i16: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1 ret void @@ -219,16 +249,22 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v16i8: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1 ret void @@ -244,17 +280,24 @@ ; 
SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v8f32: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1 @@ -269,17 +312,24 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v8i32: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1 @@ -294,17 +344,24 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v4f64: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1 @@ -319,17 +376,24 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v4i64: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x i64> zeroinitializer, <4 
x i64>* %dst, align 32, !nontemporal !1 @@ -344,17 +408,24 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v16i16: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1 @@ -369,17 +440,24 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v32i8: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_zero_v32i8: ; VLX: # %bb.0: -; VLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1 @@ -407,14 +485,19 @@ ; SSE41-NEXT: movss %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test_arg_f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_f32: ; VLX: # %bb.0: -; VLX-NEXT: vmovss %xmm0, (%rdi) +; VLX-NEXT: vmovd %xmm0, (%rdi) ; VLX-NEXT: retq store float %arg, float* %dst, align 1, !nontemporal !1 ret void @@ -455,14 +538,19 @@ ; SSE41-NEXT: movsd %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test_arg_f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_f64: ; VLX: # %bb.0: -; VLX-NEXT: vmovsd %xmm0, (%rdi) +; VLX-NEXT: vmovq %xmm0, (%rdi) ; VLX-NEXT: retq store double %arg, double* %dst, align 1, !nontemporal !1 ret void @@ -508,11 +596,17 @@ ; SSE41-NEXT: movntil %eax, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test_extract_f32: -; AVX: # %bb.0: -; AVX-NEXT: vextractps $1, %xmm0, %eax -; AVX-NEXT: movntil %eax, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_extract_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: movntil %eax, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_extract_f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrd $1, %xmm0, %eax +; AVX2-NEXT: movntil %eax, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_extract_f32: ; VLX: # 
%bb.0: @@ -545,15 +639,21 @@ ; SSE41-NEXT: movntil %eax, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test_extract_i32: -; AVX: # %bb.0: -; AVX-NEXT: vextractps $1, %xmm0, %eax -; AVX-NEXT: movntil %eax, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_extract_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: movntil %eax, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_extract_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrd $1, %xmm0, %eax +; AVX2-NEXT: movntil %eax, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_extract_i32: ; VLX: # %bb.0: -; VLX-NEXT: vextractps $1, %xmm0, %eax +; VLX-NEXT: vpextrd $1, %xmm0, %eax ; VLX-NEXT: movntil %eax, (%rdi) ; VLX-NEXT: retq %1 = extractelement <4 x i32> %arg, i32 1 @@ -637,14 +737,19 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v4f32: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1 ret void @@ -656,14 +761,19 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v4i32: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1 ret void @@ -675,14 +785,19 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v2f64: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1 ret void @@ -694,14 +809,19 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v2i64: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1 ret void @@ -713,14 +833,19 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: 
test_arg_v8i16: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1 ret void @@ -732,14 +857,19 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v16i8: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %xmm0, (%rdi) +; VLX-NEXT: vmovntdq %xmm0, (%rdi) ; VLX-NEXT: retq store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1 ret void @@ -754,15 +884,21 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v8f32: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1 @@ -776,15 +912,21 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v8i32: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1 @@ -798,15 +940,21 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v4f64: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1 @@ -820,15 +968,21 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v4i64: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1 @@ 
-842,15 +996,21 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v16i16: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1 @@ -864,15 +1024,21 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_arg_v32i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_arg_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_arg_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; VLX-LABEL: test_arg_v32i8: ; VLX: # %bb.0: -; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vmovntdq %ymm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1 diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll --- a/llvm/test/CodeGen/X86/nontemporal-3.ll +++ b/llvm/test/CodeGen/X86/nontemporal-3.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=SSE,SSE4A ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 @@ -448,18 +448,25 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v4f64_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v4f64_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v4f64_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) ; AVX512-NEXT: retq store <4 x double> zeroinitializer, <4 x double>* %dst, align 16, !nontemporal 
!1 ret void @@ -473,18 +480,25 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8f32_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8f32_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8f32_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) ; AVX512-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* %dst, align 16, !nontemporal !1 ret void @@ -498,18 +512,25 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v4i64_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v4i64_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v4i64_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) ; AVX512-NEXT: retq store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 16, !nontemporal !1 ret void @@ -523,18 +544,25 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8i32_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8i32_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8i32_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) ; AVX512-NEXT: retq store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 16, !nontemporal !1 ret void @@ -548,18 +576,25 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v16i16_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v16i16_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps 
%xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v16i16_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i16_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) ; AVX512-NEXT: retq store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 16, !nontemporal !1 ret void @@ -573,18 +608,25 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v32i8_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v32i8_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v32i8_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i8_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) ; AVX512-NEXT: retq store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 16, !nontemporal !1 ret void @@ -991,22 +1033,31 @@ ; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8f64_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX-NEXT: vmovntps %xmm0, 32(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8f64_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX1-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8f64_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 32(%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1 ret void @@ -1022,22 +1073,31 @@ ; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v16f32_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: vmovntps %xmm0, 48(%rdi) -; 
AVX-NEXT: vmovntps %xmm0, 32(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v16f32_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX1-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v16f32_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 32(%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1 ret void @@ -1053,22 +1113,31 @@ ; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8i64_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX-NEXT: vmovntps %xmm0, 32(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8i64_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX1-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8i64_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 32(%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1 ret void @@ -1084,22 +1153,31 @@ ; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v16i32_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX-NEXT: vmovntps %xmm0, 32(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v16i32_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX1-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v16i32_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 32(%rdi) +; AVX2-NEXT: retq ; ; 
AVX512-LABEL: test_zero_v16i32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1 ret void @@ -1115,22 +1193,31 @@ ; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v32i16_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX-NEXT: vmovntps %xmm0, 32(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v32i16_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX1-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v32i16_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 32(%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i16_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1 ret void @@ -1146,22 +1233,31 @@ ; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v64i8_align16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX-NEXT: vmovntps %xmm0, (%rdi) -; AVX-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX-NEXT: vmovntps %xmm0, 32(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v64i8_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovntps %xmm0, (%rdi) +; AVX1-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX1-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v64i8_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, (%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX2-NEXT: vmovntdq %xmm0, 32(%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v64i8_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) -; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, (%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntdq %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1 ret void @@ -1177,19 +1273,27 @@ ; SSE-NEXT: movntps 
%xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8f64_align32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8f64_align32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8f64_align32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f64_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1 @@ -1206,19 +1310,27 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v16f32_align32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v16f32_align32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v16f32_align32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1 @@ -1235,19 +1347,27 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v8i64_align32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v8i64_align32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v8i64_align32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1 @@ -1264,19 +1384,27 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: 
test_zero_v16i32_align32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v16i32_align32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v16i32_align32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1 @@ -1293,19 +1421,27 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v32i16_align32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v32i16_align32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v32i16_align32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i16_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1 @@ -1322,19 +1458,27 @@ ; SSE-NEXT: movntps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: test_zero_v64i8_align32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX-NEXT: vmovntps %ymm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_zero_v64i8_align32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovntps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_zero_v64i8_align32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX2-NEXT: vmovntdq %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_zero_v64i8_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntdq %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1 diff --git a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll +++ 
b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll @@ -16,10 +16,20 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2f64_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v2f64_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2f64_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2f64_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1 ret <2 x double> %1 } @@ -30,10 +40,20 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4f32_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v4f32_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4f32_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4f32_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1 ret <4 x float> %1 } @@ -44,10 +64,20 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i64_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i64_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1 ret <2 x i64> %1 } @@ -58,10 +88,20 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4i32_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v4i32_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i32_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i32_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1 ret <4 x i32> %1 } @@ -72,10 +112,20 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i16_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i16_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i16_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1 ret <8 x i16> %1 } @@ -86,10 +136,20 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i8_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i8_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i8_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i8_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <16 x i8>, 
<16 x i8>* %src, align 1, !nontemporal !1 ret <16 x i8> %1 } @@ -103,10 +163,20 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4f64_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v4f64_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4f64_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4f64_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1 ret <4 x double> %1 } @@ -118,10 +188,20 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8f32_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v8f32_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8f32_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8f32_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1 ret <8 x float> %1 } @@ -133,10 +213,20 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4i64_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v4i64_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1 ret <4 x i64> %1 } @@ -148,10 +238,20 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i32_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i32_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1 ret <8 x i32> %1 } @@ -163,10 +263,20 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i16_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i16_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1 ret <16 x i16> %1 } @@ -178,10 +288,20 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v32i8_align1: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v32i8_align1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8_align1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), 
%ymm0 +; AVX512-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1 ret <32 x i8> %1 } @@ -199,20 +319,50 @@ ; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v4f64_align16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: test_v4f64_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4f64_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4f64_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <4 x double>, <4 x double>* %src, align 16, !nontemporal !1 ret <4 x double> %1 } @@ -230,20 +380,50 @@ ; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8f32_align16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: test_v8f32_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8f32_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8f32_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; 
AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <8 x float>, <8 x float>* %src, align 16, !nontemporal !1 ret <8 x float> %1 } @@ -261,20 +441,50 @@ ; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v4i64_align16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: test_v4i64_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <4 x i64>, <4 x i64>* %src, align 16, !nontemporal !1 ret <4 x i64> %1 } @@ -292,20 +502,50 @@ ; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8i32_align16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i32_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, 
{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <8 x i32>, <8 x i32>* %src, align 16, !nontemporal !1 ret <8 x i32> %1 } @@ -323,20 +563,50 @@ ; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v16i16_align16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i16_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %src, align 16, !nontemporal !1 ret <16 x i16> %1 } @@ -354,20 +624,50 @@ ; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v32i8_align16: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) -; AVX-NEXT: vmovaps (%rsp), %ymm0 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: test_v32i8_align16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovaps (%rsp), %ymm0 
+; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8_align16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-32, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %ymm0 +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %src, align 16, !nontemporal !1 ret <32 x i8> %1 } @@ -391,13 +691,13 @@ ; ; AVX2-LABEL: test_v8f64_align1: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1 ret <8 x double> %1 @@ -420,13 +720,13 @@ ; ; AVX2-LABEL: test_v16f32_align1: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1 ret <16 x float> %1 @@ -449,13 +749,13 @@ ; ; AVX2-LABEL: test_v8i64_align1: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1 ret <8 x i64> %1 @@ -478,13 +778,13 @@ ; ; AVX2-LABEL: test_v16i32_align1: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1 ret <16 x i32> %1 @@ -507,13 +807,13 @@ ; ; AVX2-LABEL: test_v32i16_align1: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i16_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1 ret <32 x i16> %1 @@ -536,13 +836,13 @@ ; ; AVX2-LABEL: test_v64i8_align1: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i8_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1 ret <64 x i8> %1 @@ -599,8 +899,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -619,7 +919,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -678,8 +978,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -698,7 +998,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -757,8 +1057,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -777,7 +1077,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -836,8 +1136,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -856,7 +1156,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -915,8 +1215,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -935,7 +1235,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, 
{{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -994,8 +1294,8 @@ ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovaps (%rsp), %ymm0 -; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: vmovdqa (%rsp), %ymm0 +; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -1014,7 +1314,7 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1065,7 +1365,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1116,7 +1416,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1167,7 +1467,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1218,7 +1518,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1269,7 +1569,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq @@ -1320,7 +1620,7 @@ ; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -1387,14 +1387,19 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %xmm0 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512-NEXT: retq %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1 ret 
<4 x float> %1 @@ -1406,14 +1411,19 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %xmm0 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1 ret <4 x i32> %1 @@ -1425,14 +1435,19 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %xmm0 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512-NEXT: retq %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1 ret <2 x double> %1 @@ -1444,14 +1459,19 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %xmm0 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1 ret <2 x i64> %1 @@ -1463,14 +1483,19 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %xmm0 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1 ret <8 x i16> %1 @@ -1482,14 +1507,19 @@ ; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %xmm0 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 ; AVX512-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1 ret <16 x i8> %1 @@ -1504,14 +1534,19 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v8f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
test_unaligned_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v8f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: retq %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1 ret <8 x float> %1 @@ -1524,14 +1559,19 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: retq %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1 ret <8 x i32> %1 @@ -1544,14 +1584,19 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v4f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: retq %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1 ret <4 x double> %1 @@ -1564,14 +1609,19 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v4i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: retq %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1 ret <4 x i64> %1 @@ -1584,14 +1634,19 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1 ret <16 x i16> %1 @@ -1604,14 +1659,19 @@ ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v32i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1 ret <32 x 
i8> %1 @@ -1628,15 +1688,21 @@ ; SSE-NEXT: movups 48(%rdi), %xmm3 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v16f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1 ret <16 x float> %1 @@ -1651,15 +1717,21 @@ ; SSE-NEXT: movups 48(%rdi), %xmm3 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v16i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v16i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1 ret <16 x i32> %1 @@ -1674,15 +1746,21 @@ ; SSE-NEXT: movups 48(%rdi), %xmm3 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v8f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v8f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1 ret <8 x double> %1 @@ -1697,15 +1775,21 @@ ; SSE-NEXT: movups 48(%rdi), %xmm3 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v8i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v8i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1 ret <8 x i64> %1 @@ -1720,15 +1804,21 @@ ; SSE-NEXT: movups 48(%rdi), %xmm3 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v32i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v32i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 
32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1 ret <32 x i16> %1 @@ -1743,15 +1833,21 @@ ; SSE-NEXT: movups 48(%rdi), %xmm3 ; SSE-NEXT: retq ; -; AVX-LABEL: test_unaligned_v64i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %ymm0 -; AVX-NEXT: vmovups 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_unaligned_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_unaligned_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_unaligned_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1 ret <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -81,12 +81,19 @@ ; SSE42-NEXT: movlps %xmm0, (%rdi) ; SSE42-NEXT: retq ; -; AVX-LABEL: v3i32: -; AVX: # %bb.0: -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vextractps $1, %xmm0, 8(%rdi) -; AVX-NEXT: vmovlps %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: v3i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vextractps $1, %xmm0, 8(%rdi) +; AVX1-NEXT: vmovlps %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v3i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpextrd $1, %xmm0, 8(%rdi) +; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: retq ; ; XOP-LABEL: v3i32: ; XOP: # %bb.0: @@ -346,16 +353,27 @@ ; SSE42-NEXT: movdqa %xmm2, (%rdi) ; SSE42-NEXT: retq ; -; AVX-LABEL: v7i32: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] -; AVX-NEXT: vmovss %xmm1, 24(%rdi) -; AVX-NEXT: vmovlps %xmm0, 16(%rdi) -; AVX-NEXT: vmovaps %xmm2, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: v7i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1-NEXT: vmovss %xmm1, 24(%rdi) +; AVX1-NEXT: vmovlps %xmm0, 16(%rdi) +; AVX1-NEXT: vmovaps %xmm2, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: v7i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,2] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2-NEXT: vmovd %xmm1, 24(%rdi) +; AVX2-NEXT: vmovq %xmm0, 16(%rdi) +; AVX2-NEXT: vmovdqa %xmm2, (%rdi) +; AVX2-NEXT: retq ; ; XOP-LABEL: v7i32: ; XOP: # %bb.0: @@ -584,16 +602,16 @@ ; ; AVX2-FAST-LABEL: v12i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi) +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rdi) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rdi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1323,61 +1341,61 @@ ; ; AVX2-SLOW-LABEL: interleave_24i32_out: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] +; AVX2-SLOW-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi) -; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rdx) +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: interleave_24i32_out: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] +; AVX2-FAST-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups %ymm0, (%rcx) +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1548,58 +1566,58 @@ ; ; AVX2-SLOW-LABEL: interleave_24i32_in: ; AVX2-SLOW: # %bb.0: -; 
AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi) +; AVX2-SLOW-NEXT: vmovdqu (%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu (%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpbroadcastq (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpbroadcastq 24(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX2-SLOW-NEXT: vmovdqu %ymm4, 64(%rdi) +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rdi) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: interleave_24i32_in: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vmovdqu (%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu (%rcx), %ymm2 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm4, 64(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi) +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpbroadcastq (%rcx), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpbroadcastq 24(%rsi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX2-FAST-NEXT: vmovdqu %ymm4, 64(%rdi) +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rdi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -2024,11 +2042,11 @@ ; ; AVX2-SLOW-LABEL: splat_v3i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: splat_v3i32: @@ -2085,9 +2103,9 @@ ; ; AVX2-LABEL: wrongorder: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, 32(%rdi) -; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll 
b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -75,28 +75,52 @@ ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: PR40815: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX-NEXT: vmovaps %xmm3, (%rsi) -; AVX-NEXT: vmovaps %xmm0, 48(%rsi) -; AVX-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: PR40815: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-NEXT: vmovaps %xmm3, (%rsi) +; AVX1-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX1-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR40815: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa %xmm2, 16(%rsi) +; AVX2-NEXT: vmovdqa %xmm3, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 48(%rsi) +; AVX2-NEXT: vmovdqa %xmm1, 32(%rsi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: PR40815: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX512-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, 32(%rdi), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq +; +; XOP-LABEL: PR40815: +; XOP: # %bb.0: +; XOP-NEXT: vmovaps (%rdi), %xmm0 +; XOP-NEXT: vmovaps 16(%rdi), %xmm1 +; XOP-NEXT: vmovaps 32(%rdi), %xmm2 +; XOP-NEXT: vmovaps 48(%rdi), %xmm3 +; XOP-NEXT: vmovaps %xmm2, 16(%rsi) +; XOP-NEXT: vmovaps %xmm3, (%rsi) +; XOP-NEXT: vmovaps %xmm0, 48(%rsi) +; XOP-NEXT: vmovaps %xmm1, 32(%rsi) +; XOP-NEXT: retq %3 = bitcast %struct.Mat4* %0 to <16 x float>* %4 = load <16 x float>, <16 x float>* %3, align 64 %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> @@ -130,14 +154,23 @@ ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: retq ; -; AVX-LABEL: PR42819: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,1,2] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: PR42819: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,1,2] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR42819: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,1,2] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: PR42819: ; AVX512: # %bb.0: @@ -146,6 +179,15 @@ ; AVX512-NEXT: kmovw %eax, %k1 ; 
AVX512-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: retq +; +; XOP-LABEL: PR42819: +; XOP: # %bb.0: +; XOP-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,1,2] +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; XOP-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; XOP-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; XOP-NEXT: retq %1 = load <8 x i32>, <8 x i32>* %a0, align 4 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -60,18 +60,32 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax -; AVX-SLOW-NEXT: addl %ecx, %eax -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract23_v4i32_add_i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %eax +; AVX1-SLOW-NEXT: addl %ecx, %eax +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: extract_extract23_v4i32_add_i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-SLOW-NEXT: addl %ecx, %eax +; AVX2-SLOW-NEXT: retq +; +; AVX512-SLOW-LABEL: extract_extract23_v4i32_add_i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-SLOW-NEXT: addl %ecx, %eax +; AVX512-SLOW-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 2 %x1 = extractelement <4 x i32> %x, i32 3 %x01 = add i32 %x0, %x1 @@ -128,18 +142,32 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax -; AVX-SLOW-NEXT: addl %ecx, %eax -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %eax +; AVX1-SLOW-NEXT: addl %ecx, %eax +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32_commute: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-SLOW-NEXT: addl %ecx, %eax +; AVX2-SLOW-NEXT: retq +; +; AVX512-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-SLOW-NEXT: addl %ecx, %eax +; AVX512-SLOW-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 2 %x1 = extractelement <4 x i32> %x, i32 3 %x01 = add i32 %x1, %x0 @@ -340,18 +368,32 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract23_v4i32_sub_i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %eax -; 
AVX-SLOW-NEXT: vextractps $3, %xmm0, %ecx -; AVX-SLOW-NEXT: subl %ecx, %eax -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract23_v4i32_sub_i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %eax +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %ecx +; AVX1-SLOW-NEXT: subl %ecx, %eax +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: extract_extract23_v4i32_sub_i32: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: extract_extract23_v4i32_sub_i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %eax +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-SLOW-NEXT: subl %ecx, %eax +; AVX2-SLOW-NEXT: retq +; +; AVX512-SLOW-LABEL: extract_extract23_v4i32_sub_i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-SLOW-NEXT: subl %ecx, %eax +; AVX512-SLOW-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 2 %x1 = extractelement <4 x i32> %x, i32 3 %x01 = sub i32 %x0, %x1 @@ -389,12 +431,47 @@ ; SSE3-NEXT: subl %ecx, %eax ; SSE3-NEXT: retq ; -; AVX-LABEL: extract_extract23_v4i32_sub_i32_commute: -; AVX: # %bb.0: -; AVX-NEXT: vextractps $2, %xmm0, %ecx -; AVX-NEXT: vextractps $3, %xmm0, %eax -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract23_v4i32_sub_i32_commute: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %eax +; AVX1-SLOW-NEXT: subl %ecx, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: extract_extract23_v4i32_sub_i32_commute: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-FAST-NEXT: vextractps $3, %xmm0, %eax +; AVX1-FAST-NEXT: subl %ecx, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: extract_extract23_v4i32_sub_i32_commute: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-SLOW-NEXT: subl %ecx, %eax +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: extract_extract23_v4i32_sub_i32_commute: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-FAST-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-FAST-NEXT: subl %ecx, %eax +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: extract_extract23_v4i32_sub_i32_commute: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-SLOW-NEXT: subl %ecx, %eax +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: extract_extract23_v4i32_sub_i32_commute: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-FAST-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-FAST-NEXT: subl %ecx, %eax +; AVX512-FAST-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 2 %x1 = extractelement <4 x i32> %x, i32 3 %x01 = sub i32 %x1, %x0 @@ -571,13 +648,13 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax -; AVX-SLOW-NEXT: addl %ecx, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract23_v8i32_add_i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %eax +; AVX1-SLOW-NEXT: addl %ecx, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32: ; AVX-FAST: # %bb.0: @@ -585,6 
+662,22 @@ ; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: extract_extract23_v8i32_add_i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-SLOW-NEXT: addl %ecx, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX512-SLOW-LABEL: extract_extract23_v8i32_add_i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-SLOW-NEXT: addl %ecx, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq %x0 = extractelement <8 x i32> %x, i32 2 %x1 = extractelement <8 x i32> %x, i32 3 %x01 = add i32 %x0, %x1 @@ -608,14 +701,14 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax -; AVX-SLOW-NEXT: addl %ecx, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract67_v8i32_add_i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %eax +; AVX1-SLOW-NEXT: addl %ecx, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32: ; AVX1-FAST: # %bb.0: @@ -625,6 +718,15 @@ ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; +; AVX2-SLOW-LABEL: extract_extract67_v8i32_add_i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-SLOW-NEXT: addl %ecx, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; ; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -633,6 +735,15 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: extract_extract67_v8i32_add_i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-SLOW-NEXT: addl %ecx, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -698,13 +809,13 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax -; AVX-SLOW-NEXT: addl %ecx, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %eax +; AVX1-SLOW-NEXT: addl %ecx, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32_commute: ; AVX-FAST: # %bb.0: @@ -712,6 +823,22 @@ ; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-SLOW-NEXT: addl %ecx, %eax +; AVX2-SLOW-NEXT: vzeroupper +; 
AVX2-SLOW-NEXT: retq +; +; AVX512-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-SLOW-NEXT: addl %ecx, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq %x0 = extractelement <8 x i32> %x, i32 2 %x1 = extractelement <8 x i32> %x, i32 3 %x01 = add i32 %x1, %x0 @@ -735,14 +862,14 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax -; AVX-SLOW-NEXT: addl %ecx, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %ecx +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %eax +; AVX1-SLOW-NEXT: addl %ecx, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32_commute: ; AVX1-FAST: # %bb.0: @@ -752,6 +879,15 @@ ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; +; AVX2-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-SLOW-NEXT: addl %ecx, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; ; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32_commute: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -760,6 +896,15 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %ecx +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-SLOW-NEXT: addl %ecx, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32_commute: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -1133,13 +1278,13 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract23_v8i32_sub_i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %eax -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %ecx -; AVX-SLOW-NEXT: subl %ecx, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract23_v8i32_sub_i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %eax +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %ecx +; AVX1-SLOW-NEXT: subl %ecx, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: extract_extract23_v8i32_sub_i32: ; AVX-FAST: # %bb.0: @@ -1147,6 +1292,22 @@ ; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: extract_extract23_v8i32_sub_i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %eax +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-SLOW-NEXT: subl %ecx, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX512-SLOW-LABEL: extract_extract23_v8i32_sub_i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-SLOW-NEXT: subl %ecx, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq %x0 = 
extractelement <8 x i32> %x, i32 2 %x1 = extractelement <8 x i32> %x, i32 3 %x01 = sub i32 %x0, %x1 @@ -1170,14 +1331,14 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: extract_extract67_v8i32_sub_i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vextractps $2, %xmm0, %eax -; AVX-SLOW-NEXT: vextractps $3, %xmm0, %ecx -; AVX-SLOW-NEXT: subl %ecx, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: extract_extract67_v8i32_sub_i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vextractps $2, %xmm0, %eax +; AVX1-SLOW-NEXT: vextractps $3, %xmm0, %ecx +; AVX1-SLOW-NEXT: subl %ecx, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: extract_extract67_v8i32_sub_i32: ; AVX1-FAST: # %bb.0: @@ -1187,6 +1348,15 @@ ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; +; AVX2-SLOW-LABEL: extract_extract67_v8i32_sub_i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpextrd $2, %xmm0, %eax +; AVX2-SLOW-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-SLOW-NEXT: subl %ecx, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; ; AVX2-FAST-LABEL: extract_extract67_v8i32_sub_i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -1195,6 +1365,15 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: extract_extract67_v8i32_sub_i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-SLOW-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-SLOW-NEXT: vpextrd $3, %xmm0, %ecx +; AVX512-SLOW-NEXT: subl %ecx, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: extract_extract67_v8i32_sub_i32: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pmovsx-inreg.ll b/llvm/test/CodeGen/X86/pmovsx-inreg.ll --- a/llvm/test/CodeGen/X86/pmovsx-inreg.ll +++ b/llvm/test/CodeGen/X86/pmovsx-inreg.ll @@ -16,21 +16,29 @@ ; SSE41-NEXT: movdqu %xmm0, (%rsi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test1: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovups %xmm1, (%rax) -; AVX-NEXT: vmovdqu %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: test1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbq (%rdi), %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovups %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; X32-AVX2-LABEL: test1: ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxbq (%ecx), %xmm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %xmm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %xmm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX2-NEXT: retl %wide.load35 = load <2 x i8>, <2 x i8>* %in, align 1 @@ -65,8 +73,8 @@ ; AVX2-LABEL: test2: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq 
@@ -76,8 +84,8 @@ ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxbq (%ecx), %ymm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %ymm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %ymm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl @@ -97,21 +105,29 @@ ; SSE41-NEXT: movdqu %xmm0, (%rsi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test3: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovups %xmm1, (%rax) -; AVX-NEXT: vmovdqu %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: test3: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovups %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; X32-AVX2-LABEL: test3: ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxbd (%ecx), %xmm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %xmm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %xmm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX2-NEXT: retl %wide.load35 = load <4 x i8>, <4 x i8>* %in, align 1 @@ -146,8 +162,8 @@ ; AVX2-LABEL: test4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -157,8 +173,8 @@ ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxbd (%ecx), %ymm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %ymm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %ymm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl @@ -178,21 +194,29 @@ ; SSE41-NEXT: movdqu %xmm0, (%rsi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test5: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovups %xmm1, (%rax) -; AVX-NEXT: vmovdqu %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: test5: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovups %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test5: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; X32-AVX2-LABEL: test5: ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxbw (%ecx), %xmm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %xmm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %xmm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX2-NEXT: retl %wide.load35 = load <8 x i8>, <8 x i8>* %in, align 1 @@ -227,8 +251,8 @@ ; AVX2-LABEL: test6: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbw 
(%rdi), %ymm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -238,8 +262,8 @@ ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxbw (%ecx), %ymm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %ymm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %ymm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl @@ -259,21 +283,29 @@ ; SSE41-NEXT: movdqu %xmm0, (%rsi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test7: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovups %xmm1, (%rax) -; AVX-NEXT: vmovdqu %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: test7: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwq (%rdi), %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovups %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; X32-AVX2-LABEL: test7: ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxwq (%ecx), %xmm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %xmm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %xmm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX2-NEXT: retl %wide.load35 = load <2 x i16>, <2 x i16>* %in, align 1 @@ -308,8 +340,8 @@ ; AVX2-LABEL: test8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -319,8 +351,8 @@ ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxwq (%ecx), %ymm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %ymm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %ymm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl @@ -340,21 +372,29 @@ ; SSE41-NEXT: movdqu %xmm0, (%rsi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test9: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovups %xmm1, (%rax) -; AVX-NEXT: vmovdqu %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: test9: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovups %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test9: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; X32-AVX2-LABEL: test9: ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxwd (%ecx), %xmm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %xmm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %xmm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX2-NEXT: retl %wide.load35 = load <4 x i16>, <4 x i16>* %in, align 1 @@ -389,8 +429,8 @@ ; AVX2-LABEL: test10: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -400,8 +440,8 @@ ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxwd (%ecx), %ymm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %ymm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %ymm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl @@ -421,21 +461,29 @@ ; SSE41-NEXT: movdqu %xmm0, (%rsi) ; SSE41-NEXT: retq ; -; AVX-LABEL: test11: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovups %xmm1, (%rax) -; AVX-NEXT: vmovdqu %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: test11: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxdq (%rdi), %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovups %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: test11: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxdq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; X32-AVX2-LABEL: test11: ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxdq (%ecx), %xmm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %xmm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %xmm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %xmm0, (%eax) ; X32-AVX2-NEXT: retl %wide.load35 = load <2 x i32>, <2 x i32>* %in, align 1 @@ -470,8 +518,8 @@ ; AVX2-LABEL: test12: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0 -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -481,8 +529,8 @@ ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX2-NEXT: vpmovsxdq (%ecx), %ymm0 -; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovups %ymm1, (%eax) +; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqu %ymm1, (%eax) ; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -281,7 +281,7 @@ ; SSE2-NEXT: subq $40, %rsp ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: callq foo +; SSE2-NEXT: callq foo@PLT ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -299,7 +299,7 @@ ; SSE41-NEXT: subq $40, %rsp ; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE41-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE41-NEXT: callq foo +; SSE41-NEXT: 
callq foo@PLT ; SSE41-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE41-NEXT: pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE41-NEXT: addq $40, %rsp @@ -308,9 +308,9 @@ ; AVX-LABEL: mul_v4i32spill: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: callq foo +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: callq foo@PLT ; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: addq $40, %rsp @@ -328,7 +328,7 @@ ; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: callq foo +; SSE-NEXT: callq foo@PLT ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 @@ -347,9 +347,9 @@ ; AVX-LABEL: mul_v2i64spill: ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: callq foo +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: callq foo@PLT ; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX-NEXT: vpsrlq $32, %xmm3, %xmm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll --- a/llvm/test/CodeGen/X86/pr29112.ll +++ b/llvm/test/CodeGen/X86/pr29112.ll @@ -10,7 +10,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: vmovaps %xmm1, %xmm9 +; CHECK-NEXT: vmovdqa %xmm1, %xmm9 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17] ; CHECK-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14 @@ -54,9 +54,9 @@ ; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0 ; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm10, (%rsp) -; CHECK-NEXT: vmovaps %xmm9, %xmm3 +; CHECK-NEXT: vmovdqa %xmm9, %xmm3 ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq foo +; CHECK-NEXT: callq foo@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr30284.ll b/llvm/test/CodeGen/X86/pr30284.ll --- a/llvm/test/CodeGen/X86/pr30284.ll +++ b/llvm/test/CodeGen/X86/pr30284.ll @@ -19,14 +19,14 @@ ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 ; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 ; CHECK-NEXT: vpmovd2m %zmm0, %k1 -; CHECK-NEXT: vmovapd 0, %zmm0 -; CHECK-NEXT: vmovapd 64, %zmm1 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [68719476736,68719476736,68719476736,68719476736,68719476736,68719476736,68719476736,68719476736] +; CHECK-NEXT: vmovdqa64 0, %zmm0 +; CHECK-NEXT: vmovdqa64 64, %zmm1 +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm2 = [68719476736,68719476736,68719476736,68719476736,68719476736,68719476736,68719476736,68719476736] ; CHECK-NEXT: kshiftrw $8, %k1, %k2 -; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k2} -; CHECK-NEXT: vorpd %zmm2, %zmm0, %zmm0 {%k1} -; CHECK-NEXT: vmovapd %zmm0, 0 -; CHECK-NEXT: vmovapd %zmm1, 64 +; CHECK-NEXT: vporq %zmm2, %zmm1, 
%zmm1 {%k2} +; CHECK-NEXT: vporq %zmm2, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, 0 +; CHECK-NEXT: vmovdqa64 %zmm1, 64 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %a_load22 = load <16 x i64>, <16 x i64>* null, align 1 diff --git a/llvm/test/CodeGen/X86/pr34657.ll b/llvm/test/CodeGen/X86/pr34657.ll --- a/llvm/test/CodeGen/X86/pr34657.ll +++ b/llvm/test/CodeGen/X86/pr34657.ll @@ -5,12 +5,12 @@ ; CHECK-LABEL: pr34657: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups (%rsi), %zmm0 -; CHECK-NEXT: vmovups 64(%rsi), %ymm1 -; CHECK-NEXT: vmovups 96(%rsi), %xmm2 -; CHECK-NEXT: vmovaps %xmm2, 96(%rdi) -; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 +; CHECK-NEXT: vmovdqu 64(%rsi), %ymm1 +; CHECK-NEXT: vmovdqu 96(%rsi), %xmm2 +; CHECK-NEXT: vmovdqa %xmm2, 96(%rdi) +; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -7,10 +7,10 @@ ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: movl %eax, (%rax) -; CHECK-NEXT: vmovlps %xmm1, (%rax) +; CHECK-NEXT: vmovq %xmm1, (%rax) ; CHECK-NEXT: retq bb: %tmp = bitcast double %arg to i64 diff --git a/llvm/test/CodeGen/X86/pr44140.ll b/llvm/test/CodeGen/X86/pr44140.ll --- a/llvm/test/CodeGen/X86/pr44140.ll +++ b/llvm/test/CodeGen/X86/pr44140.ll @@ -15,34 +15,34 @@ ; CHECK: # %bb.0: # %start ; CHECK-NEXT: subq $584, %rsp # imm = 0x248 ; CHECK-NEXT: .cfi_def_cfa_offset 592 -; CHECK-NEXT: vmovaps {{.*#+}} xmm6 = [1010101010101010101,2020202020202020202] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm6 = [1010101010101010101,2020202020202020202] ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_1: # %fake-loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm7 -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm2 -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm3 -; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 -; CHECK-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm5 -; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm4 -; CHECK-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm0 +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm7 +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 +; CHECK-NEXT: vmovdqu %ymm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 +; CHECK-NEXT: vmovdqu %ymm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm2, {{[0-9]+}}(%rsp) +; 
CHECK-NEXT: vmovdqu %ymm7, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm2, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm7, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm5 +; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4 +; CHECK-NEXT: vmovdqu %ymm5, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu %ymm4, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq opaque -; CHECK-NEXT: vmovaps %xmm6, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqa %xmm6, {{[0-9]+}}(%rsp) ; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll --- a/llvm/test/CodeGen/X86/pr46532.ll +++ b/llvm/test/CodeGen/X86/pr46532.ll @@ -5,8 +5,8 @@ ; CHECK-LABEL: WhileWithLoopInvariantOperation.21: ; CHECK: # %bb.0: # %while.1.body.preheader ; CHECK-NEXT: movq (%rax), %rax -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 32(%rax) +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, 32(%rax) ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,0,0,0,0,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) while.1.body.preheader: diff --git a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll --- a/llvm/test/CodeGen/X86/pr46820.ll +++ b/llvm/test/CodeGen/X86/pr46820.ll @@ -11,15 +11,15 @@ ; CHECK-LABEL: load23: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups 64(%rsi), %ymm0 -; CHECK-NEXT: vmovups (%rsi), %zmm1 -; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss %xmm3, 88(%rdi) -; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm1, (%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) +; CHECK-NEXT: vmovdqu 64(%rsi), %ymm0 +; CHECK-NEXT: vmovdqu64 (%rsi), %zmm1 +; CHECK-NEXT: vmovdqa 64(%rsi), %xmm2 +; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd %xmm3, 88(%rdi) +; CHECK-NEXT: vmovdqa %xmm2, 64(%rdi) +; CHECK-NEXT: vmovdqa64 %zmm1, (%rdi) +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovq %xmm0, 80(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = load <23 x float>, <23 x float>* %p, align 16 @@ -32,14 +32,14 @@ ; CHECK-LABEL: load23_align_1: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups (%rsi), %zmm0 -; CHECK-NEXT: vmovups 64(%rsi), %xmm1 +; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 +; CHECK-NEXT: vmovdqu 64(%rsi), %xmm1 ; CHECK-NEXT: movq 80(%rsi), %rcx -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss %xmm2, 88(%rdi) +; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd %xmm2, 88(%rdi) ; CHECK-NEXT: movq %rcx, 80(%rdi) -; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovdqa %xmm1, 64(%rdi) +; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = load <23 x float>, <23 x float>* %p, align 1 diff --git a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll --- a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll +++ b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll @@ -56,7 +56,7 @@ ; ; AVX2-LABEL: trunc_shl_16_v8i16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: retq %shl = shl <8 x 
i32> %a, %conv = trunc <8 x i32> %shl to <8 x i16> @@ -71,7 +71,7 @@ ; ; AVX2-LABEL: trunc_shl_17_v8i16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: retq %shl = shl <8 x i32> %a, %conv = trunc <8 x i32> %shl to <8 x i16> diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -57,15 +57,15 @@ ; X86-LABEL: vrolq_extract_udiv: ; X86: # %bb.0: ; X86-NEXT: subl $44, %esp -; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vpextrd $1, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $3, {{[0-9]+}}(%esp) ; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrd $3, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vpextrd $2, %xmm0, (%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $3, {{[0-9]+}}(%esp) ; X86-NEXT: vmovd %eax, %xmm0 @@ -208,15 +208,15 @@ ; X86-LABEL: no_extract_udiv: ; X86: # %bb.0: ; X86-NEXT: subl $60, %esp -; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: vpextrd $1, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $3, {{[0-9]+}}(%esp) ; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrd $3, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vpextrd $2, %xmm0, (%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $3, {{[0-9]+}}(%esp) ; X86-NEXT: vmovd %eax, %xmm0 @@ -225,15 +225,15 @@ ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrd $1, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vmovd %xmm0, (%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 ; X86-NEXT: calll __udivdi3 -; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: vextractps $2, %xmm0, (%esp) +; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: vpextrd $3, %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: vpextrd $2, %xmm0, (%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302 ; X86-NEXT: vmovd %eax, %xmm0 diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- 
a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -85,12 +85,12 @@ ; ; XOPAVX2-LABEL: rot_v4i32_zero_non_splat: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_zero_non_splat: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> ) %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll @@ -699,8 +699,8 @@ ; X86-AVX512-WIN-LABEL: t_to_u32: ; X86-AVX512-WIN: # %bb.0: ; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-WIN-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfsi ; X86-AVX512-WIN-NEXT: addl $16, %esp ; X86-AVX512-WIN-NEXT: retl @@ -708,8 +708,8 @@ ; X86-AVX512-LIN-LABEL: t_to_u32: ; X86-AVX512-LIN: # %bb.0: ; X86-AVX512-LIN-NEXT: subl $28, %esp -; X86-AVX512-LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-LIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-LIN-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-LIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-LIN-NEXT: calll __fixunstfsi ; X86-AVX512-LIN-NEXT: addl $28, %esp ; X86-AVX512-LIN-NEXT: retl @@ -724,7 +724,7 @@ ; X64-AVX512-LIN-LABEL: t_to_u32: ; X64-AVX512-LIN: # %bb.0: ; X64-AVX512-LIN-NEXT: pushq %rax -; X64-AVX512-LIN-NEXT: callq __fixunstfsi +; X64-AVX512-LIN-NEXT: callq __fixunstfsi@PLT ; X64-AVX512-LIN-NEXT: popq %rcx ; X64-AVX512-LIN-NEXT: retq ; @@ -759,7 +759,7 @@ ; X64-SSE-LIN-LABEL: t_to_u32: ; X64-SSE-LIN: # %bb.0: ; X64-SSE-LIN-NEXT: pushq %rax -; X64-SSE-LIN-NEXT: callq __fixunstfsi +; X64-SSE-LIN-NEXT: callq __fixunstfsi@PLT ; X64-SSE-LIN-NEXT: popq %rcx ; X64-SSE-LIN-NEXT: retq ; @@ -791,8 +791,8 @@ ; X86-AVX512-WIN-LABEL: t_to_s32: ; X86-AVX512-WIN: # %bb.0: ; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-WIN-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfsi ; X86-AVX512-WIN-NEXT: addl $16, %esp ; X86-AVX512-WIN-NEXT: retl @@ -800,8 +800,8 @@ ; X86-AVX512-LIN-LABEL: t_to_s32: ; X86-AVX512-LIN: # %bb.0: ; X86-AVX512-LIN-NEXT: subl $28, %esp -; X86-AVX512-LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-LIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-LIN-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-LIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-LIN-NEXT: calll __fixtfsi ; X86-AVX512-LIN-NEXT: addl $28, %esp ; X86-AVX512-LIN-NEXT: retl @@ -816,7 +816,7 @@ ; X64-AVX512-LIN-LABEL: t_to_s32: ; X64-AVX512-LIN: # %bb.0: ; X64-AVX512-LIN-NEXT: pushq %rax -; X64-AVX512-LIN-NEXT: callq __fixtfsi +; X64-AVX512-LIN-NEXT: callq __fixtfsi@PLT ; X64-AVX512-LIN-NEXT: popq %rcx ; X64-AVX512-LIN-NEXT: retq ; @@ -851,7 +851,7 @@ ; X64-SSE-LIN-LABEL: t_to_s32: ; X64-SSE-LIN: # %bb.0: ; X64-SSE-LIN-NEXT: pushq %rax -; X64-SSE-LIN-NEXT: callq __fixtfsi +; X64-SSE-LIN-NEXT: callq __fixtfsi@PLT ; X64-SSE-LIN-NEXT: popq %rcx ; X64-SSE-LIN-NEXT: retq ; diff --git 
a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -41,7 +41,7 @@ define i64 @f_to_u64(float %a) nounwind { ; X86-AVX512DQVL-LABEL: f_to_u64: ; X86-AVX512DQVL: # %bb.0: -; X86-AVX512DQVL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512DQVL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512DQVL-NEXT: vcvttps2uqq %xmm0, %xmm0 ; X86-AVX512DQVL-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQVL-NEXT: vpextrd $1, %xmm0, %edx @@ -54,7 +54,7 @@ ; ; X86-AVX512DQ-LABEL: f_to_u64: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 ; X86-AVX512DQ-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQ-NEXT: vpextrd $1, %xmm0, %edx @@ -302,7 +302,7 @@ define i64 @f_to_s64(float %a) nounwind { ; X86-AVX512DQVL-LABEL: f_to_s64: ; X86-AVX512DQVL: # %bb.0: -; X86-AVX512DQVL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512DQVL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512DQVL-NEXT: vcvttps2qq %xmm0, %xmm0 ; X86-AVX512DQVL-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQVL-NEXT: vpextrd $1, %xmm0, %edx @@ -315,7 +315,7 @@ ; ; X86-AVX512DQ-LABEL: f_to_s64: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 ; X86-AVX512DQ-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQ-NEXT: vpextrd $1, %xmm0, %edx @@ -328,8 +328,8 @@ ; X86-AVX512F-WIN-NEXT: movl %esp, %ebp ; X86-AVX512F-WIN-NEXT: andl $-8, %esp ; X86-AVX512F-WIN-NEXT: subl $8, %esp -; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX512F-WIN-NEXT: vmovss %xmm0, (%esp) +; X86-AVX512F-WIN-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512F-WIN-NEXT: vmovd %xmm0, (%esp) ; X86-AVX512F-WIN-NEXT: flds (%esp) ; X86-AVX512F-WIN-NEXT: fisttpll (%esp) ; X86-AVX512F-WIN-NEXT: movl (%esp), %eax @@ -341,8 +341,8 @@ ; X86-AVX512F-LIN-LABEL: f_to_s64: ; X86-AVX512F-LIN: # %bb.0: ; X86-AVX512F-LIN-NEXT: subl $12, %esp -; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX512F-LIN-NEXT: vmovss %xmm0, (%esp) +; X86-AVX512F-LIN-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512F-LIN-NEXT: vmovd %xmm0, (%esp) ; X86-AVX512F-LIN-NEXT: flds (%esp) ; X86-AVX512F-LIN-NEXT: fisttpll (%esp) ; X86-AVX512F-LIN-NEXT: movl (%esp), %eax @@ -465,7 +465,7 @@ define i64 @d_to_u64(double %a) nounwind { ; X86-AVX512DQVL-LABEL: d_to_u64: ; X86-AVX512DQVL: # %bb.0: -; X86-AVX512DQVL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512DQVL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512DQVL-NEXT: vcvttpd2uqq %xmm0, %xmm0 ; X86-AVX512DQVL-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQVL-NEXT: vpextrd $1, %xmm0, %edx @@ -478,7 +478,7 @@ ; ; X86-AVX512DQ-LABEL: d_to_u64: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 ; X86-AVX512DQ-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQ-NEXT: vpextrd $1, %xmm0, %edx @@ -726,7 +726,7 @@ define i64 @d_to_s64(double %a) nounwind { ; X86-AVX512DQVL-LABEL: d_to_s64: ; X86-AVX512DQVL: # %bb.0: -; X86-AVX512DQVL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512DQVL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; 
X86-AVX512DQVL-NEXT: vcvttpd2qq %xmm0, %xmm0 ; X86-AVX512DQVL-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQVL-NEXT: vpextrd $1, %xmm0, %edx @@ -739,7 +739,7 @@ ; ; X86-AVX512DQ-LABEL: d_to_s64: ; X86-AVX512DQ: # %bb.0: -; X86-AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; X86-AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 ; X86-AVX512DQ-NEXT: vmovd %xmm0, %eax ; X86-AVX512DQ-NEXT: vpextrd $1, %xmm0, %edx @@ -752,8 +752,8 @@ ; X86-AVX512F-WIN-NEXT: movl %esp, %ebp ; X86-AVX512F-WIN-NEXT: andl $-8, %esp ; X86-AVX512F-WIN-NEXT: subl $8, %esp -; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX512F-WIN-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512F-WIN-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-WIN-NEXT: vmovq %xmm0, (%esp) ; X86-AVX512F-WIN-NEXT: fldl (%esp) ; X86-AVX512F-WIN-NEXT: fisttpll (%esp) ; X86-AVX512F-WIN-NEXT: movl (%esp), %eax @@ -765,8 +765,8 @@ ; X86-AVX512F-LIN-LABEL: d_to_s64: ; X86-AVX512F-LIN: # %bb.0: ; X86-AVX512F-LIN-NEXT: subl $12, %esp -; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX512F-LIN-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX512F-LIN-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512F-LIN-NEXT: vmovq %xmm0, (%esp) ; X86-AVX512F-LIN-NEXT: fldl (%esp) ; X86-AVX512F-LIN-NEXT: fisttpll (%esp) ; X86-AVX512F-LIN-NEXT: movl (%esp), %eax @@ -1400,8 +1400,8 @@ ; X86-AVX512-WIN-LABEL: t_to_u64: ; X86-AVX512-WIN: # %bb.0: ; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-WIN-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfdi ; X86-AVX512-WIN-NEXT: addl $16, %esp ; X86-AVX512-WIN-NEXT: retl @@ -1409,8 +1409,8 @@ ; X86-AVX512-LIN-LABEL: t_to_u64: ; X86-AVX512-LIN: # %bb.0: ; X86-AVX512-LIN-NEXT: subl $28, %esp -; X86-AVX512-LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-LIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-LIN-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-LIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-LIN-NEXT: calll __fixunstfdi ; X86-AVX512-LIN-NEXT: addl $28, %esp ; X86-AVX512-LIN-NEXT: retl @@ -1425,7 +1425,7 @@ ; X64-AVX512-LIN-LABEL: t_to_u64: ; X64-AVX512-LIN: # %bb.0: ; X64-AVX512-LIN-NEXT: pushq %rax -; X64-AVX512-LIN-NEXT: callq __fixunstfdi +; X64-AVX512-LIN-NEXT: callq __fixunstfdi@PLT ; X64-AVX512-LIN-NEXT: popq %rcx ; X64-AVX512-LIN-NEXT: retq ; @@ -1460,7 +1460,7 @@ ; X64-SSE-LIN-LABEL: t_to_u64: ; X64-SSE-LIN: # %bb.0: ; X64-SSE-LIN-NEXT: pushq %rax -; X64-SSE-LIN-NEXT: callq __fixunstfdi +; X64-SSE-LIN-NEXT: callq __fixunstfdi@PLT ; X64-SSE-LIN-NEXT: popq %rcx ; X64-SSE-LIN-NEXT: retq ; @@ -1492,8 +1492,8 @@ ; X86-AVX512-WIN-LABEL: t_to_s64: ; X86-AVX512-WIN: # %bb.0: ; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-WIN-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfdi ; X86-AVX512-WIN-NEXT: addl $16, %esp ; X86-AVX512-WIN-NEXT: retl @@ -1501,8 +1501,8 @@ ; X86-AVX512-LIN-LABEL: t_to_s64: ; X86-AVX512-LIN: # %bb.0: ; X86-AVX512-LIN-NEXT: subl $28, %esp -; X86-AVX512-LIN-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 -; X86-AVX512-LIN-NEXT: vmovups %xmm0, (%esp) +; X86-AVX512-LIN-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-LIN-NEXT: vmovdqu %xmm0, (%esp) ; X86-AVX512-LIN-NEXT: calll __fixtfdi ; 
X86-AVX512-LIN-NEXT: addl $28, %esp ; X86-AVX512-LIN-NEXT: retl @@ -1517,7 +1517,7 @@ ; X64-AVX512-LIN-LABEL: t_to_s64: ; X64-AVX512-LIN: # %bb.0: ; X64-AVX512-LIN-NEXT: pushq %rax -; X64-AVX512-LIN-NEXT: callq __fixtfdi +; X64-AVX512-LIN-NEXT: callq __fixtfdi@PLT ; X64-AVX512-LIN-NEXT: popq %rcx ; X64-AVX512-LIN-NEXT: retq ; @@ -1552,7 +1552,7 @@ ; X64-SSE-LIN-LABEL: t_to_s64: ; X64-SSE-LIN: # %bb.0: ; X64-SSE-LIN-NEXT: pushq %rax -; X64-SSE-LIN-NEXT: callq __fixtfdi +; X64-SSE-LIN-NEXT: callq __fixtfdi@PLT ; X64-SSE-LIN-NEXT: popq %rcx ; X64-SSE-LIN-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll --- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll +++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll @@ -298,9 +298,9 @@ ; AVX512DQVL_32-LABEL: u64_to_f: ; AVX512DQVL_32: # %bb.0: ; AVX512DQVL_32-NEXT: pushl %eax -; AVX512DQVL_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQVL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQVL_32-NEXT: vcvtuqq2ps %ymm0, %xmm0 -; AVX512DQVL_32-NEXT: vmovss %xmm0, (%esp) +; AVX512DQVL_32-NEXT: vmovd %xmm0, (%esp) ; AVX512DQVL_32-NEXT: flds (%esp) ; AVX512DQVL_32-NEXT: popl %eax ; AVX512DQVL_32-NEXT: vzeroupper @@ -314,9 +314,9 @@ ; AVX512DQ_32-LABEL: u64_to_f: ; AVX512DQ_32: # %bb.0: ; AVX512DQ_32-NEXT: pushl %eax -; AVX512DQ_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ_32-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; AVX512DQ_32-NEXT: vmovss %xmm0, (%esp) +; AVX512DQ_32-NEXT: vmovd %xmm0, (%esp) ; AVX512DQ_32-NEXT: flds (%esp) ; AVX512DQ_32-NEXT: popl %eax ; AVX512DQ_32-NEXT: vzeroupper @@ -329,14 +329,14 @@ ; AVX512F_32-NEXT: andl $-8, %esp ; AVX512F_32-NEXT: subl $16, %esp ; AVX512F_32-NEXT: movl 12(%ebp), %eax -; AVX512F_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F_32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512F_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F_32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) ; AVX512F_32-NEXT: shrl $31, %eax ; AVX512F_32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX512F_32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; AVX512F_32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX512F_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F_32-NEXT: vmovss %xmm0, (%esp) +; AVX512F_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F_32-NEXT: vmovd %xmm0, (%esp) ; AVX512F_32-NEXT: flds (%esp) ; AVX512F_32-NEXT: movl %ebp, %esp ; AVX512F_32-NEXT: popl %ebp @@ -427,9 +427,9 @@ ; AVX512DQVL_32-LABEL: s64_to_f: ; AVX512DQVL_32: # %bb.0: ; AVX512DQVL_32-NEXT: pushl %eax -; AVX512DQVL_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQVL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQVL_32-NEXT: vcvtqq2ps %ymm0, %xmm0 -; AVX512DQVL_32-NEXT: vmovss %xmm0, (%esp) +; AVX512DQVL_32-NEXT: vmovd %xmm0, (%esp) ; AVX512DQVL_32-NEXT: flds (%esp) ; AVX512DQVL_32-NEXT: popl %eax ; AVX512DQVL_32-NEXT: vzeroupper @@ -443,9 +443,9 @@ ; AVX512DQ_32-LABEL: s64_to_f: ; AVX512DQ_32: # %bb.0: ; AVX512DQ_32-NEXT: pushl %eax -; AVX512DQ_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ_32-NEXT: vcvtqq2ps %zmm0, %ymm0 -; AVX512DQ_32-NEXT: vmovss %xmm0, (%esp) +; AVX512DQ_32-NEXT: vmovd %xmm0, (%esp) ; AVX512DQ_32-NEXT: flds (%esp) ; AVX512DQ_32-NEXT: popl %eax ; AVX512DQ_32-NEXT: vzeroupper @@ -493,7 +493,7 @@ ; AVX512DQVL_32-NEXT: vmovd %eax, %xmm0 ; AVX512DQVL_32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512DQVL_32-NEXT: vcvtqq2ps %ymm0, %xmm0 -; AVX512DQVL_32-NEXT: vmovss 
%xmm0, (%esp) +; AVX512DQVL_32-NEXT: vmovd %xmm0, (%esp) ; AVX512DQVL_32-NEXT: flds (%esp) ; AVX512DQVL_32-NEXT: popl %eax ; AVX512DQVL_32-NEXT: vzeroupper @@ -515,7 +515,7 @@ ; AVX512DQ_32-NEXT: vmovd %eax, %xmm0 ; AVX512DQ_32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512DQ_32-NEXT: vcvtqq2ps %zmm0, %ymm0 -; AVX512DQ_32-NEXT: vmovss %xmm0, (%esp) +; AVX512DQ_32-NEXT: vmovd %xmm0, (%esp) ; AVX512DQ_32-NEXT: flds (%esp) ; AVX512DQ_32-NEXT: popl %eax ; AVX512DQ_32-NEXT: vzeroupper @@ -615,9 +615,9 @@ ; AVX512DQVL_32-NEXT: movl %esp, %ebp ; AVX512DQVL_32-NEXT: andl $-8, %esp ; AVX512DQVL_32-NEXT: subl $8, %esp -; AVX512DQVL_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQVL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQVL_32-NEXT: vcvtuqq2pd %ymm0, %ymm0 -; AVX512DQVL_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQVL_32-NEXT: vmovq %xmm0, (%esp) ; AVX512DQVL_32-NEXT: fldl (%esp) ; AVX512DQVL_32-NEXT: movl %ebp, %esp ; AVX512DQVL_32-NEXT: popl %ebp @@ -635,9 +635,9 @@ ; AVX512DQ_32-NEXT: movl %esp, %ebp ; AVX512DQ_32-NEXT: andl $-8, %esp ; AVX512DQ_32-NEXT: subl $8, %esp -; AVX512DQ_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ_32-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; AVX512DQ_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQ_32-NEXT: vmovq %xmm0, (%esp) ; AVX512DQ_32-NEXT: fldl (%esp) ; AVX512DQ_32-NEXT: movl %ebp, %esp ; AVX512DQ_32-NEXT: popl %ebp @@ -737,9 +737,9 @@ ; AVX512DQVL_32-NEXT: movl %esp, %ebp ; AVX512DQVL_32-NEXT: andl $-8, %esp ; AVX512DQVL_32-NEXT: subl $8, %esp -; AVX512DQVL_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQVL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQVL_32-NEXT: vcvtuqq2pd %ymm0, %ymm0 -; AVX512DQVL_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQVL_32-NEXT: vmovq %xmm0, (%esp) ; AVX512DQVL_32-NEXT: fldl (%esp) ; AVX512DQVL_32-NEXT: movl %ebp, %esp ; AVX512DQVL_32-NEXT: popl %ebp @@ -757,9 +757,9 @@ ; AVX512DQ_32-NEXT: movl %esp, %ebp ; AVX512DQ_32-NEXT: andl $-8, %esp ; AVX512DQ_32-NEXT: subl $8, %esp -; AVX512DQ_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ_32-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; AVX512DQ_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQ_32-NEXT: vmovq %xmm0, (%esp) ; AVX512DQ_32-NEXT: fldl (%esp) ; AVX512DQ_32-NEXT: movl %ebp, %esp ; AVX512DQ_32-NEXT: popl %ebp @@ -858,9 +858,9 @@ ; AVX512DQVL_32-NEXT: movl %esp, %ebp ; AVX512DQVL_32-NEXT: andl $-8, %esp ; AVX512DQVL_32-NEXT: subl $8, %esp -; AVX512DQVL_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQVL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQVL_32-NEXT: vcvtqq2pd %ymm0, %ymm0 -; AVX512DQVL_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQVL_32-NEXT: vmovq %xmm0, (%esp) ; AVX512DQVL_32-NEXT: fldl (%esp) ; AVX512DQVL_32-NEXT: movl %ebp, %esp ; AVX512DQVL_32-NEXT: popl %ebp @@ -878,9 +878,9 @@ ; AVX512DQ_32-NEXT: movl %esp, %ebp ; AVX512DQ_32-NEXT: andl $-8, %esp ; AVX512DQ_32-NEXT: subl $8, %esp -; AVX512DQ_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ_32-NEXT: vcvtqq2pd %zmm0, %zmm0 -; AVX512DQ_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQ_32-NEXT: vmovq %xmm0, (%esp) ; AVX512DQ_32-NEXT: fldl (%esp) ; AVX512DQ_32-NEXT: movl %ebp, %esp ; AVX512DQ_32-NEXT: popl %ebp @@ -945,7 +945,7 @@ ; AVX512DQVL_32-NEXT: vmovd %eax, %xmm0 ; AVX512DQVL_32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512DQVL_32-NEXT: vcvtqq2pd %ymm0, %ymm0 -; AVX512DQVL_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQVL_32-NEXT: vmovq 
%xmm0, (%esp) ; AVX512DQVL_32-NEXT: fldl (%esp) ; AVX512DQVL_32-NEXT: movl %ebp, %esp ; AVX512DQVL_32-NEXT: popl %ebp @@ -971,7 +971,7 @@ ; AVX512DQ_32-NEXT: vmovd %eax, %xmm0 ; AVX512DQ_32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512DQ_32-NEXT: vcvtqq2pd %zmm0, %zmm0 -; AVX512DQ_32-NEXT: vmovlps %xmm0, (%esp) +; AVX512DQ_32-NEXT: vmovq %xmm0, (%esp) ; AVX512DQ_32-NEXT: fldl (%esp) ; AVX512DQ_32-NEXT: movl %ebp, %esp ; AVX512DQ_32-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/select-of-fp-constants.ll b/llvm/test/CodeGen/X86/select-of-fp-constants.ll --- a/llvm/test/CodeGen/X86/select-of-fp-constants.ll +++ b/llvm/test/CodeGen/X86/select-of-fp-constants.ll @@ -32,7 +32,7 @@ ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: testl %edi, %edi ; X64-AVX-NEXT: sete %al -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-AVX-NEXT: retq %c = icmp eq i32 %x, 0 %r = select i1 %c, float 42.0, float 23.0 diff --git a/llvm/test/CodeGen/X86/sha.ll b/llvm/test/CodeGen/X86/sha.ll --- a/llvm/test/CodeGen/X86/sha.ll +++ b/llvm/test/CodeGen/X86/sha.ll @@ -107,10 +107,10 @@ ; ; AVX-LABEL: test_sha256rnds2rr: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, %xmm3 -; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, %xmm3 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 -; AVX-NEXT: vmovaps %xmm3, %xmm0 +; AVX-NEXT: vmovdqa %xmm3, %xmm0 ; AVX-NEXT: retq entry: %0 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) @@ -128,10 +128,10 @@ ; ; AVX-LABEL: test_sha256rnds2rm: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps %xmm0, %xmm2 -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, %xmm2 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm2 -; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm2, %xmm0 ; AVX-NEXT: retq entry: %0 = load <4 x i32>, <4 x i32>* %b @@ -196,7 +196,7 @@ ; AVX-LABEL: test_sha1rnds4_zero_extend: ; AVX: # %bb.0: # %entry ; AVX-NEXT: sha1rnds4 $3, (%rdi), %xmm0 -; AVX-NEXT: vmovaps %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, %xmm0 ; AVX-NEXT: retq entry: %0 = load <4 x i32>, <4 x i32>* %b diff --git a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll --- a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -18,8 +18,8 @@ define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind { ; AVX2-LABEL: foo4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> %res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> @@ -37,9 +37,9 @@ ; ; AVX2-FAST-LABEL: foo8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdi) ; AVX2-FAST-NEXT: retq %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> @@ -50,7 +50,7 @@ define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask: ; 
AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -60,7 +60,7 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -70,7 +70,7 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask3: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -80,10 +80,10 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-NEXT: vmovaps %xmm0, (%rdi) -; AVX2-NEXT: vmovaps %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -94,8 +94,8 @@ define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask5: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -87,16 +87,28 @@ ; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_to_v2i32_1: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_to_v2i32_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX1-NEXT: vmovlps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v4i32_to_v2i32_1: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4i32_to_v2i32_1: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: shuffle_v4i32_to_v2i32_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3] -; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,3,2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> diff --git 
a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -241,16 +241,28 @@ ; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_to_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_to_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vmovlps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v4i32_to_v2i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4i32_to_v2i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: shuffle_v4i32_to_v2i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512-NEXT: vmovlps %xmm0, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: retq %vec = load <4 x i32>, <4 x i32>* %L %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> @@ -265,16 +277,28 @@ ; SSE-NEXT: movq %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_v2i64_to_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v2i64_to_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vmovlps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v2i64_to_v2i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_to_v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v2i64_to_v2i32: @@ -285,8 +309,8 @@ ; ; AVX512BW-LABEL: trunc_v2i64_to_v2i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -311,9 +311,9 @@ ; ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/splat-const.ll b/llvm/test/CodeGen/X86/splat-const.ll --- a/llvm/test/CodeGen/X86/splat-const.ll +++ 
b/llvm/test/CodeGen/X86/splat-const.ll @@ -19,7 +19,7 @@ ; ; AVX2-LABEL: zero_vector: ; AVX2: # %bb.0: -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: retq %zero = insertelement <4 x i32> undef, i32 0, i32 0 %splat = shufflevector <4 x i32> %zero, <4 x i32> undef, <4 x i32> zeroinitializer @@ -43,7 +43,7 @@ ; ; AVX2-LABEL: const_vector: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] ; AVX2-NEXT: retq %const = insertelement <4 x i32> undef, i32 42, i32 0 %splat = shufflevector <4 x i32> %const, <4 x i32> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -408,7 +408,7 @@ ; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,1] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] ; AVX2-NEXT: retq entry: %0 = load <4 x i64>, <4 x i64>* bitcast (<3 x i64>* @A to <4 x i64>*), align 32 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -582,12 +582,12 @@ ; ; CHECK-AVX2-LABEL: test_srem_one_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_one_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -600,10 +600,20 @@ ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: test_srem_one_ne: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_srem_one_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_srem_one_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_srem_one_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -738,12 +748,12 @@ ; ; CHECK-AVX2-LABEL: test_srem_allones: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_allones: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, diff --git a/llvm/test/CodeGen/X86/sse-fsignum.ll b/llvm/test/CodeGen/X86/sse-fsignum.ll --- a/llvm/test/CodeGen/X86/sse-fsignum.ll +++ b/llvm/test/CodeGen/X86/sse-fsignum.ll @@ -202,7 +202,7 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 -; 
AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -216,7 +216,7 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512F-NEXT: vmovaps %ymm0, (%rdi) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -64,7 +64,7 @@ ; ; AVX512-LABEL: test_mm_and_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> @@ -818,7 +818,7 @@ ; X86-AVX512-LABEL: test_mm_cvtss_f32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: pushl %eax # encoding: [0x50] -; X86-AVX512-NEXT: vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24] +; X86-AVX512-NEXT: vmovd %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0x04,0x24] ; X86-AVX512-NEXT: flds (%esp) # encoding: [0xd9,0x04,0x24] ; X86-AVX512-NEXT: popl %eax # encoding: [0x58] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -1173,7 +1173,7 @@ ; X86-AVX512-LABEL: test_mm_load_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX512-NEXT: vmovdqa (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load_ps: @@ -1188,7 +1188,7 @@ ; ; X64-AVX512-LABEL: test_mm_load_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %res = load <4 x float>, <4 x float>* %arg0, align 16 @@ -1214,7 +1214,7 @@ ; X86-AVX512-LABEL: test_mm_load_ps1: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00] +; X86-AVX512-NEXT: vpbroadcastd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load_ps1: @@ -1232,7 +1232,7 @@ ; ; X64-AVX512-LABEL: test_mm_load_ps1: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07] +; X64-AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ld = load float, float* %a0, align 4 %res0 = insertelement <4 x float> undef, float %ld, i32 0 @@ -1260,7 +1260,7 @@ ; X86-AVX512-LABEL: test_mm_load_ss: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xfa,0x10,0x00] +; X86-AVX512-NEXT: vmovd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x00] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -1278,7 +1278,7 @@ ; ; X64-AVX512-LABEL: test_mm_load_ss: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; X64-AVX512-NEXT: vmovd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; X64-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ld = load float, float* %a0, align 1 @@ -1308,7 +1308,7 @@ ; X86-AVX512-LABEL: test_mm_load1_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00] +; X86-AVX512-NEXT: vpbroadcastd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load1_ps: @@ -1326,7 +1326,7 @@ ; ; X64-AVX512-LABEL: test_mm_load1_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07] +; X64-AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ld = load float, float* %a0, align 4 %res0 = insertelement <4 x float> undef, float %ld, i32 0 @@ -1447,7 +1447,7 @@ ; X86-AVX512-LABEL: test_mm_loadr_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $27, (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b] +; X86-AVX512-NEXT: vpshufd $27, (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0x00,0x1b] ; X86-AVX512-NEXT: # xmm0 = mem[3,2,1,0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -1466,7 +1466,7 @@ ; ; X64-AVX512-LABEL: test_mm_loadr_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpermilps $27, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b] +; X64-AVX512-NEXT: vpshufd $27, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0x07,0x1b] ; X64-AVX512-NEXT: # xmm0 = mem[3,2,1,0] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* @@ -1491,7 +1491,7 @@ ; X86-AVX512-LABEL: test_mm_loadu_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] +; X86-AVX512-NEXT: vmovdqu (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_loadu_ps: @@ -1506,7 +1506,7 @@ ; ; X64-AVX512-LABEL: test_mm_loadu_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; X64-AVX512-NEXT: vmovdqu (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %res = load <4 x float>, <4 x float>* %arg0, align 1 @@ -1600,11 +1600,17 @@ ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_move_ss: -; AVX: # %bb.0: -; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0x79,0x0c,0xc1,0x01] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_move_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_move_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vpblendd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x01] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %res } @@ -1624,7 +1630,7 @@ ; ; AVX512-LABEL: test_mm_movehl_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpckhpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x15,0xc0] +; AVX512-NEXT: vpunpckhqdq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6d,0xc0] ; AVX512-NEXT: # xmm0 = xmm1[1],xmm0[1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> @@ -1646,7 +1652,7 @@ ; ; AVX512-LABEL: test_mm_movelh_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; AVX512-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> @@ -1722,7 +1728,7 @@ ; ; AVX512-LABEL: test_mm_or_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> @@ -2110,9 +2116,9 @@ ; ; X86-AVX512-LABEL: test_mm_set_ps1: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; X86-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_set_ps1: @@ -2129,7 +2135,7 @@ ; ; X64-AVX512-LABEL: test_mm_set_ps1: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 @@ -2224,10 +2230,10 @@ ; ; X86-AVX512-LABEL: test_mm_set_ss: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9] -; 
X86-AVX512-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01] +; X86-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xef,0xc9] +; X86-AVX512-NEXT: vpblendd $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x01] ; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -2239,12 +2245,19 @@ ; X64-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; -; X64-AVX-LABEL: test_mm_set_ss: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9] -; X64-AVX-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01] -; X64-AVX-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] -; X64-AVX-NEXT: retq # encoding: [0xc3] +; X64-AVX1-LABEL: test_mm_set_ss: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9] +; X64-AVX1-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01] +; X64-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX1-NEXT: retq # encoding: [0xc3] +; +; X64-AVX512-LABEL: test_mm_set_ss: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xef,0xc9] +; X64-AVX512-NEXT: vpblendd $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x01] +; X64-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 %res2 = insertelement <4 x float> %res1, float 0.0, i32 2 @@ -2271,9 +2284,9 @@ ; ; X86-AVX512-LABEL: test_mm_set1_ps: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; X86-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_set1_ps: @@ -2290,7 +2303,7 @@ ; ; X64-AVX512-LABEL: test_mm_set1_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 @@ -2436,7 +2449,7 @@ ; ; AVX512-LABEL: test_mm_setzero_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0] +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] ret <4 x float> zeroinitializer } @@ -2589,7 +2602,7 @@ ; X86-AVX512-LABEL: test_mm_store_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: 
test_mm_store_ps: @@ -2604,7 +2617,7 @@ ; ; X64-AVX512-LABEL: test_mm_store_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 16 @@ -2631,8 +2644,8 @@ ; X86-AVX512-LABEL: test_mm_store_ps1: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_ps1: @@ -2651,8 +2664,8 @@ ; ; X64-AVX512-LABEL: test_mm_store_ps1: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer @@ -2676,7 +2689,7 @@ ; X86-AVX512-LABEL: test_mm_store_ss: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovss %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00] +; X86-AVX512-NEXT: vmovd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_ss: @@ -2691,7 +2704,7 @@ ; ; X64-AVX512-LABEL: test_mm_store_ss: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07] +; X64-AVX512-NEXT: vmovd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ext = extractelement <4 x float> %a1, i32 0 store float %ext, float* %a0, align 1 @@ -2718,8 +2731,8 @@ ; X86-AVX512-LABEL: test_mm_store1_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store1_ps: @@ -2738,8 +2751,8 @@ ; ; X64-AVX512-LABEL: test_mm_store1_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression 
encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer @@ -2886,7 +2899,7 @@ ; X86-AVX512-LABEL: test_mm_storel_pi: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00] +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE1-LABEL: test_mm_storel_pi: @@ -2937,7 +2950,7 @@ ; X86-AVX512-LABEL: test_mm_storel_pi2: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00] +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storel_pi2: @@ -2952,7 +2965,7 @@ ; ; X64-AVX512-LABEL: test_mm_storel_pi2: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovlps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x07] +; X64-AVX512-NEXT: vmovq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ptr = bitcast x86_mmx* %a0 to <2 x float>* %ext = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> @@ -2980,9 +2993,9 @@ ; X86-AVX512-LABEL: test_mm_storer_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X86-AVX512-NEXT: vpshufd $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0x1b] ; X86-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storer_ps: @@ -3001,9 +3014,9 @@ ; ; X64-AVX512-LABEL: test_mm_storer_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X64-AVX512-NEXT: vpshufd $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0x1b] ; X64-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> @@ -3027,7 +3040,7 @@ ; X86-AVX512-LABEL: test_mm_storeu_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512-NEXT: vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00] ; X86-AVX512-NEXT: retl # 
encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storeu_ps: @@ -3042,7 +3055,7 @@ ; ; X64-AVX512-LABEL: test_mm_storeu_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512-NEXT: vmovdqu %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 1 @@ -3065,7 +3078,7 @@ ; X86-AVX512-LABEL: test_mm_stream_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00] +; X86-AVX512-NEXT: vmovntdq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_stream_ps: @@ -3080,7 +3093,7 @@ ; ; X64-AVX512-LABEL: test_mm_stream_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07] +; X64-AVX512-NEXT: vmovntdq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0 @@ -3208,30 +3221,30 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-AVX512-NEXT: vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06] -; X86-AVX512-NEXT: vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a] -; X86-AVX512-NEXT: vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18] -; X86-AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1] +; X86-AVX512-NEXT: vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06] +; X86-AVX512-NEXT: vmovdqa (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0a] +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x11] +; X86-AVX512-NEXT: vmovdqa (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x18] +; X86-AVX512-NEXT: vpunpckldq %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xe1] ; X86-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-AVX512-NEXT: vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb] +; X86-AVX512-NEXT: vpunpckldq %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xeb] ; X86-AVX512-NEXT: # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X86-AVX512-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; X86-AVX512-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xc1] ; X86-AVX512-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-AVX512-NEXT: vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb] +; X86-AVX512-NEXT: vpunpckhdq %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x6a,0xcb] ; X86-AVX512-NEXT: # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; 
X86-AVX512-NEXT: vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5] +; X86-AVX512-NEXT: vpunpcklqdq %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x6c,0xd5] ; X86-AVX512-NEXT: # xmm2 = xmm4[0],xmm5[0] -; X86-AVX512-NEXT: vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd] +; X86-AVX512-NEXT: vpunpckhqdq %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x6d,0xdd] ; X86-AVX512-NEXT: # xmm3 = xmm4[1],xmm5[1] -; X86-AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1] +; X86-AVX512-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xe1] ; X86-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0] -; X86-AVX512-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; X86-AVX512-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xc1] ; X86-AVX512-NEXT: # xmm0 = xmm0[1],xmm1[1] -; X86-AVX512-NEXT: vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16] -; X86-AVX512-NEXT: vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a] -; X86-AVX512-NEXT: vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovdqa %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x16] +; X86-AVX512-NEXT: vmovdqa %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x1a] +; X86-AVX512-NEXT: vmovdqa %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x21] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: popl %esi # encoding: [0x5e] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -3297,30 +3310,30 @@ ; ; X64-AVX512-LABEL: test_MM_TRANSPOSE4_PS: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] -; X64-AVX512-NEXT: vmovaps (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0e] -; X64-AVX512-NEXT: vmovaps (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x12] -; X64-AVX512-NEXT: vmovaps (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x19] -; X64-AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1] +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] +; X64-AVX512-NEXT: vmovdqa (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0e] +; X64-AVX512-NEXT: vmovdqa (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x12] +; X64-AVX512-NEXT: vmovdqa (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x19] +; X64-AVX512-NEXT: vpunpckldq %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xe1] ; X64-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-AVX512-NEXT: vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb] +; X64-AVX512-NEXT: vpunpckldq %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xeb] ; X64-AVX512-NEXT: # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64-AVX512-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; X64-AVX512-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x6a,0xc1] ; X64-AVX512-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-AVX512-NEXT: vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb] +; X64-AVX512-NEXT: vpunpckhdq %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x6a,0xcb] ; X64-AVX512-NEXT: # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X64-AVX512-NEXT: vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5] +; X64-AVX512-NEXT: vpunpcklqdq %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x6c,0xd5] ; X64-AVX512-NEXT: # xmm2 = xmm4[0],xmm5[0] -; X64-AVX512-NEXT: vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd] +; X64-AVX512-NEXT: vpunpckhqdq %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x6d,0xdd] ; X64-AVX512-NEXT: # xmm3 = xmm4[1],xmm5[1] -; X64-AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1] +; X64-AVX512-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xe1] ; X64-AVX512-NEXT: # xmm4 = xmm0[0],xmm1[0] -; X64-AVX512-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; X64-AVX512-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xc1] ; X64-AVX512-NEXT: # xmm0 = xmm0[1],xmm1[1] -; X64-AVX512-NEXT: vmovaps %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x17] -; X64-AVX512-NEXT: vmovaps %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1e] -; X64-AVX512-NEXT: vmovaps %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x22] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x01] +; X64-AVX512-NEXT: vmovdqa %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x17] +; X64-AVX512-NEXT: vmovdqa %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x1e] +; X64-AVX512-NEXT: vmovdqa %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x22] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x01] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %row0 = load <4 x float>, <4 x float>* %a0, align 16 %row1 = load <4 x float>, <4 x float>* %a1, align 16 @@ -3531,7 +3544,7 @@ ; ; AVX512-LABEL: test_mm_unpackhi_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; AVX512-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> @@ -3553,7 +3566,7 @@ ; ; AVX512-LABEL: test_mm_unpacklo_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; AVX512-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> @@ -3573,7 +3586,7 @@ ; ; AVX512-LABEL: test_mm_xor_ps: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xef,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll @@ -65,7 +65,7 @@ ; X86-AVX512-LABEL: test_x86_sse_storeu_ps: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512-NEXT: vmovdqu %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: test_x86_sse_storeu_ps: @@ -80,7 +80,7 @@ ; ; X64-AVX512-LABEL: test_x86_sse_storeu_ps: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1) ret void diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c @@ -85,10 +85,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: test_mm_loadu_si64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: test_mm_loadu_si64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_mm_loadu_si64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: retq %ld = load i64, i64* %a0, align 1 %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0 %res1 = insertelement <2 x i64> %res0, i64 0, i32 1 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -239,7 +239,7 @@ ; ; AVX512-LABEL: test_mm_and_pd: ; AVX512: # %bb.0: -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x double> %a0 to <4 x i32> %arg1 = 
bitcast <2 x double> %a1 to <4 x i32> @@ -261,7 +261,7 @@ ; ; AVX512-LABEL: test_mm_and_si128: ; AVX512: # %bb.0: -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = and <2 x i64> %a0, %a1 ret <2 x i64> %res @@ -1488,7 +1488,7 @@ ; X86-AVX512-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5] ; X86-AVX512-NEXT: andl $-8, %esp # encoding: [0x83,0xe4,0xf8] ; X86-AVX512-NEXT: subl $8, %esp # encoding: [0x83,0xec,0x08] -; X86-AVX512-NEXT: vmovlps %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x04,0x24] +; X86-AVX512-NEXT: vmovq %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x04,0x24] ; X86-AVX512-NEXT: fldl (%esp) # encoding: [0xdd,0x04,0x24] ; X86-AVX512-NEXT: movl %ebp, %esp # encoding: [0x89,0xec] ; X86-AVX512-NEXT: popl %ebp # encoding: [0x5d] @@ -1649,7 +1649,7 @@ ; ; X86-AVX512-LABEL: test_mm_cvtsi32_si128: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -1888,7 +1888,7 @@ ; X86-AVX512-LABEL: test_mm_load_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX512-NEXT: vmovdqa (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load_pd: @@ -1903,7 +1903,7 @@ ; ; X64-AVX512-LABEL: test_mm_load_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double* %a0 to <2 x double>* %res = load <2 x double>, <2 x double>* %arg0, align 16 @@ -1928,7 +1928,7 @@ ; X86-AVX512-LABEL: test_mm_load_sd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512-NEXT: vmovq (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x00] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -1946,7 +1946,7 @@ ; ; X64-AVX512-LABEL: test_mm_load_sd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] +; X64-AVX512-NEXT: vmovq (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x07] ; X64-AVX512-NEXT: # xmm0 = mem[0],zero ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ld = load double, double* %a0, align 1 @@ -1971,7 +1971,7 @@ ; X86-AVX512-LABEL: test_mm_load_si128: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X86-AVX512-NEXT: vmovdqa (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; 
X64-SSE-LABEL: test_mm_load_si128: @@ -1986,7 +1986,7 @@ ; ; X64-AVX512-LABEL: test_mm_load_si128: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res = load <2 x i64>, <2 x i64>* %a0, align 16 ret <2 x i64> %res @@ -2012,8 +2012,7 @@ ; X86-AVX512-LABEL: test_mm_load1_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovddup (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x00] -; X86-AVX512-NEXT: # xmm0 = mem[0,0] +; X86-AVX512-NEXT: vpbroadcastq (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load1_pd: @@ -2032,8 +2031,7 @@ ; ; X64-AVX512-LABEL: test_mm_load1_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07] -; X64-AVX512-NEXT: # xmm0 = mem[0,0] +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ld = load double, double* %a0, align 8 %res0 = insertelement <2 x double> undef, double %ld, i32 0 @@ -2103,7 +2101,7 @@ ; X86-AVX512-LABEL: test_mm_loadl_epi64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512-NEXT: vmovq (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x00] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -2121,7 +2119,7 @@ ; ; X64-AVX512-LABEL: test_mm_loadl_epi64: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] +; X64-AVX512-NEXT: vmovq (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x07] ; X64-AVX512-NEXT: # xmm0 = mem[0],zero ; X64-AVX512-NEXT: retq # encoding: [0xc3] %bc = bitcast <2 x i64>* %a1 to i64* @@ -2238,7 +2236,7 @@ ; X86-AVX512-LABEL: test_mm_loadu_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] +; X86-AVX512-NEXT: vmovdqu (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_loadu_pd: @@ -2253,7 +2251,7 @@ ; ; X64-AVX512-LABEL: test_mm_loadu_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; X64-AVX512-NEXT: vmovdqu (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double* %a0 to <2 x double>* %res = load <2 x double>, <2 x double>* %arg0, align 1 @@ -2276,7 +2274,7 @@ ; X86-AVX512-LABEL: test_mm_loadu_si128: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00] +; X86-AVX512-NEXT: vmovdqu (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; 
X64-SSE-LABEL: test_mm_loadu_si128: @@ -2291,7 +2289,7 @@ ; ; X64-AVX512-LABEL: test_mm_loadu_si128: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07] +; X64-AVX512-NEXT: vmovdqu (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res = load <2 x i64>, <2 x i64>* %a0, align 1 ret <2 x i64> %res @@ -2315,7 +2313,7 @@ ; X86-AVX512-LABEL: test_mm_loadu_si64: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512-NEXT: vmovq (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x00] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -2333,7 +2331,7 @@ ; ; X64-AVX512-LABEL: test_mm_loadu_si64: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] +; X64-AVX512-NEXT: vmovq (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x07] ; X64-AVX512-NEXT: # xmm0 = mem[0],zero ; X64-AVX512-NEXT: retq # encoding: [0xc3] entry: @@ -2361,7 +2359,7 @@ ; X86-AVX512-LABEL: test_mm_loadu_si32: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00] +; X86-AVX512-NEXT: vmovd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x00] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -2379,7 +2377,7 @@ ; ; X64-AVX512-LABEL: test_mm_loadu_si32: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; X64-AVX512-NEXT: vmovd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; X64-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero ; X64-AVX512-NEXT: retq # encoding: [0xc3] entry: @@ -2704,11 +2702,17 @@ ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_move_sd: -; AVX: # %bb.0: -; AVX-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03] -; AVX-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_move_sd: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03] +; AVX1-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_move_sd: +; AVX512: # %bb.0: +; AVX512-NEXT: vpblendd $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x03] +; AVX512-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ext0 = extractelement <2 x double> %a1, i32 0 %res0 = insertelement <2 x double> undef, double %ext0, i32 0 %ext1 = extractelement <2 x double> %a0, i32 1 @@ -2895,7 +2899,7 @@ ; ; AVX512-LABEL: test_mm_or_pd: ; AVX512: # %bb.0: -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x double> %a0 to <4 x i32> %arg1 = bitcast <2 x double> %a1 to <4 x i32> @@ -2917,7 +2921,7 @@ ; ; AVX512-LABEL: 
test_mm_or_si128: ; AVX512: # %bb.0: -; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = or <2 x i64> %a0, %a1 ret <2 x i64> %res @@ -3626,11 +3630,11 @@ ; ; X86-AVX512-LABEL: test_mm_set_pd: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c] +; X86-AVX512-NEXT: vmovq {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x0c] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04] +; X86-AVX512-NEXT: vmovq {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x4c,0x24,0x04] ; X86-AVX512-NEXT: # xmm1 = mem[0],zero -; X86-AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; X86-AVX512-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] ; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -3649,7 +3653,7 @@ ; ; X64-AVX512-LABEL: test_mm_set_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] +; X64-AVX512-NEXT: vpunpcklqdq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0] ; X64-AVX512-NEXT: # xmm0 = xmm1[0],xmm0[0] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <2 x double> undef, double %a1, i32 0 @@ -3676,10 +3680,9 @@ ; ; X86-AVX512-LABEL: test_mm_set_pd1: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovq {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x04] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X86-AVX512-NEXT: # xmm0 = xmm0[0,0] +; X86-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_set_pd1: @@ -3696,8 +3699,7 @@ ; ; X64-AVX512-LABEL: test_mm_set_pd1: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X64-AVX512-NEXT: # xmm0 = xmm0[0,0] +; X64-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <2 x double> undef, double %a0, i32 0 %res1 = insertelement <2 x double> %res0, double %a0, i32 1 @@ -4006,10 +4008,9 @@ ; ; X86-AVX512-LABEL: test_mm_set1_pd: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovq {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x04] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X86-AVX512-NEXT: # xmm0 = xmm0[0,0] +; X86-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; X86-AVX512-NEXT: retl # encoding: 
[0xc3] ; ; X64-SSE-LABEL: test_mm_set1_pd: @@ -4026,8 +4027,7 @@ ; ; X64-AVX512-LABEL: test_mm_set1_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X64-AVX512-NEXT: # xmm0 = xmm0[0,0] +; X64-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <2 x double> undef, double %a0, i32 0 %res1 = insertelement <2 x double> %res0, double %a0, i32 1 @@ -4636,11 +4636,11 @@ ; ; X86-AVX512-LABEL: test_mm_setr_pd: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c] +; X86-AVX512-NEXT: vmovq {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x0c] ; X86-AVX512-NEXT: # xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04] +; X86-AVX512-NEXT: vmovq {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x4c,0x24,0x04] ; X86-AVX512-NEXT: # xmm1 = mem[0],zero -; X86-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] +; X86-AVX512-NEXT: vpunpcklqdq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0] ; X86-AVX512-NEXT: # xmm0 = xmm1[0],xmm0[0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -4658,7 +4658,7 @@ ; ; X64-AVX512-LABEL: test_mm_setr_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; X64-AVX512-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] ; X64-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <2 x double> undef, double %a0, i32 0 @@ -4679,7 +4679,7 @@ ; ; AVX512-LABEL: test_mm_setzero_pd: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0] +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] ret <2 x double> zeroinitializer } @@ -4697,7 +4697,7 @@ ; ; AVX512-LABEL: test_mm_setzero_si128: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0] +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] ret <2 x i64> zeroinitializer } @@ -4717,7 +4717,7 @@ ; ; AVX512-LABEL: test_mm_shuffle_epi32: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %res = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer @@ -5317,7 +5317,7 @@ ; X86-AVX512-LABEL: test_mm_store_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: 
test_mm_store_pd: @@ -5332,7 +5332,7 @@ ; ; X64-AVX512-LABEL: test_mm_store_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double* %a0 to <2 x double>* store <2 x double> %a1, <2 x double>* %arg0, align 16 @@ -5359,9 +5359,8 @@ ; X86-AVX512-LABEL: test_mm_store_pd1: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X86-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_pd1: @@ -5380,9 +5379,8 @@ ; ; X64-AVX512-LABEL: test_mm_store_pd1: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X64-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double * %a0 to <2 x double>* %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer @@ -5406,7 +5404,7 @@ ; X86-AVX512-LABEL: test_mm_store_sd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x00] +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_sd: @@ -5421,7 +5419,7 @@ ; ; X64-AVX512-LABEL: test_mm_store_sd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] +; X64-AVX512-NEXT: vmovq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ext = extractelement <2 x double> %a1, i32 0 store double %ext, double* %a0, align 1 @@ -5444,7 +5442,7 @@ ; X86-AVX512-LABEL: test_mm_store_si128: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store_si128: @@ -5459,7 +5457,7 @@ ; ; X64-AVX512-LABEL: test_mm_store_si128: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] store <2 x i64> %a1, <2 x i64>* %a0, align 16 ret void @@ -5485,9 +5483,8 
@@ ; X86-AVX512-LABEL: test_mm_store1_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X86-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] +; X86-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_store1_pd: @@ -5506,9 +5503,8 @@ ; ; X64-AVX512-LABEL: test_mm_store1_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] -; X64-AVX512-NEXT: # xmm0 = xmm0[0,0] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] +; X64-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double * %a0 to <2 x double>* %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer @@ -5582,7 +5578,7 @@ ; X86-AVX512-LABEL: test_mm_storel_epi64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00] +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storel_epi64: @@ -5624,7 +5620,7 @@ ; X86-AVX512-LABEL: test_mm_storel_sd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x00] +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storel_sd: @@ -5639,7 +5635,7 @@ ; ; X64-AVX512-LABEL: test_mm_storel_sd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] +; X64-AVX512-NEXT: vmovq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %ext = extractelement <2 x double> %a1, i32 0 store double %ext, double* %a0, align 8 @@ -5713,7 +5709,7 @@ ; X86-AVX512-LABEL: test_mm_storeu_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512-NEXT: vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storeu_pd: @@ -5728,7 +5724,7 @@ ; ; X64-AVX512-LABEL: test_mm_storeu_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512-NEXT: vmovdqu %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double* %a0 to <2 x double>* store <2 x double> %a1, <2 x double>* %arg0, 
align 1 @@ -5751,7 +5747,7 @@ ; X86-AVX512-LABEL: test_mm_storeu_si128: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512-NEXT: vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storeu_si128: @@ -5766,7 +5762,7 @@ ; ; X64-AVX512-LABEL: test_mm_storeu_si128: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512-NEXT: vmovdqu %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] store <2 x i64> %a1, <2 x i64>* %a0, align 1 ret void @@ -5788,7 +5784,7 @@ ; X86-AVX512-LABEL: test_mm_storeu_si64: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00] +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_storeu_si64: @@ -5925,7 +5921,7 @@ ; X86-AVX512-LABEL: test_mm_stream_pd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00] +; X86-AVX512-NEXT: vmovntdq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_stream_pd: @@ -5940,7 +5936,7 @@ ; ; X64-AVX512-LABEL: test_mm_stream_pd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07] +; X64-AVX512-NEXT: vmovntdq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] %arg0 = bitcast double* %a0 to <2 x double>* store <2 x double> %a1, <2 x double>* %arg0, align 16, !nontemporal !0 @@ -5979,7 +5975,7 @@ ; X86-AVX512-LABEL: test_mm_stream_si128: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00] +; X86-AVX512-NEXT: vmovntdq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_stream_si128: @@ -5994,7 +5990,7 @@ ; ; X64-AVX512-LABEL: test_mm_stream_si128: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07] +; X64-AVX512-NEXT: vmovntdq %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0 ret void @@ -6465,7 +6461,7 @@ ; ; AVX512-LABEL: test_mm_unpackhi_epi32: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; AVX512-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <4 x i32> @@ -6490,7 +6486,7 @@ ; ; AVX512-LABEL: 
test_mm_unpackhi_epi64: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; AVX512-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[1],xmm1[1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> @@ -6512,7 +6508,7 @@ ; ; AVX512-LABEL: test_mm_unpackhi_pd: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; AVX512-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[1],xmm1[1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> @@ -6584,7 +6580,7 @@ ; ; AVX512-LABEL: test_mm_unpacklo_epi32: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; AVX512-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <4 x i32> @@ -6609,7 +6605,7 @@ ; ; AVX512-LABEL: test_mm_unpacklo_epi64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; AVX512-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> @@ -6631,7 +6627,7 @@ ; ; AVX512-LABEL: test_mm_unpacklo_pd: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; AVX512-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] ; AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> @@ -6651,7 +6647,7 @@ ; ; AVX512-LABEL: test_mm_xor_pd: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x double> %a0 to <4 x i32> %arg1 = bitcast <2 x double> %a1 to <4 x i32> @@ -6673,7 +6669,7 @@ ; ; AVX512-LABEL: test_mm_xor_si128: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1] +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = xor <2 x i64> %a0, %a1 ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -247,7 +247,7 @@ ; X86-AVX512-LABEL: test_x86_sse2_storel_dq: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovlps %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00] +; X86-AVX512-NEXT: vmovq %xmm0, (%eax) ## EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xd6,0x00] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: test_x86_sse2_storel_dq: @@ -262,7 +262,7 @@ ; ; X64-AVX512-LABEL: test_x86_sse2_storel_dq: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovlps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x07] +; X64-AVX512-NEXT: vmovq %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd6,0x07] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1) ret void @@ -408,7 +408,7 @@ ; ; AVX512-LABEL: test_x86_sse2_pshuf_d: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; AVX512-NEXT: vpshufd $27, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0x1b] ; AVX512-NEXT: ## xmm0 = xmm0[3,2,1,0] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -780,8 +780,8 @@ ; ; X86-AVX512-LABEL: test_x86_sse2_packssdw_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI30_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovdqa LCPI30_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI30_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -801,8 +801,8 @@ ; ; X64-AVX512-LABEL: test_x86_sse2_packssdw_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI30_0-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> ) @@ -848,8 +848,8 @@ ; ; X86-AVX512-LABEL: test_x86_sse2_packsswb_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI32_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovdqa LCPI32_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI32_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -869,8 +869,8 @@ ; ; X64-AVX512-LABEL: test_x86_sse2_packsswb_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI32_0-4, kind: reloc_riprel_4byte ; 
X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> , <8 x i16> zeroinitializer) @@ -916,8 +916,8 @@ ; ; X86-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI34_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovdqa LCPI34_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI34_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -937,8 +937,8 @@ ; ; X64-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI34_0-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> zeroinitializer) diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -33,11 +33,17 @@ ; X64-SSE-NEXT: movapd %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test1: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %tmp3 = load <2 x double>, <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 > @@ -71,12 +77,19 @@ ; X64-SSE-NEXT: movaps %xmm1, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test2: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rsi), %xmm1 -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test2: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rsi), %xmm1 +; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test2: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; X64-AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %tmp3 = load <2 x double>, <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 > @@ -96,15 +109,25 @@ ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test3: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: 
vmovaps (%edx), %xmm0 -; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test3: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: vmovaps (%edx), %xmm0 +; X86-AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test3: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX512-NEXT: vmovdqa (%edx), %xmm0 +; X86-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test3: ; X64-SSE: # %bb.0: @@ -113,12 +136,19 @@ ; X64-SSE-NEXT: movaps %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test3: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 -; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test3: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rsi), %xmm0 +; X64-AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test3: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rsi), %xmm0 +; X64-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2] %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2] %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; [#uses=1] @@ -141,12 +171,19 @@ ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test4: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test4: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test4: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test4: ; X64-SSE: # %bb.0: @@ -154,11 +191,17 @@ ; X64-SSE-NEXT: movaps %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test4: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test4: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test4: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp5, <4 x float>* %res ret void @@ -226,13 +269,21 @@ ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl ; -; 
X86-AVX-LABEL: test6: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test6: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test6: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test6: ; X64-SSE: # %bb.0: @@ -240,11 +291,17 @@ ; X64-SSE-NEXT: movaps %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test6: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test6: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rsi), %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test6: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rsi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1] %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp2, <4 x float>* %res @@ -258,11 +315,17 @@ ; SSE-NEXT: movaps %xmm0, 0 ; SSE-NEXT: ret{{[l|q]}} ; -; AVX-LABEL: test7: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, 0 -; AVX-NEXT: ret{{[l|q]}} +; AVX1-LABEL: test7: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, 0 +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: test7: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, 0 +; AVX512-NEXT: ret{{[l|q]}} bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1] shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1] store <4 x float> %2, <4 x float>* null @@ -277,20 +340,30 @@ ; X86-SSE-NEXT: movups x, %xmm0 ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovups x, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovups x, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test8: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovdqu x, %xmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test8: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movups {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups {{.*}}(%rip), %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqu {{.*}}(%rip), %xmm0 +; X64-AVX512-NEXT: retq %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; [#uses=1] %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1) ; [#uses=1] %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2) ; [#uses=1] @@ -309,10 +382,15 @@ ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test9: -; 
X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test9: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test9: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test9: ; X64-SSE: # %bb.0: @@ -340,10 +418,15 @@ ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test10: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test10: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test10: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test10: ; X64-SSE: # %bb.0: @@ -371,20 +454,30 @@ ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test11: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test11: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test11: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test11: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test11: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test11: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test11: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-AVX512-NEXT: retq %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1] %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1] ret <2 x double> %tmp7 @@ -534,13 +627,21 @@ ; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test15: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 -; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test15: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test15: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test15: ; X64-SSE: # %bb.0: # %entry @@ -548,11 +649,17 @@ ; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test15: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test15: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: 
test15: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] +; X64-AVX512-NEXT: retq entry: %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=1] %tmp3 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=1] @@ -570,12 +677,19 @@ ; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test16: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps 96(%eax), %xmm0 -; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps 96(%eax), %xmm0 +; X86-AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test16: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovdqa 96(%eax), %xmm0 +; X86-AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test16: ; X64-SSE: # %bb.0: @@ -583,11 +697,17 @@ ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps 96(%rdi), %xmm0 -; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps 96(%rdi), %xmm0 +; X64-AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa 96(%rdi), %xmm0 +; X64-AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; X64-AVX512-NEXT: retq %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3 %i6 = load <4 x double>, <4 x double>* %i5, align 32 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> @@ -610,8 +730,8 @@ ; ; X86-AVX512-LABEL: test17: ; X86-AVX512: # %bb.0: # %entry -; X86-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test17: @@ -628,8 +748,8 @@ ; ; X64-AVX512-LABEL: test17: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rax) +; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rax) ; X64-AVX512-NEXT: retq entry: %0 = insertelement <4 x i32> undef, i32 undef, i32 1 @@ -678,11 +798,17 @@ ; X86-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: PR19721: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: PR19721: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: PR19721: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: PR19721: ; X64-SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll --- 
a/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,X86-SSE -; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX -; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1 +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,X64-SSE -; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX -; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse3-builtins.c @@ -134,21 +134,32 @@ ; X86-SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test_mm_loaddup_pd: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_mm_loaddup_pd: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_loaddup_pd: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpbroadcastq (%eax), %xmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test_mm_loaddup_pd: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test_mm_loaddup_pd: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_mm_loaddup_pd: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_loaddup_pd: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; X64-AVX512-NEXT: retq %ld = load double, double* %a0 %res0 = insertelement <2 x double> undef, double %ld, i32 0 %res1 = insertelement <2 x double> %res0, double %ld, i32 1 @@ -161,10 +172,25 @@ ; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE-NEXT: ret{{[l|q]}} ; -; AVX-LABEL: test_mm_movedup_pd: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: ret{{[l|q]}} +; X86-AVX1-LABEL: test_mm_movedup_pd: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_movedup_pd: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_mm_movedup_pd: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovddup 
{{.*#+}} xmm0 = xmm0[0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_movedup_pd: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 +; X64-AVX512-NEXT: retq %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> zeroinitializer ret <2 x double> %res } diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -31,10 +31,15 @@ ; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE-NEXT: ret{{[l|q]}} ; -; AVX-LABEL: test_mm_blend_pd: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: ret{{[l|q]}} +; AVX1-LABEL: test_mm_blend_pd: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: test_mm_blend_pd: +; AVX512: # %bb.0: +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> ret <2 x double> %res } @@ -45,10 +50,15 @@ ; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; SSE-NEXT: ret{{[l|q]}} ; -; AVX-LABEL: test_mm_blend_ps: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX-NEXT: ret{{[l|q]}} +; AVX1-LABEL: test_mm_blend_ps: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: test_mm_blend_ps: +; AVX512: # %bb.0: +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX512-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %res } @@ -444,10 +454,15 @@ ; SSE-NEXT: extractps $1, %xmm0, %eax ; SSE-NEXT: ret{{[l|q]}} ; -; AVX-LABEL: test_mm_extract_epi32: -; AVX: # %bb.0: -; AVX-NEXT: vextractps $1, %xmm0, %eax -; AVX-NEXT: ret{{[l|q]}} +; AVX1-LABEL: test_mm_extract_epi32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: test_mm_extract_epi32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrd $1, %xmm0, %eax +; AVX512-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %ext = extractelement <4 x i32> %arg0, i32 1 ret i32 %ext @@ -460,11 +475,17 @@ ; X86-SSE-NEXT: extractps $3, %xmm0, %edx ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test_mm_extract_epi64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vextractps $2, %xmm0, %eax -; X86-AVX-NEXT: vextractps $3, %xmm0, %edx -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_mm_extract_epi64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vextractps $2, %xmm0, %eax +; X86-AVX1-NEXT: vextractps $3, %xmm0, %edx +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_extract_epi64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vpextrd $2, %xmm0, %eax +; X86-AVX512-NEXT: vpextrd $3, %xmm0, %edx +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test_mm_extract_epi64: ; X64-SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll @@ -16,11 +16,17 @@ ; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX-LABEL: test_x86_sse41_blendpd: -; AVX: ## %bb.0: -; AVX-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03] -; AVX-NEXT: ## xmm0 = 
xmm0[0,1],xmm1[2,3] -; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse41_blendpd: +; AVX1: ## %bb.0: +; AVX1-NEXT: vblendps $3, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03] +; AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse41_blendpd: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpblendd $3, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x03] +; AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -34,11 +40,17 @@ ; SSE-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX-LABEL: test_x86_sse41_blendps: -; AVX: ## %bb.0: -; AVX-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] -; AVX-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] -; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse41_blendps: +; AVX1: ## %bb.0: +; AVX1-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] +; AVX1-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse41_blendps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpblendd $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x08] +; AVX512-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -155,8 +155,8 @@ ; ; X86-AVX512-LABEL: test_x86_sse41_packusdw_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] -; X86-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovdqa LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] +; X86-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -176,8 +176,8 @@ ; ; X64-AVX512-LABEL: test_x86_sse41_packusdw_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] -; X64-AVX512-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0] +; X64-AVX512-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: LCPI7_0-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> ) diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -267,9 +267,9 @@ ; X86-AVX512-LABEL: ext_2: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX512-NEXT: vpshufd $255, %xmm0, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x70,0xc0,0xff] ; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] -; X86-AVX512-NEXT: vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24] +; X86-AVX512-NEXT: vmovd %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0x04,0x24] ; X86-AVX512-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] ; X86-AVX512-NEXT: popl %eax ## encoding: [0x58] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -288,7 +288,7 @@ ; ; X64-AVX512-LABEL: ext_2: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X64-AVX512-NEXT: vpshufd $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0xff] ; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %s = extractelement <4 x float> %v, i32 3 @@ -308,7 +308,7 @@ ; ; AVX512-LABEL: ext_3: ; AVX512: ## %bb.0: -; AVX512-NEXT: vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03] +; AVX512-NEXT: vpextrd $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x16,0xc0,0x03] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %i = extractelement <4 x i32> %v, i32 3 ret i32 %i @@ -359,9 +359,9 @@ ; ; X86-AVX512-LABEL: blendps_not_insertps_1: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] +; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x4c,0x24,0x04] ; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; X86-AVX512-NEXT: vpblendd $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x01] ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -371,11 +371,17 @@ ; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; -; X64-AVX-LABEL: blendps_not_insertps_1: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] -; X64-AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] -; X64-AVX-NEXT: retq ## encoding: [0xc3] +; X64-AVX1-LABEL: blendps_not_insertps_1: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: blendps_not_insertps_1: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpblendd $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x01] +; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 ret <4 x float> %tmp1 } @@ -438,11 +444,17 @@ ; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX-LABEL: blendps_not_insertps_2: -; AVX: ## %bb.0: -; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] -; AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; AVX1-LABEL: blendps_not_insertps_2: +; AVX1: ## %bb.0: +; AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: blendps_not_insertps_2: +; AVX512: ## %bb.0: 
+; AVX512-NEXT: vpblendd $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x01] +; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %tmp2 = extractelement <4 x float> %t2, i32 0 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0 ret <4 x float> %tmp1 @@ -654,8 +666,8 @@ ; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32: ; X86-AVX512: ## %bb.0: ## %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08] -; X86-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] +; X86-AVX512-NEXT: vpbroadcastd (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08] +; X86-AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -677,8 +689,8 @@ ; ; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32: ; X64-AVX512: ## %bb.0: ## %entry -; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f] -; X64-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] +; X64-AVX512-NEXT: vpbroadcastd (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x0f] +; X64-AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] entry: @@ -706,9 +718,9 @@ ; ; AVX512-LABEL: insertps_from_shufflevector_i32_2: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; AVX512-NEXT: vpshufd $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xee] ; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] -; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] +; AVX512-NEXT: vpblendd $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x02] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] entry: @@ -819,8 +831,8 @@ ; ; AVX512-LABEL: shuf_XYZ0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %x, i32 0 @@ -1045,8 +1057,8 @@ ; ; AVX512-LABEL: i32_shuf_XYZ0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 @@ -1107,10 +1119,10 @@ ; ; AVX512-LABEL: 
i32_shuf_XYY0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] +; AVX512-NEXT: vpshufd $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0xd4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 @@ -1143,10 +1155,10 @@ ; ; AVX512-LABEL: i32_shuf_XYW0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4] +; AVX512-NEXT: vpshufd $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0xf4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,3,3] -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 @@ -1180,10 +1192,10 @@ ; ; AVX512-LABEL: i32_shuf_W00W: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; AVX512-NEXT: vpshufd $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc0,0xff] ; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpblendd $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x06] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 3 @@ -1219,11 +1231,11 @@ ; ; AVX512-LABEL: i32_shuf_X00A: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] -; AVX512-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] -; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] -; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] +; AVX512-NEXT: vpblendw $3, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0e,0xc0,0x03] +; AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX512-NEXT: vpbroadcastd %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc9] +; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} 
## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 @@ -1255,9 +1267,9 @@ ; ; AVX512-LABEL: i32_shuf_X00X: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xc0] +; AVX512-NEXT: vpblendd $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x06] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 @@ -1779,9 +1791,9 @@ ; X86-AVX512-LABEL: insertps_with_undefs: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08] +; X86-AVX512-NEXT: vmovd (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x08] ; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] +; X86-AVX512-NEXT: vpunpcklqdq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0] ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -1804,9 +1816,9 @@ ; ; X64-AVX512-LABEL: insertps_with_undefs: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f] +; X64-AVX512-NEXT: vmovd (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x0f] ; X64-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero -; X64-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] +; X64-AVX512-NEXT: vpunpcklqdq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0] ; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = load float, float* %b, align 4 @@ -1892,11 +1904,11 @@ ; X86-AVX512-LABEL: insertps_pr20411: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X86-AVX512-NEXT: vpshufd $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xee] ; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] -; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] +; X86-AVX512-NEXT: vpblendd $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x02] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512-NEXT: vmovdqu %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_pr20411: @@ -1919,11 +1931,11 @@ ; ; X64-AVX512-LABEL: insertps_pr20411: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X64-AVX512-NEXT: vpshufd $238, %xmm1, %xmm1 ## EVEX TO VEX Compression 
encoding: [0xc5,0xf9,0x70,0xc9,0xee] ; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] -; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] +; X64-AVX512-NEXT: vpblendd $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x02] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> %ptrcast = bitcast i32* %RET to <4 x i32>* @@ -2133,8 +2145,8 @@ ; ; AVX512-LABEL: build_vector_to_shuffle_1: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpblendd $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x0a] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 @@ -2161,8 +2173,8 @@ ; ; AVX512-LABEL: build_vector_to_shuffle_2: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpblendd $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x02] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll b/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll --- a/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll @@ -12,7 +12,7 @@ define <32 x i16> @stack_fold_cvtne2ps2bf16(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -27,7 +27,7 @@ define <32 x i16> @stack_fold_cvtne2ps2bf16_mask(<16 x float> %a0, <16 x float> %a1, <32 x i16>* %passthru, i32 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -48,7 +48,7 @@ define <32 x i16> @stack_fold_cvtne2ps2bf16_maskz(<16 x float> %a0, <16 x float> %a1, i32 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -65,7 +65,7 @@ define <16 x i16> @stack_fold_cvtneps2bf16(<16 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtneps2bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -80,7 +80,7 @@ define <16 x i16> @stack_fold_cvtneps2bf16_mask(<16 x float> %a0, <16 x i16>* %passthru, i16 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -101,7 +101,7 @@ define <16 x i16> @stack_fold_cvtneps2bf16_maskz(<16 x float> %a0, i16 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -118,7 +118,7 @@ define <16 x float> @stack_fold_vdpbf16ps(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2) { ; CHECK-LABEL: stack_fold_vdpbf16ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -133,7 +133,7 @@ define <16 x float> @stack_fold_vdpbf16ps_mask(<16 x float>* %a0, <16 x i32> %a1, <16 x i32> %a2, <16 x float>* %passthru, i16 %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -154,7 +154,7 @@ define <16 x float> @stack_fold_vdpbf16ps_maskz(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -174,7 +174,7 @@ define <16 x i16> @stack_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -189,7 +189,7 @@ define <16 x i16> @stack_fold_cvtne2ps2bf16_mask_ymm(<8 x float> %a0, <8 x float> %a1, <16 x i16>* %passthru, i16 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -210,7 +210,7 @@ define <16 x i16> @stack_fold_cvtne2ps2bf16_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i16 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -227,7 +227,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_ymm(<8 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; 
CHECK-NEXT: #NO_APP @@ -243,7 +243,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_mask_ymm(<8 x float> %a0, <8 x i16>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -265,7 +265,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_maskz_ymm(<8 x float> %a0, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -283,7 +283,7 @@ define <8 x float> @stack_fold_vdpbf16ps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vdpbf16ps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -298,7 +298,7 @@ define <8 x float> @stack_fold_vdpbf16ps_mask_ymm(<8 x float>* %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x float>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_mask_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -319,7 +319,7 @@ define <8 x float> @stack_fold_vdpbf16ps_maskz_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2, i8* %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -341,7 +341,7 @@ define <8 x i16> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -356,7 +356,7 @@ define <8 x i16> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, <8 x i16>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -377,7 +377,7 @@ define <8 x i16> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -394,7 +394,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -409,7 +409,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, 
<8 x i16>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -429,7 +429,7 @@ define <8 x i16> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -446,7 +446,7 @@ define <4 x float> @stack_fold_vdpbf16ps_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vdpbf16ps_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -461,7 +461,7 @@ define <4 x float> @stack_fold_vdpbf16ps_mask_xmm(<4 x float>* %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x float>* %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_mask_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -483,7 +483,7 @@ define <4 x float> @stack_fold_vdpbf16ps_maskz_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2, i8* %U) { ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_xmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll --- a/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll +++ b/llvm/test/CodeGen/X86/stack-folding-avx512vp2intersect.ll @@ -7,11 +7,11 @@ define void @stack_fold_vp2intersectd(<16 x i32>* %a, <16 x i32> %b, <16 x i1>* nocapture %m0, <16 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload ; CHECK-NEXT: kmovw %k0, (%rsi) ; CHECK-NEXT: kmovw %k1, (%rdx) @@ -31,11 +31,11 @@ define void @stack_fold_vp2intersectq(<8 x i64>* %a, <8 x i64> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: kmovw %k0, %ecx @@ -57,11 +57,11 @@ define void @stack_fold_vp2intersectd_256(<8 x i32>* %a, <8 x i32> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectd_256: 
; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: kmovw %k0, %ecx @@ -83,11 +83,11 @@ define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectq_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload ; CHECK-NEXT: kshiftlw $12, %k0, %k2 ; CHECK-NEXT: kshiftrw $12, %k2, %k2 @@ -113,11 +113,11 @@ define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload ; CHECK-NEXT: kshiftlw $12, %k0, %k2 ; CHECK-NEXT: kshiftrw $12, %k2, %k2 @@ -142,11 +142,11 @@ define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* nocapture %m0, <2 x i1>* nocapture %m1) { ; CHECK-LABEL: stack_fold_vp2intersectq_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload ; CHECK-NEXT: kshiftlw $14, %k0, %k2 ; CHECK-NEXT: kshiftrw $14, %k2, %k2 diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -12,7 +12,7 @@ define <8 x double> @stack_fold_addpd_zmm(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: stack_fold_addpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -26,7 +26,7 @@ define <8 x double> @stack_fold_addpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) { ; CHECK-LABEL: stack_fold_addpd_zmm_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -46,7 +46,7 @@ define <8 x double> @stack_fold_addpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) { ; CHECK-LABEL: stack_fold_addpd_zmm_k_commuted: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -66,7 +66,7 @@ define <8 x double> @stack_fold_addpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_addpd_zmm_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -83,7 +83,7 @@ define <16 x float> @stack_fold_addps_zmm(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_addps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -97,7 +97,7 @@ define <16 x float> @stack_fold_addps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) { ; CHECK-LABEL: stack_fold_addps_zmm_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -117,7 +117,7 @@ define <16 x float> @stack_fold_addps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) { ; CHECK-LABEL: stack_fold_addps_zmm_k_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -137,7 +137,7 @@ define <16 x float> @stack_fold_addps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_addps_zmm_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -154,7 +154,7 @@ define double @stack_fold_addsd(double %a0, double %a1) { ; CHECK-LABEL: stack_fold_addsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -168,7 +168,7 @@ define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_addsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -185,7 +185,7 @@ define float @stack_fold_addss(float %a0, float %a1) { ; CHECK-LABEL: stack_fold_addss: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmovd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -199,7 +199,7 @@ define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_addss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -216,7 +216,7 @@ define <8 x double> 
@stack_fold_andnpd_zmm(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: stack_fold_andnpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -238,7 +238,7 @@ define <16 x float> @stack_fold_andnps_zmm(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_andnps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -260,7 +260,7 @@ define <8 x double> @stack_fold_andpd_zmm(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: stack_fold_andpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -281,7 +281,7 @@ define <16 x float> @stack_fold_andps_zmm(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_andps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -302,7 +302,7 @@ define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: stack_fold_cmppd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -323,10 +323,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $184, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -356,10 +356,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $184, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -387,7 +387,7 @@ define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_cmpps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -408,10 +408,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq 
$184, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -441,10 +441,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $184, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -472,7 +472,7 @@ define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_divsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -489,7 +489,7 @@ define float @stack_fold_divss(float %a0, float %a1) { ; CHECK-LABEL: stack_fold_divss: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmovd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -503,7 +503,7 @@ define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_divss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -520,7 +520,7 @@ define <8 x double> @stack_fold_cvtdq2pd(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -534,7 +534,7 @@ define <8 x double> @stack_fold_cvtudq2pd(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtudq2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -548,7 +548,7 @@ define <8 x float> @stack_fold_cvtpd2ps(<8 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -562,7 +562,7 @@ define <16 x float> @stack_fold_cvtph2ps(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_cvtph2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -581,7 +581,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 0, <16 x i16> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -592,7 +592,7 @@ define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_insertps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -608,7 +608,7 @@ define <8 x double> @stack_fold_maxpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_maxpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -623,7 +623,7 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 { ; CHECK-LABEL: stack_fold_maxpd_zmm_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -637,7 +637,7 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 { ; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -657,7 +657,7 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 { ; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_k_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -677,7 +677,7 @@ define <8 x double> @stack_fold_maxpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 { ; CHECK-LABEL: stack_fold_maxpd_zmm_commutable_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -694,7 +694,7 @@ define <16 x float> @stack_fold_maxps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_maxps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: 
nop ; CHECK-NEXT: #NO_APP @@ -709,7 +709,7 @@ define <16 x float> @stack_fold_maxps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_maxps_zmm_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -723,7 +723,7 @@ define <16 x float> @stack_fold_maxps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 { ; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -743,7 +743,7 @@ define <16 x float> @stack_fold_maxps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 { ; CHECK-LABEL: stack_fold_maxps_zmm_commutable_k_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -763,7 +763,7 @@ define <16 x float> @stack_fold_maxps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 { ; CHECK-LABEL: stack_fold_maxps_zmm_commutable_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -780,7 +780,7 @@ define <8 x double> @stack_fold_minpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_minpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -795,7 +795,7 @@ define <8 x double> @stack_fold_minpd_zmm_commutable(<8 x double> %a0, <8 x double> %a1) #1 { ; CHECK-LABEL: stack_fold_minpd_zmm_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -809,7 +809,7 @@ define <8 x double> @stack_fold_minpd_zmm_commutable_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 { ; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -829,7 +829,7 @@ define <8 x double> @stack_fold_minpd_zmm_commutable_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) #1 { ; CHECK-LABEL: stack_fold_minpd_zmm_commutable_k_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -849,7 +849,7 @@ define <8 x double> @stack_fold_minpd_zmm_commutable_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #1 { ; CHECK-LABEL: stack_fold_minpd_zmm_commutable_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -866,7 +866,7 @@ define <16 x float> @stack_fold_minps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_minps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -881,7 +881,7 @@ define <16 x float> @stack_fold_minps_zmm_commutable(<16 x float> %a0, <16 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_minps_zmm_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -895,7 +895,7 @@ define <16 x float> @stack_fold_minps_zmm_commutable_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 { ; CHECK-LABEL: stack_fold_minps_zmm_commutable_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -915,7 +915,7 @@ define <16 x float> @stack_fold_minps_zmm_commutable_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) #1 { ; CHECK-LABEL: stack_fold_minps_zmm_commutable_k_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -935,7 +935,7 @@ define <16 x float> @stack_fold_minps_zmm_commutable_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #1 { ; CHECK-LABEL: stack_fold_minps_zmm_commutable_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -952,7 +952,7 @@ define <8 x double> @stack_fold_mulpd_zmm(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: stack_fold_mulpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -966,7 +966,7 @@ define <8 x double> @stack_fold_mulpd_zmm_k(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) { ; CHECK-LABEL: stack_fold_mulpd_zmm_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -986,7 +986,7 @@ define <8 x double> @stack_fold_mulpd_zmm_k_commuted(<8 x double> %a0, <8 x double> %a1, i8 %mask, <8 x double>* %passthru) { ; CHECK-LABEL: stack_fold_mulpd_zmm_k_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1006,7 +1006,7 @@ define <8 x double> @stack_fold_mulpd_zmm_kz(<8 x double> %a0, <8 x double> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_mulpd_zmm_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1023,7 +1023,7 @@ define <16 x float> @stack_fold_mulps_zmm(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_mulps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1037,7 +1037,7 @@ define <16 x float> @stack_fold_mulps_zmm_k(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) { ; CHECK-LABEL: stack_fold_mulps_zmm_k: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1057,7 +1057,7 @@ define <16 x float> @stack_fold_mulps_zmm_k_commuted(<16 x float> %a0, <16 x float> %a1, i16 %mask, <16 x float>* %passthru) { ; CHECK-LABEL: stack_fold_mulps_zmm_k_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1077,7 +1077,7 @@ define <16 x float> @stack_fold_mulps_zmm_kz(<16 x float> %a0, <16 x float> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_mulps_zmm_kz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1094,7 +1094,7 @@ define double @stack_fold_mulsd(double %a0, double %a1) { ; CHECK-LABEL: stack_fold_mulsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1108,7 +1108,7 @@ define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_mulsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1125,7 +1125,7 @@ define float @stack_fold_mulss(float %a0, float %a1) { ; CHECK-LABEL: stack_fold_mulss: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmovd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1139,7 +1139,7 @@ define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_mulss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1156,7 +1156,7 @@ define <8 x double> @stack_fold_orpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_orpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1177,7 +1177,7 @@ define <16 x float> @stack_fold_orps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: 
stack_fold_orps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1200,8 +1200,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1221,8 +1221,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1249,8 +1249,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1273,8 +1273,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1301,8 +1301,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1323,7 +1323,7 @@ define <8 x double> @stack_fold_subpd_zmm(<8 x double> %a0, <8 x double> %a1) { ; CHECK-LABEL: stack_fold_subpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1337,7 +1337,7 @@ define <16 x float> @stack_fold_subps_zmm(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_subps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1351,7 +1351,7 @@ define double @stack_fold_subsd(double %a0, double %a1) { ; CHECK-LABEL: stack_fold_subsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovq %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: 
#NO_APP @@ -1365,7 +1365,7 @@ define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_subsd_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1382,7 +1382,7 @@ define float @stack_fold_subss(float %a0, float %a1) { ; CHECK-LABEL: stack_fold_subss: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmovd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1396,7 +1396,7 @@ define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_subss_int: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1413,7 +1413,7 @@ define <8 x double> @stack_fold_xorpd_zmm(<8 x double> %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_xorpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1434,7 +1434,7 @@ define <16 x float> @stack_fold_xorps_zmm(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_xorps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1500,11 +1500,11 @@ define <4 x float> @stack_fold_extracti32x4(<16 x float> %a0) { ; CHECK-LABEL: stack_fold_extracti32x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <16 x float> %a0, <16 x float> undef, <4 x i32> @@ -1515,11 +1515,11 @@ define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0) { ; CHECK-LABEL: stack_fold_extractf64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <8 x double> %a0, <8 x double> undef, <2 x i32> @@ -1530,11 +1530,11 @@ define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0) { ; CHECK-LABEL: stack_fold_extracti32x8: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = shufflevector <16 x float> %a0, <16 x float> undef, <8 x i32> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1544,11 +1544,11 @@ define <4 x double> @stack_fold_extractf64x4(<8 x double> %a0) { ; CHECK-LABEL: stack_fold_extractf64x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill +; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = shufflevector <8 x double> %a0, <8 x double> undef, <4 x i32> %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1558,12 +1558,12 @@ define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_insertf32x8: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> @@ -1573,12 +1573,12 @@ define <8 x double> @stack_fold_insertf64x4(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_insertf64x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> @@ -1588,7 +1588,7 @@ define <8 x double> @stack_fold_insertf64x4_mask(<8 x double>* %passthru, <4 x double> %a0, <4 x double> %a1, 
i8 %mask) { ; CHECK-LABEL: stack_fold_insertf64x4_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1609,7 +1609,7 @@ define <8 x double> @stack_fold_insertf64x4_maskz(<4 x double> %a0, <4 x double> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_insertf64x4_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1627,7 +1627,7 @@ define <16 x float> @stack_fold_vpermt2ps(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { ; CHECK-LABEL: stack_fold_vpermt2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1642,7 +1642,7 @@ define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2) { ; CHECK-LABEL: stack_fold_vpermi2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1656,7 +1656,7 @@ define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, <16 x i32>* %x1, <16 x float> %x2, i16 %mask) { ; CHECK-LABEL: stack_fold_vpermi2ps_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1677,7 +1677,7 @@ define <16 x float> @stack_fold_vpermt2ps_mask(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) { ; CHECK-LABEL: stack_fold_vpermt2ps_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1696,7 +1696,7 @@ define <16 x float> @stack_fold_vpermt2ps_maskz(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) { ; CHECK-LABEL: stack_fold_vpermt2ps_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1715,7 +1715,7 @@ define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermt2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1731,7 +1731,7 @@ define <8 x double> @stack_fold_vpermi2pd(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermi2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1745,7 +1745,7 @@ define <8 x double> @stack_fold_permpd(<8 x double> %a0) { ; CHECK-LABEL: stack_fold_permpd: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1764,7 +1764,7 @@ define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_permpd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1789,7 +1789,7 @@ define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_permpd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1807,7 +1807,7 @@ define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) { ; CHECK-LABEL: stack_fold_permpdvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1826,11 +1826,11 @@ define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_permps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0) @@ -1841,7 +1841,7 @@ define <8 x double> @stack_fold_permilpd_zmm(<8 x double> %a0) { ; CHECK-LABEL: stack_fold_permilpd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1856,7 +1856,7 @@ define <8 x double> @stack_fold_permilpd_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_permilpd_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1878,7 +1878,7 @@ define <8 x double> @stack_fold_permilpd_zmm_maskz(<8 x double> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_permilpd_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1896,7 +1896,7 @@ define <8 x double> @stack_fold_permilpdvar_zmm(<8 x double> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_permilpdvar_zmm: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1911,7 +1911,7 @@ define <8 x double> @stack_fold_permilpdvar_zmm_mask(<8 x double>* %passthru, <8 x double> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_permilpdvar_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1932,7 +1932,7 @@ define <8 x double> @stack_fold_permilpdvar_zmm_maskz(<8 x double> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_permilpdvar_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1949,11 +1949,11 @@ define <16 x float> @stack_fold_permilps_zmm(<16 x float> %a0) { ; CHECK-LABEL: stack_fold_permilps_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1964,7 +1964,7 @@ define <16 x float> @stack_fold_permilps_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_permilps_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1986,7 +1986,7 @@ define <16 x float> @stack_fold_permilps_zmm_maskz(<16 x float> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_permilps_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2004,7 +2004,7 @@ define <16 x float> @stack_fold_permilpsvar_zmm(<16 x float> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_permilpsvar_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2019,7 +2019,7 @@ define <16 x float> @stack_fold_permilpsvar_zmm_mask(<16 x float>* %passthru, <16 x float> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_permilpsvar_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2040,7 +2040,7 @@ define <16 x float> 
@stack_fold_permilpsvar_zmm_maskz(<16 x float> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_permilpsvar_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll @@ -12,7 +12,7 @@ define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_addpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -26,7 +26,7 @@ define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_addpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -40,7 +40,7 @@ define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_addps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -54,7 +54,7 @@ define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_addps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -68,7 +68,7 @@ define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_andnpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -90,7 +90,7 @@ define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_andnpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -112,7 +112,7 @@ define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_andnps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -134,7 +134,7 @@ define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_andnps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -156,7 +156,7 @@ define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_andpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -177,7 +177,7 @@ define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_andpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -198,7 +198,7 @@ define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_andps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -219,7 +219,7 @@ define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_andps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -240,7 +240,7 @@ define i8 @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_cmppd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -259,7 +259,7 @@ define i8 @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_cmppd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -279,7 +279,7 @@ define i8 @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_cmpps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -298,7 +298,7 @@ define i8 @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_cmpps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -317,7 +317,7 @@ define <2 x double> @stack_fold_divpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_divpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -331,7 +331,7 @@ define <4 x double> @stack_fold_divpd_ymm(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_divpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -345,7 +345,7 @@ define <4 x float> @stack_fold_divps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_divps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -359,7 +359,7 @@ define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_divps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -373,7 +373,7 @@ define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -388,7 +388,7 @@ define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -402,7 +402,7 @@ define <2 x double> @stack_fold_cvtudq2pd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtudq2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -417,7 +417,7 @@ define <4 x double> @stack_fold_cvtudq2pd_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_cvtudq2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -431,7 +431,7 @@ define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -445,7 +445,7 @@ define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2ps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -460,7 +460,7 @@ define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_maxpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -475,7 +475,7 @@ define <2 x double> @stack_fold_maxpd_commutable(<2 x double> %a0, <2 x double> %a1) #1 { ; CHECK-LABEL: stack_fold_maxpd_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -489,7 +489,7 @@ define <4 x double> @stack_fold_maxpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_maxpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; 
CHECK-NEXT: #NO_APP @@ -504,7 +504,7 @@ define <4 x double> @stack_fold_maxpd_ymm_commutable(<4 x double> %a0, <4 x double> %a1) #1 { ; CHECK-LABEL: stack_fold_maxpd_ymm_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -518,7 +518,7 @@ define <4 x float> @stack_fold_maxps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_maxps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -533,7 +533,7 @@ define <4 x float> @stack_fold_maxps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_maxps_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -547,7 +547,7 @@ define <8 x float> @stack_fold_maxps_ymm(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_maxps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -562,7 +562,7 @@ define <8 x float> @stack_fold_maxps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_maxps_ymm_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -576,7 +576,7 @@ define <4 x float> @stack_fold_minps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_minps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -591,7 +591,7 @@ define <4 x float> @stack_fold_minps_commutable(<4 x float> %a0, <4 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_minps_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -605,7 +605,7 @@ define <8 x float> @stack_fold_minps_ymm(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: stack_fold_minps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -620,7 +620,7 @@ define <8 x float> @stack_fold_minps_ymm_commutable(<8 x float> %a0, <8 x float> %a1) #1 { ; CHECK-LABEL: stack_fold_minps_ymm_commutable: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -634,7 +634,7 @@ define <2 x double> @stack_fold_mulpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_mulpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -648,7 +648,7 @@ define <4 x double> @stack_fold_mulpd_ymm(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_mulpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -662,7 +662,7 @@ define <4 x float> @stack_fold_mulps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_mulps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -676,7 +676,7 @@ define <8 x float> @stack_fold_mulps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_mulps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -690,7 +690,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_orpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -711,7 +711,7 @@ define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_orpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -732,7 +732,7 @@ define <4 x float> @stack_fold_orps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_orps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -753,7 +753,7 @@ define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_orps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -774,8 +774,8 @@ define <4 x double> @stack_fold_shuff64x2_maskz(<4 x double> %a, <4 x double> %b, i8 %mask) { ; CHECK-LABEL: stack_fold_shuff64x2_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -795,8 +795,8 @@ define <8 x float> @stack_fold_shuff32x4_maskz(<8 x float> %a, <8 x float> %b, i8 %mask) { ; CHECK-LABEL: stack_fold_shuff32x4_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -815,7 +815,7 @@ define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_shufps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -830,7 +830,7 @@ define <4 x float> @stack_fold_shufps_mask(<4 x float>* %passthru, <4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_shufps_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -852,7 +852,7 @@ define <4 x float> @stack_fold_shufps_maskz(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_shufps_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -871,7 +871,7 @@ define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_shufps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -886,7 +886,7 @@ define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_subpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -900,7 +900,7 @@ define <4 x double> @stack_fold_subpd_ymm(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_subpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -914,7 +914,7 @@ define <4 x float> @stack_fold_subps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_subps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -928,7 +928,7 @@ define <8 x float> @stack_fold_subps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_subps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -942,7 +942,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_xorpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -963,7 +963,7 @@ define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: stack_fold_xorpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -984,7 +984,7 @@ define <4 x float> @stack_fold_xorps(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_xorps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1005,7 +1005,7 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_xorps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1026,11 +1026,11 @@ define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_extractf32x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> @@ -1041,11 +1041,11 @@ define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_extractf64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill +; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <2 x i32> @@ -1056,12 +1056,12 @@ define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_insertf32x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> @@ -1071,12 +1071,12 @@ define <4 x double> @stack_fold_insertf64x2(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: stack_fold_insertf64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 
# 16-byte Folded Reload +; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> @@ -1086,7 +1086,7 @@ define <4 x float> @stack_fold_vpermt2ps(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { ; CHECK-LABEL: stack_fold_vpermt2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1100,7 +1100,7 @@ define <4 x float> @stack_fold_vpermi2ps(<4 x i32> %x0, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: stack_fold_vpermi2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1114,7 +1114,7 @@ define <2 x double> @stack_fold_vpermt2pd(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermt2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1128,7 +1128,7 @@ define <2 x double> @stack_fold_vpermi2pd(<2 x i64> %x0, <2 x double> %x1, <2 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermi2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1142,7 +1142,7 @@ define <8 x float> @stack_fold_vpermt2ps_ymm(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { ; CHECK-LABEL: stack_fold_vpermt2ps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1156,7 +1156,7 @@ define <8 x float> @stack_fold_vpermi2ps_ymm(<8 x i32> %x0, <8 x float> %x1, <8 x float> %x2) { ; CHECK-LABEL: stack_fold_vpermi2ps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1170,7 +1170,7 @@ define <4 x double> @stack_fold_vpermt2pd_ymm(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermt2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1184,7 +1184,7 @@ define <4 x double> @stack_fold_vpermi2pd_ymm(<4 x i64> %x0, <4 x double> %x1, <4 x double> %x2) { ; CHECK-LABEL: stack_fold_vpermi2pd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1198,7 +1198,7 @@ 
define <4 x double> @stack_fold_permpd(<4 x double> %a0) { ; CHECK-LABEL: stack_fold_permpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1217,8 +1217,8 @@ define <4 x double> @stack_fold_permpdvar(<4 x i64> %a0, <4 x double> %a1) { ; CHECK-LABEL: stack_fold_permpdvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1238,11 +1238,11 @@ define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_permps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0) @@ -1253,7 +1253,7 @@ define <2 x double> @stack_fold_permilpd(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_permilpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1268,7 +1268,7 @@ define <4 x double> @stack_fold_permilpd_ymm(<4 x double> %a0) { ; CHECK-LABEL: stack_fold_permilpd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1283,7 +1283,7 @@ define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_permilpdvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1298,7 +1298,7 @@ define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_permilpdvar_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1313,11 +1313,11 @@ define <4 x float> @stack_fold_permilps(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_permilps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Folded Reload +; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,2,1,0] ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1328,11 +1328,11 @@ define <8 x float> @stack_fold_permilps_ymm(<8 x float> %a0) { ; CHECK-LABEL: stack_fold_permilps_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4] ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1343,7 +1343,7 @@ define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_permilpsvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1358,7 +1358,7 @@ define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_permilpsvar_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1373,7 +1373,7 @@ define <8 x float> @stack_fold_permilpsvar_ymm_maskz(<8 x float> %a0, <8 x i32> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_permilpsvar_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll @@ -12,7 +12,7 @@ define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_broadcastsd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -30,7 +30,7 @@ define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_broadcastss: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -48,7 +48,7 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_broadcastss_ymm: 
; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -71,7 +71,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -84,7 +84,7 @@ define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_inserti128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -102,7 +102,7 @@ define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_mpsadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -117,7 +117,7 @@ define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -133,7 +133,7 @@ define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -149,7 +149,7 @@ define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -165,7 +165,7 @@ define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -180,7 +180,7 @@ define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -195,7 +195,7 @@ define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_packusdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -210,7 +210,7 @@ define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: 
#APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -225,7 +225,7 @@ define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -239,7 +239,7 @@ define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -253,7 +253,7 @@ define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -267,7 +267,7 @@ define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -282,7 +282,7 @@ define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -297,7 +297,7 @@ define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -312,7 +312,7 @@ define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -327,7 +327,7 @@ define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -341,7 +341,7 @@ define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_palignr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -356,7 +356,7 @@ define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pand: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -373,7 +373,7 @@ define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) { ; 
CHECK-LABEL: stack_fold_pandn: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -391,7 +391,7 @@ define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -410,7 +410,7 @@ define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -429,7 +429,7 @@ define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pblendd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -448,7 +448,7 @@ define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pblendd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -466,7 +466,7 @@ define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) { ; CHECK-LABEL: stack_fold_pblendvb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -481,7 +481,7 @@ define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pblendw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -496,7 +496,7 @@ define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -510,7 +510,7 @@ define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -524,7 +524,7 @@ define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -541,7 +541,7 @@ define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -558,7 +558,7 @@ define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -575,7 +575,7 @@ define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -592,7 +592,7 @@ define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -606,7 +606,7 @@ define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pbroadcastw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -620,7 +620,7 @@ define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -635,7 +635,7 @@ define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -650,7 +650,7 @@ define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -665,7 +665,7 @@ define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -680,7 +680,7 @@ define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -695,7 +695,7 @@ define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: 
#NO_APP @@ -710,7 +710,7 @@ define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -725,7 +725,7 @@ define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pcmpgtw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -740,7 +740,7 @@ define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_perm2i128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -758,7 +758,7 @@ define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_permd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -776,7 +776,7 @@ define <4 x double> @stack_fold_permpd(<4 x double> %a0) { ; CHECK-LABEL: stack_fold_permpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -795,11 +795,11 @@ define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_permps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0) @@ -810,7 +810,7 @@ define <4 x i64> @stack_fold_permq(<4 x i64> %a0) { ; CHECK-LABEL: stack_fold_permq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -828,7 +828,7 @@ define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_phaddd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -843,7 +843,7 @@ define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_phaddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -858,7 +858,7 @@ define <16 x 
i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_phaddw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -873,7 +873,7 @@ define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_phsubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -888,7 +888,7 @@ define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_phsubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -903,7 +903,7 @@ define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_phsubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -918,7 +918,7 @@ define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaddubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -933,7 +933,7 @@ define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -948,7 +948,7 @@ define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -963,7 +963,7 @@ define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -978,7 +978,7 @@ define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -993,7 +993,7 @@ define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1008,7 +1008,7 @@ define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxud: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1023,7 +1023,7 @@ define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1038,7 +1038,7 @@ define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1053,7 +1053,7 @@ define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1068,7 +1068,7 @@ define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1083,7 +1083,7 @@ define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1098,7 +1098,7 @@ define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1113,7 +1113,7 @@ define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1128,7 +1128,7 @@ define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1143,7 +1143,7 @@ define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1158,7 +1158,7 @@ define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1172,7 +1172,7 @@ define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovsxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1186,7 +1186,7 @@ define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1200,7 +1200,7 @@ define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1215,7 +1215,7 @@ define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1231,7 +1231,7 @@ define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1247,7 +1247,7 @@ define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1262,7 +1262,7 @@ define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovzxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1277,7 +1277,7 @@ define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1292,7 +1292,7 @@ define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1308,7 +1308,7 @@ define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1328,7 +1328,7 @@ define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmulhrsw: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1343,7 +1343,7 @@ define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmulhuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1358,7 +1358,7 @@ define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmulhw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1373,7 +1373,7 @@ define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmulld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1387,7 +1387,7 @@ define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmullw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1401,7 +1401,7 @@ define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1419,7 +1419,7 @@ define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_por: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1436,7 +1436,7 @@ define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1451,7 +1451,7 @@ define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1466,7 +1466,7 @@ define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_pshufd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1484,7 +1484,7 @@ define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_vpshufhw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: 
#APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1499,7 +1499,7 @@ define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_vpshuflw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1514,7 +1514,7 @@ define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psignb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1529,7 +1529,7 @@ define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psignd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1544,7 +1544,7 @@ define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psignw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1559,7 +1559,7 @@ define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1574,7 +1574,7 @@ define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1589,7 +1589,7 @@ define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psllvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1604,7 +1604,7 @@ define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psllvd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1619,7 +1619,7 @@ define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1634,7 +1634,7 @@ define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllvq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1649,7 +1649,7 @@ define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, 
<8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1664,7 +1664,7 @@ define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1679,7 +1679,7 @@ define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psravd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1694,7 +1694,7 @@ define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psravd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1709,7 +1709,7 @@ define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1724,7 +1724,7 @@ define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1739,7 +1739,7 @@ define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1754,7 +1754,7 @@ define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrlvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1769,7 +1769,7 @@ define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrlvd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1784,7 +1784,7 @@ define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1799,7 +1799,7 @@ define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlvq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1814,7 +1814,7 @@ define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1829,7 +1829,7 @@ define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1843,7 +1843,7 @@ define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1857,7 +1857,7 @@ define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1871,7 +1871,7 @@ define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1886,7 +1886,7 @@ define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1901,7 +1901,7 @@ define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1916,7 +1916,7 @@ define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1931,7 +1931,7 @@ define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1945,7 +1945,7 @@ define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpckhbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1960,7 +1960,7 @@ define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_punpckhdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1978,7 +1978,7 @@ define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_punpckhqdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1996,7 +1996,7 @@ define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_punpckhwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2011,7 +2011,7 @@ define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpcklbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2026,7 +2026,7 @@ define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_punpckldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2044,7 +2044,7 @@ define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_punpcklqdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2062,7 +2062,7 @@ define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_punpcklwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2077,7 +2077,7 @@ define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pxor: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -14,8 +14,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -35,8 +35,8 @@ ; CHECK: # 
%bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -62,8 +62,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -86,8 +86,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -107,8 +107,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -134,8 +134,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -156,7 +156,7 @@ define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -171,7 +171,7 @@ define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pavgb_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -185,7 +185,7 @@ define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_pavgb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -206,7 +206,7 @@ define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_pavgb_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -227,7 +227,7 @@ define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pavgb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -244,7 +244,7 @@ define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pavgb_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -261,7 +261,7 @@ define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -276,7 +276,7 @@ define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pavgw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -290,7 +290,7 @@ define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_pavgw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -311,7 +311,7 @@ define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_pavgw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -332,7 +332,7 @@ define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pavgw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -349,7 +349,7 @@ define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pavgw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -371,7 +371,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -389,7 +389,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: 
nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -407,7 +407,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq ; zext forces execution domain %1 = zext <16 x i16> %a0 to <16 x i32> @@ -424,7 +424,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq ; zext forces execution domain %1 = zext <8 x i32> %a0 to <8 x i64> @@ -436,7 +436,7 @@ define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_inserti32x8: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -455,7 +455,7 @@ define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_inserti64x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -474,7 +474,7 @@ define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -492,8 +492,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -515,7 +515,7 @@ define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) { ; CHECK-LABEL: stack_fold_pabsb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -534,7 +534,7 @@ define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -552,8 +552,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -575,7 
+575,7 @@ define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pabsd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -594,7 +594,7 @@ define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -612,8 +612,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -635,7 +635,7 @@ define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pabsq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -654,7 +654,7 @@ define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -672,8 +672,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -695,7 +695,7 @@ define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) { ; CHECK-LABEL: stack_fold_pabsw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -714,7 +714,7 @@ define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -729,7 +729,7 @@ define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -744,7 +744,7 @@ define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_packusdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -759,7 +759,7 @@ define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_packusdw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -779,7 +779,7 @@ define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_packusdw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -796,7 +796,7 @@ define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -811,7 +811,7 @@ define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -825,7 +825,7 @@ define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddb_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -839,7 +839,7 @@ define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_paddb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -860,7 +860,7 @@ define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_paddb_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -881,7 +881,7 @@ define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_paddb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -898,7 +898,7 @@ define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_paddb_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -915,7 +915,7 @@ define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) 
{ ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -929,7 +929,7 @@ define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_paddd_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -943,7 +943,7 @@ define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_paddd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -964,7 +964,7 @@ define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_paddd_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -985,7 +985,7 @@ define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_paddd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1002,7 +1002,7 @@ define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_paddd_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1019,7 +1019,7 @@ define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1033,7 +1033,7 @@ define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_paddq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1047,7 +1047,7 @@ define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_paddq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1068,7 +1068,7 @@ define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_paddq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1089,7 +1089,7 @@ define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_paddq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1106,7 +1106,7 @@ define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_paddq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1123,7 +1123,7 @@ define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1137,7 +1137,7 @@ define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddsb_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1151,7 +1151,7 @@ define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_paddsb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1172,7 +1172,7 @@ define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_paddsb_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1193,7 +1193,7 @@ define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_paddsb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1210,7 +1210,7 @@ define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_paddsb_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1227,7 +1227,7 @@ define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1241,7 +1241,7 @@ define <32 x i16> 
@stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddsw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1255,7 +1255,7 @@ define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_paddsw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1276,7 +1276,7 @@ define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_paddsw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1297,7 +1297,7 @@ define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_paddsw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1314,7 +1314,7 @@ define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_paddsw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1331,7 +1331,7 @@ define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1345,7 +1345,7 @@ define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddusb_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1359,7 +1359,7 @@ define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_paddusb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1380,7 +1380,7 @@ define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) { ; CHECK-LABEL: stack_fold_paddusb_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1401,7 +1401,7 @@ define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; 
CHECK-LABEL: stack_fold_paddusb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1418,7 +1418,7 @@ define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_paddusb_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1435,7 +1435,7 @@ define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1449,7 +1449,7 @@ define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddusw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1463,7 +1463,7 @@ define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_paddusw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1484,7 +1484,7 @@ define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_paddusw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1505,7 +1505,7 @@ define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_paddusw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1522,7 +1522,7 @@ define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_paddusw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1539,7 +1539,7 @@ define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1553,7 +1553,7 @@ define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1567,7 +1567,7 @@ define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_paddw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1588,7 +1588,7 @@ define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_paddw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -1609,7 +1609,7 @@ define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_paddw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1626,7 +1626,7 @@ define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_paddw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1645,8 +1645,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1666,8 +1666,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1693,8 +1693,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1715,11 +1715,11 @@ define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pandd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpandd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte 
Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a0, %a1 @@ -1729,11 +1729,11 @@ define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pandd_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpandd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a1, %a0 @@ -1743,14 +1743,14 @@ define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pandd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpandd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a0, %a1 @@ -1764,14 +1764,14 @@ define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pandd_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpandd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a1, 
%a0 @@ -1785,12 +1785,12 @@ define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pandd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpandd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a0, %a1 @@ -1802,12 +1802,12 @@ define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pandd_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpandd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <16 x i32> %a1, %a0 @@ -1819,11 +1819,11 @@ define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pandq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, %a1 @@ -1833,11 +1833,11 @@ define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pandq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a1, %a0 @@ -1847,14 +1847,14 @@ define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pandq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, %a1 @@ -1868,14 +1868,14 @@ define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pandq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a1, %a0 @@ -1889,12 +1889,12 @@ define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pandq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a0, %a1 @@ -1906,12 +1906,12 @@ define <8 x i64> 
@stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pandq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = and <8 x i64> %a1, %a0 @@ -1923,7 +1923,7 @@ define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpconflictd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1938,7 +1938,7 @@ define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpconflictq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1953,7 +1953,7 @@ define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1970,7 +1970,7 @@ define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1988,7 +1988,7 @@ define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2006,7 +2006,7 @@ define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2025,10 +2025,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $184, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2057,10 +2057,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $184, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2089,10 +2089,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $184, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 192 -; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2121,8 +2121,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2149,7 +2149,7 @@ define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_permbvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2164,7 +2164,7 @@ define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_permbvar_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2185,7 +2185,7 @@ define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_permbvar_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2202,7 +2202,7 @@ define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_permd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2221,7 +2221,7 @@ define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 
x i8> %x1, <64 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermi2b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2235,7 +2235,7 @@ define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermi2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2249,7 +2249,7 @@ define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermi2q: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2263,7 +2263,7 @@ define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermi2w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2277,7 +2277,7 @@ define <8 x i64> @stack_fold_permq(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_permq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2296,7 +2296,7 @@ define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_permq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2321,7 +2321,7 @@ define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_permq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2339,7 +2339,7 @@ define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_permqvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2358,7 +2358,7 @@ define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_permqvar_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2382,7 +2382,7 @@ define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermt2b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: 
#NO_APP @@ -2397,7 +2397,7 @@ define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermt2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2412,7 +2412,7 @@ define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermt2q: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2427,7 +2427,7 @@ define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermt2w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2442,7 +2442,7 @@ define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_permwvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2457,7 +2457,7 @@ define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_permwvar_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2478,7 +2478,7 @@ define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_permwvar_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2762,7 +2762,7 @@ define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_vplzcntd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2776,7 +2776,7 @@ define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_vplzcntq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2790,7 +2790,7 @@ define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaddubsw_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2805,7 +2805,7 @@ define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2826,7 +2826,7 @@ define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2843,7 +2843,7 @@ define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaddwd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2858,7 +2858,7 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2872,7 +2872,7 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2893,7 +2893,7 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2914,7 +2914,7 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2931,7 +2931,7 @@ define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2948,7 +2948,7 @@ define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2963,7 +2963,7 @@ define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxsb_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2978,7 +2978,7 @@ define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3000,7 +3000,7 @@ define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3022,7 +3022,7 @@ define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pmaxsb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3040,7 +3040,7 @@ define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3058,7 +3058,7 @@ define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3073,7 +3073,7 @@ define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxsd_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3088,7 +3088,7 @@ define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3110,7 +3110,7 @@ define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3132,7 +3132,7 @@ define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaxsd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3150,7 +3150,7 @@ define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x 
i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3168,7 +3168,7 @@ define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3183,7 +3183,7 @@ define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxsq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3198,7 +3198,7 @@ define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3220,7 +3220,7 @@ define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3242,7 +3242,7 @@ define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxsq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3260,7 +3260,7 @@ define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3278,7 +3278,7 @@ define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3293,7 +3293,7 @@ define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxsw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3308,7 +3308,7 @@ define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3330,7 +3330,7 @@ define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3352,7 +3352,7 @@ define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmaxsw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3370,7 +3370,7 @@ define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3388,7 +3388,7 @@ define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3403,7 +3403,7 @@ define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxub_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3418,7 +3418,7 @@ define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxub_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3440,7 +3440,7 @@ define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxub_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3462,7 +3462,7 @@ define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pmaxub_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3480,7 +3480,7 @@ define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3498,7 
+3498,7 @@ define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3513,7 +3513,7 @@ define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxud_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3528,7 +3528,7 @@ define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxud_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3550,7 +3550,7 @@ define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxud_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3572,7 +3572,7 @@ define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaxud_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3590,7 +3590,7 @@ define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3608,7 +3608,7 @@ define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxuq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3623,7 +3623,7 @@ define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxuq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3638,7 +3638,7 @@ define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxuq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3660,7 +3660,7 @@ define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3682,7 +3682,7 @@ define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3700,7 +3700,7 @@ define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3718,7 +3718,7 @@ define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3733,7 +3733,7 @@ define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxuw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3748,7 +3748,7 @@ define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxuw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3770,7 +3770,7 @@ define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3792,7 +3792,7 @@ define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmaxuw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3810,7 +3810,7 @@ define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3828,7 +3828,7 @@ define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3843,7 +3843,7 @@ define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminsb_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3858,7 +3858,7 @@ define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pminsb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3880,7 +3880,7 @@ define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pminsb_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3902,7 +3902,7 @@ define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pminsb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3920,7 +3920,7 @@ define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pminsb_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3938,7 +3938,7 @@ define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3953,7 +3953,7 @@ define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminsd_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3968,7 +3968,7 @@ define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pminsd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3990,7 +3990,7 @@ define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pminsd_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4012,7 +4012,7 @@ define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x 
i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pminsd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4030,7 +4030,7 @@ define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pminsd_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4048,7 +4048,7 @@ define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4063,7 +4063,7 @@ define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminsq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4078,7 +4078,7 @@ define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pminsq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4100,7 +4100,7 @@ define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pminsq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4122,7 +4122,7 @@ define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pminsq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4140,7 +4140,7 @@ define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pminsq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4158,7 +4158,7 @@ define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4173,7 +4173,7 @@ define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminsw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4188,7 +4188,7 @@ define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pminsw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4210,7 +4210,7 @@ define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pminsw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4232,7 +4232,7 @@ define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pminsw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4250,7 +4250,7 @@ define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pminsw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4268,7 +4268,7 @@ define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4283,7 +4283,7 @@ define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminub_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4298,7 +4298,7 @@ define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pminub_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4320,7 +4320,7 @@ define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) { ; CHECK-LABEL: stack_fold_pminub_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4342,7 +4342,7 @@ define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pminub_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4360,7 +4360,7 @@ 
define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pminub_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4378,7 +4378,7 @@ define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4393,7 +4393,7 @@ define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminud_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4408,7 +4408,7 @@ define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pminud_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4430,7 +4430,7 @@ define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_pminud_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4452,7 +4452,7 @@ define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pminud_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4470,7 +4470,7 @@ define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pminud_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4488,7 +4488,7 @@ define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminuq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4503,7 +4503,7 @@ define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminuq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4518,7 +4518,7 @@ define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pminuq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4540,7 +4540,7 @@ define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_pminuq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4562,7 +4562,7 @@ define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pminuq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4580,7 +4580,7 @@ define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pminuq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4598,7 +4598,7 @@ define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4613,7 +4613,7 @@ define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminuw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4628,7 +4628,7 @@ define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pminuw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4650,7 +4650,7 @@ define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) { ; CHECK-LABEL: stack_fold_pminuw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4672,7 +4672,7 @@ define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pminuw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4690,7 +4690,7 @@ define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pminuw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4712,7 +4712,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) @@ -4728,7 +4728,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -4739,7 +4739,7 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_movq_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4762,7 +4762,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = trunc <8 x i64> %a0 to <8 x i32> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -4777,7 +4777,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) @@ -4793,7 +4793,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = trunc <32 x i16> %a0 to <32 x i8> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -4808,7 +4808,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) @@ -4824,7 +4824,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -4839,7 +4839,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -4854,7 +4854,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) @@ -4870,7 +4870,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -4881,7 +4881,7 @@ define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4895,7 +4895,7 @@ define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4910,7 +4910,7 @@ define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbw_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4924,7 +4924,7 @@ define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovsxdq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4938,7 +4938,7 @@ define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4952,7 +4952,7 @@ define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4966,7 +4966,7 @@ define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4983,7 +4983,7 @@ define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5004,7 +5004,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1) @@ -5020,7 +5020,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -5035,7 +5035,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -5050,7 +5050,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1) @@ -5066,7 
+5066,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: retq %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1) %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -5077,7 +5077,7 @@ define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5092,7 +5092,7 @@ define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5108,7 +5108,7 @@ define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbw_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5123,7 +5123,7 @@ define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovzxdq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5138,7 +5138,7 @@ define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5153,7 +5153,7 @@ define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5168,7 +5168,7 @@ define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5186,7 +5186,7 @@ define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5204,7 +5204,7 @@ define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) { ; 
CHECK-LABEL: stack_fold_pmulld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5218,7 +5218,7 @@ define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmulld_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5232,7 +5232,7 @@ define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pmulld_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5253,7 +5253,7 @@ define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pmulld_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5274,7 +5274,7 @@ define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmulld_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5291,7 +5291,7 @@ define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmulld_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5308,7 +5308,7 @@ define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmullq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5322,7 +5322,7 @@ define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmullq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5336,7 +5336,7 @@ define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pmullq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5357,7 +5357,7 @@ define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pmullq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5378,7 +5378,7 @@ define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmullq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5395,7 +5395,7 @@ define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmullq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5412,7 +5412,7 @@ define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmullw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5426,7 +5426,7 @@ define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmullw_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5440,7 +5440,7 @@ define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_pmullw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5461,7 +5461,7 @@ define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_pmullw_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5482,7 +5482,7 @@ define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmullw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5499,7 +5499,7 @@ define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pmullw_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5516,7 +5516,7 @@ define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmuldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP 
@@ -5534,7 +5534,7 @@ define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmuldq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5552,7 +5552,7 @@ define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuldq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5577,7 +5577,7 @@ define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuldq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5602,7 +5602,7 @@ define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuldq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5623,7 +5623,7 @@ define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5647,7 +5647,7 @@ define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5663,7 +5663,7 @@ define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmuludq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5679,7 +5679,7 @@ define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuludq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5702,7 +5702,7 @@ define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuludq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -5725,7 +5725,7 @@ define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 
%mask) { ; CHECK-LABEL: stack_fold_pmuludq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5744,7 +5744,7 @@ define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5763,7 +5763,7 @@ define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpopcntd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5778,7 +5778,7 @@ define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpopcntq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -5793,11 +5793,11 @@ define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pord: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a0, %a1 @@ -5807,11 +5807,11 @@ define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pord_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a1, %a0 @@ -5821,14 +5821,14 @@ define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pord_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop 
; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpord {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a0, %a1 @@ -5842,14 +5842,14 @@ define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pord_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpord {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a1, %a0 @@ -5863,12 +5863,12 @@ define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pord_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a0, %a1 @@ -5880,12 +5880,12 @@ define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pord_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <16 x i32> %a1, %a0 @@ -5897,11 +5897,11 @@ define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_porq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a0, %a1 @@ -5911,11 +5911,11 @@ define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_porq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a1, %a0 @@ -5925,14 +5925,14 @@ define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_porq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a0, %a1 @@ -5946,14 +5946,14 @@ define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_porq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a1, %a0 @@ -5967,12 +5967,12 @@ define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_porq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a0, %a1 @@ -5984,12 +5984,12 @@ define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_porq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = or <8 x i64> %a1, %a0 @@ -6001,7 +6001,7 @@ define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6016,7 +6016,7 @@ define <8 x i64> @stack_fold_psadbw_commute(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6030,7 +6030,7 @@ define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: 
stack_fold_pshufb_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6045,7 +6045,7 @@ define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pshufb_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6065,7 +6065,7 @@ define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_pshufb_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6082,7 +6082,7 @@ define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_pshufd_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6102,8 +6102,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6124,7 +6124,7 @@ define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufd_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6142,7 +6142,7 @@ define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshufhw_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6159,8 +6159,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6181,7 +6181,7 @@ define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6199,7 +6199,7 @@ define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshuflw_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6216,8 +6216,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6238,7 +6238,7 @@ define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6256,7 +6256,7 @@ define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6271,7 +6271,7 @@ define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pslld_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6291,7 +6291,7 @@ define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pslld_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6308,7 +6308,7 @@ define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_pslldi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6323,7 +6323,7 @@ define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pslldi_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6343,7 +6343,7 @@ define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pslldi_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6360,7 +6360,7 @@ define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: stack_fold_pslldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6375,7 +6375,7 @@ define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: 
stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6390,7 +6390,7 @@ define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_psllqi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6405,7 +6405,7 @@ define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_psllvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6420,7 +6420,7 @@ define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_psllvd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6440,7 +6440,7 @@ define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_psllvd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6457,7 +6457,7 @@ define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6472,7 +6472,7 @@ define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllvw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6487,7 +6487,7 @@ define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6502,7 +6502,7 @@ define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) { ; CHECK-LABEL: stack_fold_psllwi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6517,7 +6517,7 @@ define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6532,7 +6532,7 @@ define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_psradi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6547,7 +6547,7 @@ define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psraq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6562,7 +6562,7 @@ define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_psraqi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6577,7 +6577,7 @@ define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_psravd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6592,7 +6592,7 @@ define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_psravq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6607,7 +6607,7 @@ define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_psravw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6622,7 +6622,7 @@ define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6637,7 +6637,7 @@ define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) { ; CHECK-LABEL: stack_fold_psrawi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6652,7 +6652,7 @@ define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6667,7 +6667,7 @@ define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) { ; CHECK-LABEL: stack_fold_psrldi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6682,7 +6682,7 @@ define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: stack_fold_psrldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: 
nop ; CHECK-NEXT: #NO_APP @@ -6697,7 +6697,7 @@ define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6712,7 +6712,7 @@ define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) { ; CHECK-LABEL: stack_fold_psrlqi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6727,7 +6727,7 @@ define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrlvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6742,7 +6742,7 @@ define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6757,7 +6757,7 @@ define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlvw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6772,7 +6772,7 @@ define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6787,7 +6787,7 @@ define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) { ; CHECK-LABEL: stack_fold_psrlwi: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6802,7 +6802,7 @@ define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6816,7 +6816,7 @@ define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6830,7 +6830,7 @@ define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6844,7 +6844,7 @@ define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: 
stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6858,7 +6858,7 @@ define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6872,7 +6872,7 @@ define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6886,7 +6886,7 @@ define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6900,7 +6900,7 @@ define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6916,8 +6916,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6937,8 +6937,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6965,8 +6965,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -6989,8 +6989,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7017,8 +7017,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; 
CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7039,7 +7039,7 @@ define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: stack_fold_ternlogd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7054,7 +7054,7 @@ define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { ; CHECK-LABEL: stack_fold_ternlogq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7070,7 +7070,7 @@ define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpckhbw_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7085,7 +7085,7 @@ define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7107,7 +7107,7 @@ define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -7125,11 +7125,11 @@ define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pxord: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpxord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a0, %a1 @@ -7139,11 +7139,11 @@ define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: stack_fold_pxord_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded 
Reload +; CHECK-NEXT: vpxord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a1, %a0 @@ -7153,14 +7153,14 @@ define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pxord_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpxord {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a0, %a1 @@ -7174,14 +7174,14 @@ define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pxord_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovaps %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpxord {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a1, %a0 @@ -7195,12 +7195,12 @@ define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pxord_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpxord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a0, %a1 @@ -7212,12 +7212,12 @@ define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pxord_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpxord {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <16 x i32> %a1, %a0 @@ -7229,11 +7229,11 @@ define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pxorq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpxorq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a0, %a1 @@ -7243,11 +7243,11 @@ define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-LABEL: stack_fold_pxorq_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: vpxorq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a1, %a0 @@ -7257,14 +7257,14 @@ define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pxorq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: 
kmovd %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpxorq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a0, %a1 @@ -7278,14 +7278,14 @@ define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) { ; CHECK-LABEL: stack_fold_pxorq_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; CHECK-NEXT: vmovapd %zmm0, %zmm1 +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vpxorq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a1, %a0 @@ -7299,12 +7299,12 @@ define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pxorq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpxorq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a0, %a1 @@ -7316,12 +7316,12 @@ define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pxorq_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload +; CHECK-NEXT: vpxorq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = xor <8 x i64> %a1, %a0 diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll @@ -12,8 +12,8 @@ define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: stack_fold_valignd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -29,8 +29,8 @@ define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %passthru, i8 %mask) { ; CHECK-LABEL: stack_fold_valignd_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -52,8 +52,8 @@ define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; CHECK-LABEL: stack_fold_valignd_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -72,8 +72,8 @@ define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: stack_fold_valignq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -89,7 +89,7 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -108,7 +108,7 @@ define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pavgb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -127,7 +127,7 @@ define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -146,7 +146,7 @@ define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pavgw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -165,7 +165,7 @@ define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpconflictd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -179,7 +179,7 @@ define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpconflictd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -193,7 +193,7 @@ define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpconflictq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -207,7 +207,7 @@ define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpconflictq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -226,7 +226,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -244,7 +244,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; zext forces execution domain @@ -257,7 +257,7 @@ define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_inserti32x4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -276,7 +276,7 @@ define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_inserti64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -295,7 +295,7 @@ define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -311,7 +311,7 @@ define <32 x i8> @stack_fold_pabsb_ymm(<32 x i8> %a0) { ; CHECK-LABEL: stack_fold_pabsb_ymm: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -327,7 +327,7 @@ define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -343,7 +343,7 @@ define <8 x i32> @stack_fold_pabsd_ymm(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_pabsd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -359,7 +359,7 @@ define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -375,7 +375,7 @@ define <4 x i64> @stack_fold_pabsq_ymm(<4 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -391,7 +391,7 @@ define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -407,7 +407,7 @@ define <16 x i16> @stack_fold_pabsw_ymm(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_pabsw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -423,7 +423,7 @@ define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -438,7 +438,7 @@ define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_packssdw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -453,7 +453,7 @@ define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -468,7 +468,7 @@ define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_packsswb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -483,7 +483,7 @@ define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_packusdw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -498,7 +498,7 @@ define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_packusdw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -513,7 +513,7 @@ define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -528,7 +528,7 @@ define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_packuswb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -543,7 +543,7 @@ define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -557,7 +557,7 @@ define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_paddb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -578,7 +578,7 @@ define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_paddb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -595,7 +595,7 @@ define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -609,7 +609,7 @@ define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %a2, i32 %mask) { ; CHECK-LABEL: stack_fold_paddb_mask_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -630,7 +630,7 @@ define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_paddb_maskz_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -647,7 +647,7 @@ define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -661,7 +661,7 @@ define <8 x i32> @stack_fold_paddd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_paddd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -675,7 +675,7 @@ define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -689,7 +689,7 @@ define <4 x i64> @stack_fold_paddq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_paddq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -703,7 +703,7 @@ define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -718,7 +718,7 @@ define <32 x i8> @stack_fold_paddsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddsb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -733,7 +733,7 @@ define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -748,7 +748,7 @@ define <16 x i16> @stack_fold_paddsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddsw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -763,7 +763,7 @@ define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -778,7 +778,7 @@ define <32 x i8> @stack_fold_paddusb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_paddusb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -793,7 +793,7 @@ define <8 x i16> 
@stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -808,7 +808,7 @@ define <16 x i16> @stack_fold_paddusw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddusw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -823,7 +823,7 @@ define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -837,7 +837,7 @@ define <16 x i16> @stack_fold_paddw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_paddw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -851,8 +851,8 @@ define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_palignr: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -868,8 +868,8 @@ define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %passthru, i32 %mask) { ; CHECK-LABEL: stack_fold_palignr_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -891,8 +891,8 @@ define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_palignr_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -911,7 +911,7 @@ define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -928,7 +928,7 @@ define i8 @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -946,7 
+946,7 @@ define i8 @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -964,7 +964,7 @@ define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -981,8 +981,8 @@ define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_permbvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1002,7 +1002,7 @@ define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_permd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1021,7 +1021,7 @@ define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermi2b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1035,7 +1035,7 @@ define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermi2b_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1049,7 +1049,7 @@ define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermi2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1063,7 +1063,7 @@ define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermi2d_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1077,7 +1077,7 @@ define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermi2q: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1091,7 +1091,7 @@ define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermi2q_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1105,7 +1105,7 @@ define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermi2w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1119,7 +1119,7 @@ define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermi2w_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1133,7 +1133,7 @@ define <4 x i64> @stack_fold_permq(<4 x i64> %a0) { ; CHECK-LABEL: stack_fold_permq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1152,8 +1152,8 @@ define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_permqvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1173,7 +1173,7 @@ define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermt2b: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1188,7 +1188,7 @@ define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; CHECK-LABEL: stack_fold_vpermt2b_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1203,7 +1203,7 @@ define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermt2d: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1218,7 +1218,7 @@ define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { ; CHECK-LABEL: stack_fold_vpermt2d_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1233,7 +1233,7 @@ define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermt2q: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1248,7 +1248,7 @@ define <4 x i64> 
@stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { ; CHECK-LABEL: stack_fold_vpermt2q_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1263,7 +1263,7 @@ define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermt2w: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1278,7 +1278,7 @@ define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; CHECK-LABEL: stack_fold_vpermt2w_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1293,8 +1293,8 @@ define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_permwvar: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1314,7 +1314,7 @@ define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vplzcntd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1328,7 +1328,7 @@ define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_vplzcntd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1342,7 +1342,7 @@ define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_vplzcntq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1356,7 +1356,7 @@ define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) { ; CHECK-LABEL: stack_fold_vplzcntq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1370,7 +1370,7 @@ define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaddubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1385,7 +1385,7 @@ define <8 x i16> @stack_fold_pmaddubsw_mask(<8 x i16>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1406,7 +1406,7 @@ define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1423,7 +1423,7 @@ define <16 x i16> @stack_fold_pmaddubsw_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaddubsw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1438,7 +1438,7 @@ define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(<16 x i16>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1459,7 +1459,7 @@ define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1476,7 +1476,7 @@ define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1491,7 +1491,7 @@ define <4 x i32> @stack_fold_pmaddwd_mask(<4 x i32>* %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1513,7 +1513,7 @@ define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1531,7 +1531,7 @@ define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaddwd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1546,7 +1546,7 @@ define <8 x i32> @stack_fold_pmaddwd_ymm_mask(<8 x i32>* %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1567,7 +1567,7 @@ define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x 
i16> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1584,7 +1584,7 @@ define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1599,7 +1599,7 @@ define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxsb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1614,7 +1614,7 @@ define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1629,7 +1629,7 @@ define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxsd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1644,7 +1644,7 @@ define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1659,7 +1659,7 @@ define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxsq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1674,7 +1674,7 @@ define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1689,7 +1689,7 @@ define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxsw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1704,7 +1704,7 @@ define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1719,7 +1719,7 @@ define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pmaxub_ymm: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1734,7 +1734,7 @@ define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1749,7 +1749,7 @@ define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmaxud_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1764,7 +1764,7 @@ define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxuq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1779,7 +1779,7 @@ define <2 x i64> @stack_fold_pmaxuq_mask(<2 x i64>* %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1801,7 +1801,7 @@ define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1820,7 +1820,7 @@ define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_pmaxuq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1835,7 +1835,7 @@ define <4 x i64> @stack_fold_pmaxuq_ymm_mask(<4 x i64>* %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1857,7 +1857,7 @@ define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1876,7 +1876,7 @@ define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pmaxuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1891,7 +1891,7 @@ define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; 
CHECK-LABEL: stack_fold_pmaxuw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1907,7 +1907,7 @@ define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1922,7 +1922,7 @@ define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminsb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1937,7 +1937,7 @@ define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminsd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1952,7 +1952,7 @@ define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminsd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1967,7 +1967,7 @@ define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1982,7 +1982,7 @@ define <4 x i64> @stack_fold_pminsq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminsq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -1997,7 +1997,7 @@ define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2012,7 +2012,7 @@ define <16 x i16> @stack_fold_pminsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminsw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2027,7 +2027,7 @@ define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2042,7 +2042,7 @@ define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pminub_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2057,7 +2057,7 @@ define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminud: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2072,7 +2072,7 @@ define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pminud_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2087,7 +2087,7 @@ define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminuq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2102,7 +2102,7 @@ define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_pminuq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2117,7 +2117,7 @@ define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2132,7 +2132,7 @@ define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_pminuw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2151,7 +2151,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) @@ -2167,7 +2167,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = trunc <4 x i64> %a0 to <4 x i32> @@ -2183,7 +2183,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = trunc <16 x i16> %a0 to <16 x i8> @@ -2199,7 +2199,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> 
@llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) @@ -2215,7 +2215,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) @@ -2231,7 +2231,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) @@ -2243,7 +2243,7 @@ define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2258,7 +2258,7 @@ define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2273,7 +2273,7 @@ define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2288,7 +2288,7 @@ define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2303,7 +2303,7 @@ define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2318,7 +2318,7 @@ define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovsxbw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2332,7 +2332,7 @@ define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovsxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2347,7 +2347,7 @@ define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovsxdq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2361,7 +2361,7 @@ define <4 x i32> @stack_fold_pmovsxwd(<8 
x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2376,7 +2376,7 @@ define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2390,7 +2390,7 @@ define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2405,7 +2405,7 @@ define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovsxwq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2424,7 +2424,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) @@ -2440,7 +2440,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) @@ -2456,7 +2456,7 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) @@ -2468,7 +2468,7 @@ define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2484,7 +2484,7 @@ define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2500,7 +2500,7 @@ define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2516,7 +2516,7 @@ define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2532,7 +2532,7 @@ define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2548,7 +2548,7 @@ define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) { ; CHECK-LABEL: stack_fold_pmovzxbw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2563,7 +2563,7 @@ define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovzxdq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2579,7 +2579,7 @@ define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pmovzxdq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2594,7 +2594,7 @@ define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2610,7 +2610,7 @@ define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2625,7 +2625,7 @@ define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2641,7 +2641,7 @@ define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pmovzxwq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2657,7 +2657,7 @@ define <4 x i64> @stack_fold_pmovzxwq_maskz_ymm(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2677,7 +2677,7 @@ define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2697,7 +2697,7 @@ define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2717,7 +2717,7 @@ define <4 x i64> @stack_fold_pmuldq_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuldq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2737,7 +2737,7 @@ define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2755,7 +2755,7 @@ define <4 x i64> @stack_fold_pmuludq_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_pmuludq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2773,8 +2773,8 @@ define <4 x i64> @stack_fold_pmuludq_ymm_mask(<4 x i64>* %passthru, <8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuludq_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2800,8 +2800,8 @@ define <4 x i64> @stack_fold_pmuludq_ymm_maskz(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; CHECK-LABEL: stack_fold_pmuludq_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2824,7 +2824,7 @@ define <4 x i32> @stack_fold_vpopcntd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpopcntd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2839,7 +2839,7 @@ define <8 x i32> @stack_fold_vpopcntd_ymm(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_vpopcntd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2854,7 +2854,7 @@ define <2 x i64> @stack_fold_vpopcntq(<2 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpopcntq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2869,7 +2869,7 @@ define <4 x i64> 
@stack_fold_vpopcntq_ymm(<4 x i64> %a0) { ; CHECK-LABEL: stack_fold_vpopcntq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2884,7 +2884,7 @@ define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2899,7 +2899,7 @@ define <2 x i64> @stack_fold_psadbw_commute(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2913,7 +2913,7 @@ define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2928,7 +2928,7 @@ define <4 x i64> @stack_fold_psadbw_ymm_commute(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psadbw_ymm_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2942,7 +2942,7 @@ define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2957,7 +2957,7 @@ define <16 x i8> @stack_fold_pshufb_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2977,7 +2977,7 @@ define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufb_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -2994,7 +2994,7 @@ define <32 x i8> @stack_fold_pshufb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_pshufb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3009,7 +3009,7 @@ define <32 x i8> @stack_fold_pshufb_ymm_mask(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pshufb_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: 
#NO_APP @@ -3029,7 +3029,7 @@ define <32 x i8> @stack_fold_pshufb_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_pshufb_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3046,7 +3046,7 @@ define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) { ; CHECK-LABEL: stack_fold_pshufd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3064,8 +3064,8 @@ define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3085,7 +3085,7 @@ define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3104,7 +3104,7 @@ define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) { ; CHECK-LABEL: stack_fold_pshufd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3122,8 +3122,8 @@ define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufd_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3142,7 +3142,7 @@ define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufd_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3160,7 +3160,7 @@ define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshufhw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3175,8 +3175,8 @@ define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3195,7 +3195,7 @@ define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3213,7 +3213,7 @@ define <16 x i16> @stack_fold_pshufhw_ymm(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshufhw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3228,8 +3228,8 @@ define <16 x i16> @stack_fold_pshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3248,7 +3248,7 @@ define <16 x i16> @stack_fold_pshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3266,7 +3266,7 @@ define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshuflw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3281,8 +3281,8 @@ define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3301,7 +3301,7 @@ define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3319,7 +3319,7 @@ define <16 x i16> @stack_fold_pshuflw_ymm(<16 x i16> %a0) { ; CHECK-LABEL: stack_fold_pshuflw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3334,8 +3334,8 @@ define <16 x i16> @stack_fold_pshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_ymm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3354,7 +3354,7 @@ define <16 x i16> @stack_fold_pshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_ymm_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3372,7 +3372,7 @@ define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3387,7 +3387,7 @@ define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_pslld_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3402,7 +3402,7 @@ define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) { ; CHECK-LABEL: stack_fold_pslldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3417,7 +3417,7 @@ define <32 x i8> @stack_fold_pslldq_ymm(<32 x i8> %a) { ; CHECK-LABEL: stack_fold_pslldq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3432,7 +3432,7 @@ define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3447,7 +3447,7 @@ define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3462,7 +3462,7 @@ define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psllvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3477,7 +3477,7 @@ define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psllvd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3492,7 +3492,7 @@ define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3507,7 +3507,7 @@ define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_psllvq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3522,7 +3522,7 @@ define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllvw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3537,7 +3537,7 @@ define <16 x i16> @stack_fold_psllvw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllvw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3552,7 +3552,7 @@ define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3567,7 +3567,7 @@ define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psllw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3582,7 +3582,7 @@ define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3597,7 +3597,7 @@ define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrad_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3612,7 +3612,7 @@ define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psraq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3627,7 +3627,7 @@ define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psraq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3642,7 +3642,7 @@ define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psravd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3657,7 +3657,7 @@ define <8 
x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psravd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3672,7 +3672,7 @@ define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psravq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3687,7 +3687,7 @@ define <4 x i64> @stack_fold_psravq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_psravq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3702,7 +3702,7 @@ define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psravw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3717,7 +3717,7 @@ define <16 x i16> @stack_fold_psravw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psravw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3732,7 +3732,7 @@ define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3747,7 +3747,7 @@ define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psraw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3762,7 +3762,7 @@ define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3777,8 +3777,8 @@ define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrld_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3794,7 +3794,7 @@ define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) { ; CHECK-LABEL: stack_fold_psrldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: 
#NO_APP @@ -3809,7 +3809,7 @@ define <32 x i8> @stack_fold_psrldq_ymm(<32 x i8> %a) { ; CHECK-LABEL: stack_fold_psrldq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3824,7 +3824,7 @@ define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3839,7 +3839,7 @@ define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3854,7 +3854,7 @@ define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrlvd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3869,7 +3869,7 @@ define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psrlvd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3884,7 +3884,7 @@ define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlvq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3899,7 +3899,7 @@ define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_psrlvq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3914,7 +3914,7 @@ define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlvw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3929,7 +3929,7 @@ define <16 x i16> @stack_fold_psrlvw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlvw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3944,7 +3944,7 @@ define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3959,7 +3959,7 @@ define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> %a0, <8 x i16> %a1) { 
; CHECK-LABEL: stack_fold_psrlw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3974,7 +3974,7 @@ define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -3988,7 +3988,7 @@ define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4002,7 +4002,7 @@ define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4016,7 +4016,7 @@ define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-LABEL: stack_fold_psubd_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4030,7 +4030,7 @@ define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4044,7 +4044,7 @@ define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: stack_fold_psubq_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4058,7 +4058,7 @@ define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4073,7 +4073,7 @@ define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubsb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4088,7 +4088,7 @@ define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4103,7 +4103,7 @@ define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubsw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4118,7 +4118,7 @@ define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4133,7 +4133,7 @@ define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_psubusb_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4148,7 +4148,7 @@ define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4163,7 +4163,7 @@ define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubusw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4178,7 +4178,7 @@ define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4192,7 +4192,7 @@ define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: stack_fold_psubw_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4206,7 +4206,7 @@ define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpckhbw: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4221,7 +4221,7 @@ define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4243,7 +4243,7 @@ define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4261,7 +4261,7 @@ define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: stack_fold_punpckhbw_ymm: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4276,7 +4276,7 @@ define <32 x i8> @stack_fold_punpckhbw_mask_ymm(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_mask_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4298,7 +4298,7 @@ define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4316,8 +4316,8 @@ define <4 x i64> @stack_fold_shufi64x2_maskz(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; CHECK-LABEL: stack_fold_shufi64x2_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -4337,8 +4337,8 @@ define <8 x i32> @stack_fold_shufi32x4_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; CHECK-LABEL: stack_fold_shufi32x4_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512vnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512vnni.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512vnni.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512vnni.ll @@ -7,7 +7,7 @@ define <16 x i32> @stack_fold_vpdpwssd(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -21,7 +21,7 @@ define <16 x i32> @stack_fold_vpdpwssd_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -35,7 +35,7 @@ define <16 x i32> @stack_fold_vpdpwssd_mask(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_vpdpwssd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -55,7 +55,7 @@ define <16 x i32> @stack_fold_vpdpwssd_mask_commuted(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) { ; CHECK-LABEL: 
stack_fold_vpdpwssd_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -75,7 +75,7 @@ define <16 x i32> @stack_fold_vpdpwssd_maskz(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) { ; CHECK-LABEL: stack_fold_vpdpwssd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -93,7 +93,7 @@ define <16 x i32> @stack_fold_vpdpwssd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) { ; CHECK-LABEL: stack_fold_vpdpwssd_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -111,7 +111,7 @@ define <16 x i32> @stack_fold_vpdpwssds(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -125,7 +125,7 @@ define <16 x i32> @stack_fold_vpdpwssds_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -139,7 +139,7 @@ define <16 x i32> @stack_fold_vpdpwssds_mask(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_vpdpwssds_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -159,7 +159,7 @@ define <16 x i32> @stack_fold_vpdpwssds_mask_commuted(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_vpdpwssds_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -179,7 +179,7 @@ define <16 x i32> @stack_fold_vpdpwssds_maskz(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) { ; CHECK-LABEL: stack_fold_vpdpwssds_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -197,7 +197,7 @@ define <16 x i32> @stack_fold_vpdpwssds_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) { ; CHECK-LABEL: stack_fold_vpdpwssds_maskz_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll --- 
a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll @@ -16,7 +16,7 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -30,7 +30,7 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -44,7 +44,7 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -58,7 +58,7 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -72,7 +72,7 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -86,7 +86,7 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -100,7 +100,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -114,7 +114,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -128,7 +128,7 @@ define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -142,7 +142,7 @@ define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x 
i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -157,7 +157,7 @@ define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -171,7 +171,7 @@ define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -186,7 +186,7 @@ define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -200,7 +200,7 @@ define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -215,7 +215,7 @@ define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP @@ -229,7 +229,7 @@ define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll b/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll --- a/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll +++ b/llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll @@ -21,10 +21,10 @@ ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq foo +; CHECK-NEXT: callq foo@PLT ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp @@ -40,10 +40,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovups %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq foo +; CHECK-NEXT: callq foo@PLT ; CHECK-NEXT: .Ltmp1: ; 
CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -66,11 +66,11 @@ ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp -; CHECK-NEXT: vmovaps %ymm0, (%rsp) +; CHECK-NEXT: vmovdqa %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq do_safepoint +; CHECK-NEXT: callq do_safepoint@PLT ; CHECK-NEXT: .Ltmp2: -; CHECK-NEXT: vmovaps (%rsp), %ymm0 +; CHECK-NEXT: vmovdqa (%rsp), %ymm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 @@ -86,11 +86,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: vmovups %ymm0, (%rsp) +; CHECK-NEXT: vmovdqu %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq do_safepoint +; CHECK-NEXT: callq do_safepoint@PLT ; CHECK-NEXT: .Ltmp3: -; CHECK-NEXT: vmovups (%rsp), %ymm0 +; CHECK-NEXT: vmovdqu (%rsp), %ymm0 ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -15,28 +15,57 @@ ; define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind { -; X86-LABEL: test_broadcast_2f64_4f64: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_2f64_4f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_broadcast_2f64_4f64: -; X64: # %bb.0: -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: retq +; X86-AVX2-LABEL: test_broadcast_2f64_4f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_broadcast_2f64_4f64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_2f64_4f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2f64_4f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_2f64_4f64: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: retq %1 = load <2 x double>, <2 x double> *%p %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> ret <4 x double> %2 } define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_2f64_8f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_2f64_8f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_2f64_8f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; 
X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_2f64_8f64: ; X86-AVX512: # %bb.0: @@ -44,11 +73,17 @@ ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_2f64_8f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_2f64_8f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2f64_8f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_2f64_8f64: ; X64-AVX512: # %bb.0: @@ -60,12 +95,19 @@ } define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_4f64_8f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps (%eax), %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4f64_8f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_4f64_8f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_4f64_8f64: ; X86-AVX512: # %bb.0: @@ -73,11 +115,17 @@ ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_4f64_8f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_4f64_8f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4f64_8f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4f64_8f64: ; X64-AVX512: # %bb.0: @@ -89,11 +137,17 @@ } define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_2i64_4i64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_2i64_4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_2i64_4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_2i64_4i64: ; X86-AVX512: # %bb.0: @@ -101,10 +155,15 @@ ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_2i64_4i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_2i64_4i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 
+; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2i64_4i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_2i64_4i64: ; X64-AVX512: # %bb.0: @@ -116,12 +175,19 @@ } define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_2i64_8i64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_2i64_8i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_2i64_8i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_2i64_8i64: ; X86-AVX512: # %bb.0: @@ -129,11 +195,17 @@ ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_2i64_8i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_2i64_8i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2i64_8i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_2i64_8i64: ; X64-AVX512: # %bb.0: @@ -145,12 +217,19 @@ } define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_4i64_8i64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps (%eax), %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4i64_8i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_4i64_8i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_4i64_8i64: ; X86-AVX512: # %bb.0: @@ -158,11 +237,17 @@ ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_4i64_8i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_4i64_8i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i64_8i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4i64_8i64: ; X64-AVX512: # %bb.0: @@ -174,28 +259,57 @@ } define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind { -; X86-LABEL: test_broadcast_4f32_8f32: -; X86: # %bb.0: 
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4f32_8f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_broadcast_4f32_8f32: -; X64: # %bb.0: -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-NEXT: retq +; X86-AVX2-LABEL: test_broadcast_4f32_8f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_broadcast_4f32_8f32: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_4f32_8f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4f32_8f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4f32_8f32: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: retq %1 = load <4 x float>, <4 x float> *%p %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> ret <8 x float> %2 } define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_4f32_16f32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4f32_16f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_4f32_16f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_4f32_16f32: ; X86-AVX512: # %bb.0: @@ -203,11 +317,17 @@ ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_4f32_16f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_4f32_16f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4f32_16f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4f32_16f32: ; X64-AVX512: # %bb.0: @@ -219,12 +339,19 @@ } define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_8f32_16f32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps (%eax), %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_8f32_16f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps (%eax), 
%ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_8f32_16f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_8f32_16f32: ; X86-AVX512: # %bb.0: @@ -232,11 +359,17 @@ ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_8f32_16f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_8f32_16f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_8f32_16f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_8f32_16f32: ; X64-AVX512: # %bb.0: @@ -248,11 +381,17 @@ } define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_4i32_8i32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4i32_8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_4i32_8i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_4i32_8i32: ; X86-AVX512: # %bb.0: @@ -260,10 +399,15 @@ ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_4i32_8i32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_4i32_8i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i32_8i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4i32_8i32: ; X64-AVX512: # %bb.0: @@ -275,12 +419,19 @@ } define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_4i32_16i32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4i32_16i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_4i32_16i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_4i32_16i32: ; X86-AVX512: # %bb.0: @@ -288,11 +439,17 @@ ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_4i32_16i32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: 
vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_4i32_16i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i32_16i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4i32_16i32: ; X64-AVX512: # %bb.0: @@ -304,12 +461,19 @@ } define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_8i32_16i32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps (%eax), %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_8i32_16i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_8i32_16i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_8i32_16i32: ; X86-AVX512: # %bb.0: @@ -317,11 +481,17 @@ ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_8i32_16i32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_8i32_16i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_8i32_16i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_8i32_16i32: ; X64-AVX512: # %bb.0: @@ -333,11 +503,17 @@ } define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_8i16_16i16: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_8i16_16i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_8i16_16i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_8i16_16i16: ; X86-AVX512: # %bb.0: @@ -345,10 +521,15 @@ ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_8i16_16i16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_8i16_16i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_8i16_16i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_8i16_16i16: ; X64-AVX512: # %bb.0: @@ -360,12 +541,19 @@ } define <32 x i16> @test_broadcast_8i16_32i16(<8 x 
i16> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_8i16_32i16: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_8i16_32i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_8i16_32i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_8i16_32i16: ; X86-AVX512: # %bb.0: @@ -373,11 +561,17 @@ ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_8i16_32i16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_8i16_32i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_8i16_32i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_8i16_32i16: ; X64-AVX512: # %bb.0: @@ -389,12 +583,19 @@ } define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_16i16_32i16: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps (%eax), %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_16i16_32i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_16i16_32i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_16i16_32i16: ; X86-AVX512: # %bb.0: @@ -402,11 +603,17 @@ ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_16i16_32i16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_16i16_32i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_16i16_32i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_16i16_32i16: ; X64-AVX512: # %bb.0: @@ -418,11 +625,17 @@ } define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_16i8_32i8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_16i8_32i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 
= mem[0,1,0,1] +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_16i8_32i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_16i8_32i8: ; X86-AVX512: # %bb.0: @@ -430,10 +643,15 @@ ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_16i8_32i8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_16i8_32i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_16i8_32i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_16i8_32i8: ; X64-AVX512: # %bb.0: @@ -445,12 +663,19 @@ } define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_16i8_64i8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_16i8_64i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_16i8_64i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_16i8_64i8: ; X86-AVX512: # %bb.0: @@ -458,11 +683,17 @@ ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_16i8_64i8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_16i8_64i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_16i8_64i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_16i8_64i8: ; X64-AVX512: # %bb.0: @@ -474,12 +705,19 @@ } define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { -; X86-AVX-LABEL: test_broadcast_32i8_64i8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps (%eax), %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_32i8_64i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps (%eax), %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_32i8_64i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_32i8_64i8: ; X86-AVX512: # %bb.0: @@ -487,11 +725,17 @@ ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: 
test_broadcast_32i8_64i8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_32i8_64i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_32i8_64i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_32i8_64i8: ; X64-AVX512: # %bb.0: @@ -507,21 +751,53 @@ ; define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) { -; X86-LABEL: test_broadcast_2f64_4f64_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_2f64_4f64_reuse: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_broadcast_2f64_4f64_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: test_broadcast_2f64_4f64_reuse: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 store <2 x double> %1, <2 x double>* %p1 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> @@ -529,21 +805,53 @@ } define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) { -; X86-LABEL: test_broadcast_2i64_4i64_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_2i64_4i64_reuse: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), 
%ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_broadcast_2i64_4i64_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: test_broadcast_2i64_4i64_reuse: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 store <2 x i64> %1, <2 x i64>* %p1 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> @@ -551,21 +859,53 @@ } define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) { -; X86-LABEL: test_broadcast_4f32_8f32_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4f32_8f32_reuse: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_broadcast_4f32_8f32_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: test_broadcast_4f32_8f32_reuse: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_4f32_8f32_reuse: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps 
(%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4f32_8f32_reuse: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 store <4 x float> %1, <4 x float>* %p1 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> @@ -573,21 +913,53 @@ } define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) { -; X86-LABEL: test_broadcast_4i32_8i32_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4i32_8i32_reuse: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_broadcast_4i32_8i32_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: test_broadcast_4i32_8i32_reuse: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_4i32_8i32_reuse: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i32_8i32_reuse: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x i32> %1, <4 x i32>* %p1 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> @@ -595,43 +967,107 @@ } define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind { -; X86-LABEL: test_broadcast_8i16_16i16_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 
-; X86-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_8i16_16i16_reuse: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_broadcast_8i16_16i16_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq - %1 = load <8 x i16>, <8 x i16> *%p0 - store <8 x i16> %1, <8 x i16>* %p1 - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> - ret <16 x i16> %2 -} - -define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind { -; X86-LABEL: test_broadcast_16i8_32i8_reuse: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX2-LABEL: test_broadcast_8i16_16i16_reuse: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl ; -; X64-LABEL: test_broadcast_16i8_32i8_reuse: -; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovaps %xmm0, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_8i16_16i16_reuse: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_8i16_16i16_reuse: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq + %1 = load <8 x i16>, <8 x i16> *%p0 + store <8 x i16> %1, <8 x i16>* %p1 + %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> + ret <16 x i16> %2 +} + +define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind { +; X86-AVX1-LABEL: test_broadcast_16i8_32i8_reuse: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %xmm0 +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_16i8_32i8_reuse: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; 
X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovaps %xmm0, (%rsi) +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 store <16 x i8> %1, <16 x i8>* %p1 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> @@ -643,36 +1079,52 @@ ; define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) { -; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %xmm1, (%eax) -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4i32_8i32_chain: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %xmm1, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_4i32_8i32_chain: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %xmm1, (%eax) +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX512-NEXT: vmovaps %xmm1, (%eax) +; X86-AVX512-NEXT: vmovdqa %xmm1, (%eax) ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_4i32_8i32_chain: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i32_8i32_chain: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512: # %bb.0: -; 
X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512-NEXT: vmovdqa %xmm1, (%rsi) ; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -681,38 +1133,56 @@ } define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) { -; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %xmm1, (%eax) -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_broadcast_4i32_16i32_chain: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %xmm1, (%eax) +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_broadcast_4i32_16i32_chain: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %xmm1, (%eax) +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X86-AVX512-NEXT: vmovaps %xmm1, (%eax) +; X86-AVX512-NEXT: vmovdqa %xmm1, (%eax) ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_broadcast_4i32_16i32_chain: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_broadcast_4i32_16i32_chain: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512-NEXT: vmovdqa %xmm1, (%rsi) ; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -1058,444 +1528,732 @@ ; define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind { -; X86-LABEL: reg_broadcast_2f64_4f64: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 
-; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_2f64_4f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: reg_broadcast_2f64_4f64: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: reg_broadcast_2f64_4f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: reg_broadcast_2f64_4f64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: reg_broadcast_2f64_4f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_2f64_4f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: reg_broadcast_2f64_4f64: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> ret <4 x double> %1 } define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_2f64_8f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_2f64_8f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_2f64_8f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_2f64_8f64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_2f64_8f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_2f64_8f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_2f64_8f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_2f64_8f64: ; X64-AVX512: # %bb.0: ; 
X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> ret <8 x double> %1 } define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_4f64_8f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_4f64_8f64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_4f64_8f64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_4f64_8f64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_4f64_8f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_4f64_8f64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_4f64_8f64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_4f64_8f64: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> ret <8 x double> %1 } define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind { -; X86-LABEL: reg_broadcast_2i64_4i64: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_2i64_4i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: reg_broadcast_2i64_4i64: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: reg_broadcast_2i64_4i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: reg_broadcast_2i64_4i64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: reg_broadcast_2i64_4i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_2i64_4i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: reg_broadcast_2i64_4i64: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <2 
x i64> %a0, <2 x i64> undef, <4 x i32> ret <4 x i64> %1 } define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_2i64_8i64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_2i64_8i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_2i64_8i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_2i64_8i64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_2i64_8i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_2i64_8i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_2i64_8i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_2i64_8i64: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> ret <8 x i64> %1 } define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_4i64_8i64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_4i64_8i64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_4i64_8i64: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_4i64_8i64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_4i64_8i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_4i64_8i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_4i64_8i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: 
reg_broadcast_4i64_8i64: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> ret <8 x i64> %1 } define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind { -; X86-LABEL: reg_broadcast_4f32_8f32: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_4f32_8f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: reg_broadcast_4f32_8f32: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: reg_broadcast_4f32_8f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: reg_broadcast_4f32_8f32: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: reg_broadcast_4f32_8f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_4f32_8f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: reg_broadcast_4f32_8f32: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> ret <8 x float> %1 } define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_4f32_16f32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_4f32_16f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_4f32_16f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_4f32_16f32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_4f32_16f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_4f32_16f32: +; 
X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_4f32_16f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_4f32_16f32: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> ret <16 x float> %1 } define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_8f32_16f32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_8f32_16f32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_8f32_16f32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_8f32_16f32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_8f32_16f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_8f32_16f32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_8f32_16f32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_8f32_16f32: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> ret <16 x float> %1 } define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind { -; X86-LABEL: reg_broadcast_4i32_8i32: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_4i32_8i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: reg_broadcast_4i32_8i32: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: reg_broadcast_4i32_8i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: reg_broadcast_4i32_8i32: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: reg_broadcast_4i32_8i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_4i32_8i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: reg_broadcast_4i32_8i32: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> ret <8 x i32> %1 } define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_4i32_16i32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_4i32_16i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_4i32_16i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_4i32_16i32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_4i32_16i32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_4i32_16i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_4i32_16i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_4i32_16i32: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> ret <16 x i32> %1 } define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_8i32_16i32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_8i32_16i32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_8i32_16i32: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_8i32_16i32: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-AVX512-NEXT: 
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_8i32_16i32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_8i32_16i32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_8i32_16i32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_8i32_16i32: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> ret <16 x i32> %1 } define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind { -; X86-LABEL: reg_broadcast_8i16_16i16: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_8i16_16i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: reg_broadcast_8i16_16i16: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: reg_broadcast_8i16_16i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: reg_broadcast_8i16_16i16: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: reg_broadcast_8i16_16i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_8i16_16i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: reg_broadcast_8i16_16i16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> ret <16 x i16> %1 } define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_8i16_32i16: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_8i16_32i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_8i16_32i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_8i16_32i16: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; 
X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_8i16_32i16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_8i16_32i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_8i16_32i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_8i16_32i16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> ret <32 x i16> %1 } define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_16i16_32i16: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_16i16_32i16: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_16i16_32i16: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_16i16_32i16: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_16i16_32i16: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_16i16_32i16: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_16i16_32i16: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_16i16_32i16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> ret <32 x i16> %1 } define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind { -; X86-LABEL: reg_broadcast_16i8_32i8: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_16i8_32i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: reg_broadcast_16i8_32i8: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-NEXT: retq +; 
X86-AVX2-LABEL: reg_broadcast_16i8_32i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: reg_broadcast_16i8_32i8: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: reg_broadcast_16i8_32i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_16i8_32i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: reg_broadcast_16i8_32i8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> ret <32 x i8> %1 } define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind { -; X86-AVX-LABEL: reg_broadcast_16i8_64i8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_16i8_64i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_16i8_64i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_16i8_64i8: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_16i8_64i8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_16i8_64i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_16i8_64i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_16i8_64i8: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> ret <64 x i8> %1 } define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind { -; 
X86-AVX-LABEL: reg_broadcast_32i8_64i8: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: reg_broadcast_32i8_64i8: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: reg_broadcast_32i8_64i8: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: reg_broadcast_32i8_64i8: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: reg_broadcast_32i8_64i8: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: reg_broadcast_32i8_64i8: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: reg_broadcast_32i8_64i8: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: reg_broadcast_32i8_64i8: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> ret <64 x i8> %1 @@ -1506,60 +2264,117 @@ ; define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) { -; X86-LABEL: test_2xi32_to_4xi32_mem: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: retl +; X86-AVX1-LABEL: test_2xi32_to_4xi32_mem: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_2xi32_to_4xi32_mem: -; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X64-NEXT: retq +; X86-AVX2-LABEL: test_2xi32_to_4xi32_mem: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq (%eax), %xmm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: test_2xi32_to_4xi32_mem: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpbroadcastq (%eax), %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_2xi32_to_4xi32_mem: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_2xi32_to_4xi32_mem: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_2xi32_to_4xi32_mem: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; X64-AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> ret <4 x i32> %res } define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) { -; X86-LABEL: test_2xi32_to_8xi32_mem: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vbroadcastsd (%eax), %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: test_2xi32_to_8xi32_mem: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastsd (%eax), %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: test_2xi32_to_8xi32_mem: -; X64: # %bb.0: -; X64-NEXT: vbroadcastsd (%rdi), %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: test_2xi32_to_8xi32_mem: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq (%eax), %ymm0 +; X86-AVX2-NEXT: retl +; 
+; X86-AVX512-LABEL: test_2xi32_to_8xi32_mem: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpbroadcastq (%eax), %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: test_2xi32_to_8xi32_mem: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_2xi32_to_8xi32_mem: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_2xi32_to_8xi32_mem: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; X64-AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> ret <8 x i32> %res } define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) { -; X86-AVX-LABEL: test_2xi32_to_16xi32_mem: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm0 -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_2xi32_to_16xi32_mem: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastsd (%eax), %ymm0 +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_2xi32_to_16xi32_mem: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq (%eax), %ymm0 +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vbroadcastsd (%eax), %zmm0 +; X86-AVX512-NEXT: vpbroadcastq (%eax), %zmm0 ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: test_2xi32_to_16xi32_mem: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_2xi32_to_16xi32_mem: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_2xi32_to_16xi32_mem: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastsd (%rdi), %zmm0 +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %zmm0 ; X64-AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> @@ -1571,16 +2386,38 @@ ; define <4 x double> @broadcast_v4f64_f64_u000(double* %p) { -; X86-LABEL: broadcast_v4f64_f64_u000: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vbroadcastsd (%eax), %ymm0 -; X86-NEXT: retl +; X86-AVX1-LABEL: broadcast_v4f64_f64_u000: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastsd (%eax), %ymm0 +; X86-AVX1-NEXT: retl ; -; X64-LABEL: broadcast_v4f64_f64_u000: -; X64: # %bb.0: -; X64-NEXT: vbroadcastsd (%rdi), %ymm0 -; X64-NEXT: retq +; X86-AVX2-LABEL: broadcast_v4f64_f64_u000: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq (%eax), %ymm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: broadcast_v4f64_f64_u000: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpbroadcastq (%eax), %ymm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: broadcast_v4f64_f64_u000: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-AVX1-NEXT: retq 
+; +; X64-AVX2-LABEL: broadcast_v4f64_f64_u000: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: broadcast_v4f64_f64_u000: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; X64-AVX512-NEXT: retq %s = load double, double* %p %vec = insertelement <2 x double> undef, double %s, i32 0 %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> @@ -1588,18 +2425,44 @@ } define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) { -; X86-LABEL: broadcast_v4f64_v2f64_4u61: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1 -; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; X86-NEXT: retl +; X86-AVX1-LABEL: broadcast_v4f64_v2f64_4u61: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1 +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X86-AVX1-NEXT: retl ; -; X64-LABEL: broadcast_v4f64_v2f64_4u61: -; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1 -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; X64-NEXT: retq +; X86-AVX2-LABEL: broadcast_v4f64_v2f64_4u61: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vinserti128 $1, (%eax), %ymm0, %ymm1 +; X86-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: broadcast_v4f64_v2f64_4u61: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vinserti128 $1, (%eax), %ymm0, %ymm1 +; X86-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: broadcast_v4f64_v2f64_4u61: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1 +; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: broadcast_v4f64_v2f64_4u61: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm1 +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: broadcast_v4f64_v2f64_4u61: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm1 +; X64-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; X64-AVX512-NEXT: retq %vec = load <2 x double>, <2 x double>* %vp %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %default @@ -1626,12 +2489,19 @@ } define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) { -; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: broadcast_v8f64_v2f64_u1u10101: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: broadcast_v8f64_v2f64_u1u10101: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101: ; X86-AVX512: # %bb.0: @@ 
-1639,11 +2509,17 @@ ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: broadcast_v8f64_v2f64_u1u10101: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: broadcast_v8f64_v2f64_u1u10101: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101: ; X64-AVX512: # %bb.0: @@ -1655,12 +2531,19 @@ } define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) { -; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: broadcast_v8f64_v2f64_0uuu0101: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: broadcast_v8f64_v2f64_0uuu0101: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X86-AVX2-NEXT: retl ; ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101: ; X86-AVX512: # %bb.0: @@ -1668,11 +2551,17 @@ ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: retl ; -; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: broadcast_v8f64_v2f64_0uuu0101: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: broadcast_v8f64_v2f64_0uuu0101: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX2-NEXT: vmovdqa %ymm0, %ymm1 +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101: ; X64-AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/swap.ll b/llvm/test/CodeGen/X86/swap.ll --- a/llvm/test/CodeGen/X86/swap.ll +++ b/llvm/test/CodeGen/X86/swap.ll @@ -11,20 +11,20 @@ define dso_local void @_Z4SwapP1SS0_(%struct.S* nocapture %a, %struct.S* nocapture %b) local_unnamed_addr { ; NOAA-LABEL: _Z4SwapP1SS0_: ; NOAA: # %bb.0: # %entry -; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; NOAA-NEXT: vmovups (%rsi), %xmm0 -; NOAA-NEXT: vmovups %xmm0, (%rdi) -; NOAA-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; NOAA-NEXT: vmovups %xmm0, (%rsi) +; NOAA-NEXT: vmovdqu (%rdi), %xmm0 +; NOAA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovdqu (%rsi), %xmm0 +; NOAA-NEXT: vmovdqu %xmm0, (%rdi) +; NOAA-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 +; NOAA-NEXT: vmovdqu %xmm0, (%rsi) ; NOAA-NEXT: retq ; ; AA-LABEL: _Z4SwapP1SS0_: ; AA: # %bb.0: # %entry -; AA-NEXT: vmovups (%rdi), %xmm0 -; AA-NEXT: vmovups (%rsi), %xmm1 -; AA-NEXT: vmovups %xmm1, (%rdi) -; AA-NEXT: vmovups %xmm0, (%rsi) +; AA-NEXT: vmovdqu (%rdi), %xmm0 
+; AA-NEXT: vmovdqu (%rsi), %xmm1 +; AA-NEXT: vmovdqu %xmm1, (%rdi) +; AA-NEXT: vmovdqu %xmm0, (%rsi) ; AA-NEXT: retq entry: %tmp.sroa.0 = alloca [16 x i8], align 1 @@ -86,16 +86,16 @@ define dso_local void @twoallocs(i8* nocapture %a, i8* nocapture %b) local_unnamed_addr { ; NOAA-LABEL: twoallocs: ; NOAA: # %bb.0: # %entry -; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; NOAA-NEXT: vmovups %xmm0, (%rsi) +; NOAA-NEXT: vmovdqu (%rdi), %xmm0 +; NOAA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovdqu %xmm0, (%rsi) ; NOAA-NEXT: retq ; ; AA-LABEL: twoallocs: ; AA: # %bb.0: # %entry -; AA-NEXT: vmovups (%rdi), %xmm0 -; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AA-NEXT: vmovups %xmm0, (%rsi) +; AA-NEXT: vmovdqu (%rdi), %xmm0 +; AA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovdqu %xmm0, (%rsi) ; AA-NEXT: retq entry: %alloc1 = alloca [16 x i8], align 1 @@ -115,18 +115,18 @@ define dso_local void @onealloc_readback_1(i8* nocapture %a, i8* nocapture %b) local_unnamed_addr { ; NOAA-LABEL: onealloc_readback_1: ; NOAA: # %bb.0: # %entry -; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; NOAA-NEXT: vmovups (%rsi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; NOAA-NEXT: vmovups %xmm0, (%rdi) +; NOAA-NEXT: vmovdqu (%rdi), %xmm0 +; NOAA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovdqu (%rsi), %xmm0 +; NOAA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovdqu %xmm0, (%rdi) ; NOAA-NEXT: retq ; ; AA-LABEL: onealloc_readback_1: ; AA: # %bb.0: # %entry -; AA-NEXT: vmovups (%rsi), %xmm0 -; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AA-NEXT: vmovups %xmm0, (%rdi) +; AA-NEXT: vmovdqu (%rsi), %xmm0 +; AA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovdqu %xmm0, (%rdi) ; AA-NEXT: retq entry: %alloc = alloca [16 x i8], i8 2, align 1 @@ -145,18 +145,18 @@ define dso_local void @onealloc_readback_2(i8* nocapture %a, i8* nocapture %b) local_unnamed_addr { ; NOAA-LABEL: onealloc_readback_2: ; NOAA: # %bb.0: # %entry -; NOAA-NEXT: vmovups (%rdi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; NOAA-NEXT: vmovups (%rsi), %xmm0 -; NOAA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; NOAA-NEXT: vmovups %xmm0, (%rdi) +; NOAA-NEXT: vmovdqu (%rdi), %xmm0 +; NOAA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovdqu (%rsi), %xmm0 +; NOAA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; NOAA-NEXT: vmovdqu %xmm0, (%rdi) ; NOAA-NEXT: retq ; ; AA-LABEL: onealloc_readback_2: ; AA: # %bb.0: # %entry -; AA-NEXT: vmovups (%rsi), %xmm0 -; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AA-NEXT: vmovups %xmm0, (%rdi) +; AA-NEXT: vmovdqu (%rsi), %xmm0 +; AA-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovdqu %xmm0, (%rdi) ; AA-NEXT: retq entry: %alloc = alloca [16 x i8], i8 2, align 1 diff --git a/llvm/test/CodeGen/X86/swizzle-avx2.ll b/llvm/test/CodeGen/X86/swizzle-avx2.ll --- a/llvm/test/CodeGen/X86/swizzle-avx2.ll +++ b/llvm/test/CodeGen/X86/swizzle-avx2.ll @@ -14,8 +14,8 @@ define <8 x i32> @swizzle_1(<8 x i32> %v) { ; CHECK-LABEL: swizzle_1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -25,7 +25,7 @@ define <8 x i32> @swizzle_2(<8 x i32> %v) { ; CHECK-LABEL: swizzle_2: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -35,7 +35,7 @@ define <8 x i32> @swizzle_3(<8 x i32> %v) { ; CHECK-LABEL: swizzle_3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -45,8 +45,8 @@ define <8 x i32> @swizzle_4(<8 x i32> %v) { ; CHECK-LABEL: swizzle_4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -56,8 +56,8 @@ define <8 x i32> @swizzle_5(<8 x i32> %v) { ; CHECK-LABEL: swizzle_5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -67,8 +67,8 @@ define <8 x i32> @swizzle_6(<8 x i32> %v) { ; CHECK-LABEL: swizzle_6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -78,8 +78,8 @@ define <8 x i32> @swizzle_7(<8 x i32> %v) { ; CHECK-LABEL: swizzle_7: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll --- a/llvm/test/CodeGen/X86/trunc-subvector.ll +++ b/llvm/test/CodeGen/X86/trunc-subvector.ll @@ -28,7 +28,7 @@ ; ; AVX-LABEL: test2: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> @@ -45,7 +45,7 @@ ; ; AVX-LABEL: test3: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x = sext <8 x i32> %v to <8 x i64> @@ -79,8 +79,8 @@ ; ; AVX2-LABEL: test5: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -126,7 +126,7 @@ ; ; AVX-LABEL: test7: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, 
%ymm0, %xmm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> @@ -143,7 +143,7 @@ ; ; AVX-LABEL: test8: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x = zext <8 x i32> %v to <8 x i64> @@ -177,8 +177,8 @@ ; ; AVX2-LABEL: test10: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll --- a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll +++ b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll @@ -19,7 +19,7 @@ ; ; AVX2-LABEL: load32bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; AVX2-NEXT: retq %A = load <8 x float>, <8 x float>* %Ap, align 16 ret <8 x float> %A @@ -43,7 +43,7 @@ ; ; AVX2-LABEL: store32bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq store <8 x float> %A, <8 x float>* %P, align 16 @@ -66,7 +66,7 @@ ; ; AVX2-LABEL: combine_16_byte_loads_no_intrinsic: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups 48(%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 48(%rdi), %ymm0 ; AVX2-NEXT: retq %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3 %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 @@ -91,7 +91,7 @@ ; ; AVX2-LABEL: combine_16_byte_loads_aligned: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps 48(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 48(%rdi), %ymm0 ; AVX2-NEXT: retq %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3 %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 @@ -117,7 +117,7 @@ ; ; AVX2-LABEL: combine_16_byte_loads_no_intrinsic_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups 64(%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 64(%rdi), %ymm0 ; AVX2-NEXT: retq %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4 %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -493,12 +493,12 @@ ; ; CHECK-AVX2-LABEL: test_urem_one_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_one_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -511,10 +511,20 @@ ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_one_ne: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_one_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_one_ne: +; CHECK-AVX2: # %bb.0: 
+; CHECK-AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_one_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -55,13 +55,13 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: var_shuffle_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %index0 = extractelement <4 x i64> %indices, i32 0 %index1 = extractelement <4 x i64> %indices, i32 1 @@ -101,7 +101,7 @@ ; ; INT256-LABEL: var_shuffle_v8i32: ; INT256: # %bb.0: -; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; INT256-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; INT256-NEXT: retq %index0 = extractelement <8 x i32> %indices, i32 0 %index1 = extractelement <8 x i32> %indices, i32 1 @@ -475,13 +475,13 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: var_shuffle_v4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %index0 = extractelement <4 x i64> %indices, i32 0 %index1 = extractelement <4 x i64> %indices, i32 1 @@ -521,7 +521,7 @@ ; ; INT256-LABEL: var_shuffle_v8f32: ; INT256: # %bb.0: -; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; INT256-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; INT256-NEXT: retq %index0 = extractelement <8 x i32> %indices, i32 0 %index1 = extractelement <8 x i32> %indices, i32 1 @@ -598,14 +598,14 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %index0 = extractelement <4 x i64> %indices, i32 0 %index1 = extractelement <4 x i64> %indices, i32 1 @@ -648,7 +648,7 @@ ; INT256-LABEL: var_shuffle_v8i32_from_v4i32: ; INT256: # %bb.0: # %entry ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; INT256-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; INT256-NEXT: retq entry: %tmp1 = extractelement <8 x i32> %indices, i32 0 @@ -1021,14 +1021,14 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; 
AVX512-NEXT: retq ; ; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpermq %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %index0 = extractelement <4 x i64> %indices, i32 0 %index1 = extractelement <4 x i64> %indices, i32 1 @@ -1071,7 +1071,7 @@ ; INT256-LABEL: var_shuffle_v8f32_from_v4f32: ; INT256: # %bb.0: # %entry ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; INT256-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; INT256-NEXT: retq entry: %tmp1 = extractelement <8 x i32> %indices, i32 0 @@ -1122,7 +1122,7 @@ ; INT256-LABEL: var_shuffle_v4i32_from_v8i32: ; INT256: # %bb.0: # %entry ; INT256-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; INT256-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; INT256-NEXT: vzeroupper ; INT256-NEXT: retq diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -6,7 +6,7 @@ define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <8 x i64> %indices, i32 0 %index1 = extractelement <8 x i64> %indices, i32 1 @@ -38,7 +38,7 @@ define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <16 x i32> %indices, i32 0 %index1 = extractelement <16 x i32> %indices, i32 1 @@ -102,7 +102,7 @@ ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsp) ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 @@ -331,7 +331,7 @@ ; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsp) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 @@ -556,7 +556,7 @@ ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 ; AVX512BW-NEXT: vmovd %xmm4, %eax -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsp) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm0 @@ -973,7 +973,7 @@ define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <8 x i64> %indices, i32 0 %index1 = extractelement <8 x i64> %indices, i32 1 @@ -1005,7 +1005,7 @@ define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: 
vpermd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <16 x i32> %indices, i32 0 %index1 = extractelement <16 x i32> %indices, i32 1 @@ -1069,7 +1069,7 @@ ; AVX512F-NEXT: vpbroadcastd %esi, %zmm2 ; AVX512F-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm1 ; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vmovaps %zmm0, (%rsp) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsp) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 @@ -1301,10 +1301,10 @@ ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 ; AVX512F-NEXT: vpmovsxbd %xmm8, %zmm3 ; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512F-NEXT: vmovaps %zmm3, 192(%rdi) -; AVX512F-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512F-NEXT: vmovaps %zmm2, 64(%rdi) -; AVX512F-NEXT: vmovaps %zmm0, (%rdi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdi) +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rdi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rdi) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdi) ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp ; AVX512F-NEXT: vzeroupper @@ -1320,7 +1320,7 @@ ; AVX512BW-NEXT: vpbroadcastd %esi, %zmm2 ; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %zmm2, %zmm1 ; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsp) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm0 @@ -1552,10 +1552,10 @@ ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovsxbd %xmm8, %zmm3 ; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 -; AVX512BW-NEXT: vmovaps %zmm3, 192(%rdi) -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdi) -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdi) -; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi) ; AVX512BW-NEXT: movq %rbp, %rsp ; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper @@ -1639,10 +1639,10 @@ ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm3, %zmm0 ; AVX512VBMI-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-NEXT: vcvtdq2ps %zmm0, %zmm0 -; AVX512VBMI-NEXT: vmovaps %zmm0, 128(%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm4, 64(%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm1, (%rdi) -; AVX512VBMI-NEXT: vmovaps %zmm2, 192(%rdi) +; AVX512VBMI-NEXT: vmovdqa64 %zmm0, 128(%rdi) +; AVX512VBMI-NEXT: vmovdqa64 %zmm4, 64(%rdi) +; AVX512VBMI-NEXT: vmovdqa64 %zmm1, (%rdi) +; AVX512VBMI-NEXT: vmovdqa64 %zmm2, 192(%rdi) ; AVX512VBMI-NEXT: movq %rbp, %rsp ; AVX512VBMI-NEXT: popq %rbp ; AVX512VBMI-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll b/llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll --- a/llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll +++ b/llvm/test/CodeGen/X86/vec-loadsingles-alignment.ll @@ -8,7 +8,7 @@ define i32 @subb() nounwind ssp { ; CHECK-LABEL: subb: -; CHECK: vmovups e(%rip), %ymm +; CHECK: vmovdqu e(%rip), %ymm entry: %0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @e, i64 0, i64 7), align 4 %1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @e, i64 0, i64 6), align 8 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -197,7 +197,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttpd2qq 
%zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -503,7 +503,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -853,7 +853,7 @@ ; AVX512DQ-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128: ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512DQ-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-32-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-32-NEXT: vzeroupper @@ -861,7 +861,7 @@ ; ; AVX512DQ-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128: ; AVX512DQ-64: # %bb.0: -; AVX512DQ-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-64-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512DQ-64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-64-NEXT: vzeroupper @@ -1488,7 +1488,7 @@ ; AVX512DQ-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128: ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512DQ-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-32-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-32-NEXT: vzeroupper @@ -1496,7 +1496,7 @@ ; ; AVX512DQ-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128: ; AVX512DQ-64: # %bb.0: -; AVX512DQ-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-64-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512DQ-64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-64-NEXT: vzeroupper @@ -1639,7 +1639,7 @@ ; ; AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -1652,7 +1652,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -2555,7 +2555,7 @@ ; ; AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 @@ -2575,7 +2575,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i1: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 @@ -3074,7 +3074,7 @@ ; ; AVX512F-LABEL: strict_vector_fptoui_v4f32_to_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -3087,7 +3087,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i32: ; AVX512DQ: 
# %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -202,7 +202,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -554,7 +554,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -733,7 +733,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -1085,7 +1085,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -1130,7 +1130,7 @@ ; ; AVX512F-LABEL: strict_vector_fptoui_v4f64_to_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -1144,7 +1144,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -1392,7 +1392,7 @@ ; ; AVX512F-LABEL: strict_vector_fptoui_v8f32_to_v8i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: ret{{[l|q]}} @@ -1404,7 +1404,7 @@ ; ; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE41,SSE41-64 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=AVX,AVX1,AVX-32,AVX1-32 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=AVX,AVX1,AVX-64,AVX1-64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | 
FileCheck %s --check-prefixes=AVX,AVX512F,AVX-32 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512F,AVX-32,AVX512F-32 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512F,AVX-64,AVX512F-64 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512vl -O3 | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX-32,AVX512VL-32 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl -O3 | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX-64,AVX512VL-64 @@ -199,29 +199,29 @@ ; SSE41-64-NEXT: movaps %xmm1, %xmm0 ; SSE41-64-NEXT: retq ; -; AVX-32-LABEL: sitofp_v2i64_v2f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $24, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps (%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: sitofp_v2i64_v2f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $24, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps (%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: retl ; ; AVX-64-LABEL: sitofp_v2i64_v2f32: ; AVX-64: # %bb.0: @@ -232,11 +232,59 @@ ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-64-NEXT: retq ; +; AVX512F-32-LABEL: sitofp_v2i64_v2f32: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $24, %esp +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps (%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl +; +; AVX512VL-32-LABEL: 
sitofp_v2i64_v2f32: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $24, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps (%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; ; AVX512DQ-32-LABEL: sitofp_v2i64_v2f32: ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -400,36 +448,36 @@ ; SSE41-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE41-64-NEXT: retq ; -; AVX-32-LABEL: uitofp_v2i64_v2f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $24, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractps $1, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: uitofp_v2i64_v2f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $24, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractps $1, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v2i64_v2f32: ; AVX1-64: # %bb.0: @@ -449,6 +497,37 @@ ; AVX1-64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-64-NEXT: retq ; +; AVX512F-32-LABEL: uitofp_v2i64_v2f32: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $24, %esp +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: uitofp_v2i64_v2f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax @@ -458,6 +537,37 @@ ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: uitofp_v2i64_v2f32: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $24, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: uitofp_v2i64_v2f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax @@ -471,7 +581,7 @@ ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -588,13 +698,13 @@ ; ; AVX512DQVL-32-LABEL: uitofp_v4i1_v4f32: ; AVX512DQVL-32: # %bb.0: -; AVX512DQVL-32-NEXT: vandps {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 +; AVX512DQVL-32-NEXT: vpandd {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: retl ; ; AVX512DQVL-64-LABEL: uitofp_v4i1_v4f32: ; AVX512DQVL-64: # %bb.0: -; AVX512DQVL-64-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-64-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: retq %result = call <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i1(<4 x i1> %x, @@ -797,7 +907,7 @@ ; ; AVX512F-LABEL: uitofp_v4i32_v4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -810,7 +920,7 @@ ; ; AVX512DQ-LABEL: uitofp_v4i32_v4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -901,9 +1011,9 @@ ; ; AVX512F-LABEL: uitofp_v2i1_v2f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; AVX512F-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512F-NEXT: ret{{[l|q]}} ; @@ -923,23 +1033,23 @@ ; ; AVX512DQ-LABEL: uitofp_v2i1_v2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512DQ-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512DQVL-32-LABEL: uitofp_v2i1_v2f64: ; AVX512DQVL-32: # %bb.0: -; AVX512DQVL-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512DQVL-32-NEXT: vandps {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 +; AVX512DQVL-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQVL-32-NEXT: vpandd {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: retl ; ; AVX512DQVL-64-LABEL: uitofp_v2i1_v2f64: ; AVX512DQVL-64: # %bb.0: -; AVX512DQVL-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512DQVL-64-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQVL-64-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: retq %result = call <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i1(<2 x i1> %x, @@ -1208,29 +1318,29 @@ ; SSE41-64-NEXT: movapd %xmm1, %xmm0 ; SSE41-64-NEXT: retq ; -; AVX-32-LABEL: sitofp_v2i64_v2f64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = 
xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstpl (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: sitofp_v2i64_v2f64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $32, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstpl (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: retl ; ; AVX-64-LABEL: sitofp_v2i64_v2f64: ; AVX-64: # %bb.0: @@ -1241,9 +1351,57 @@ ; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-64-NEXT: retq ; +; AVX512F-32-LABEL: sitofp_v2i64_v2f64: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $32, %esp +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstpl (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl +; +; AVX512VL-32-LABEL: sitofp_v2i64_v2f64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $32, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstpl (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; ; AVX512DQ-LABEL: sitofp_v2i64_v2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; 
AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -1396,36 +1554,36 @@ ; SSE41-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-64-NEXT: retq ; -; AVX-32-LABEL: uitofp_v2i64_v2f64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $32, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractps $1, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstpl (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: uitofp_v2i64_v2f64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $32, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractps $1, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstpl (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v2i64_v2f64: ; AVX1-64: # %bb.0: @@ -1458,6 +1616,37 @@ ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-64-NEXT: retq ; +; AVX512F-32-LABEL: uitofp_v2i64_v2f64: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $32, %esp +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstpl (%esp) +; AVX512F-32-NEXT: wait +; 
AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: uitofp_v2i64_v2f64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax @@ -1467,6 +1656,37 @@ ; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: uitofp_v2i64_v2f64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $32, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstpl (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: uitofp_v2i64_v2f64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax @@ -1478,7 +1698,7 @@ ; ; AVX512DQ-LABEL: uitofp_v2i64_v2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -419,7 +419,7 @@ ; ; AVX512F-LABEL: uitofp_v8i32_v8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: ret{{[l|q]}} @@ -431,7 +431,7 @@ ; ; AVX512DQ-LABEL: uitofp_v8i32_v8f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -474,15 +474,15 @@ ; ; AVX2-LABEL: uitofp_v4i1_v4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: uitofp_v4i1_v4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; AVX512F-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: ret{{[l|q]}} ; @@ 
-500,20 +500,20 @@ ; ; AVX512DQ-LABEL: uitofp_v4i1_v4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512DQVL-32-LABEL: uitofp_v4i1_v4f64: ; AVX512DQVL-32: # %bb.0: -; AVX512DQVL-32-NEXT: vandps {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 +; AVX512DQVL-32-NEXT: vpandd {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512DQVL-32-NEXT: retl ; ; AVX512DQVL-64-LABEL: uitofp_v4i1_v4f64: ; AVX512DQVL-64: # %bb.0: -; AVX512DQVL-64-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-64-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512DQVL-64-NEXT: retq %result = call <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i1(<4 x i1> %x, @@ -603,7 +603,7 @@ ; ; AVX512F-LABEL: uitofp_v4i32_v4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: ret{{[l|q]}} @@ -615,7 +615,7 @@ ; ; AVX512DQ-LABEL: uitofp_v4i32_v4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -631,40 +631,40 @@ } define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { -; AVX-32-LABEL: sitofp_v4i64_v4f64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $64, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstpl (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: sitofp_v4i64_v4f64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $64, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) 
+; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstpl (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: sitofp_v4i64_v4f64: ; AVX1-64: # %bb.0: @@ -682,6 +682,41 @@ ; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: sitofp_v4i64_v4f64: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: pushl %ebp +; AVX2-32-NEXT: .cfi_def_cfa_offset 8 +; AVX2-32-NEXT: .cfi_offset %ebp, -8 +; AVX2-32-NEXT: movl %esp, %ebp +; AVX2-32-NEXT: .cfi_def_cfa_register %ebp +; AVX2-32-NEXT: andl $-8, %esp +; AVX2-32-NEXT: subl $64, %esp +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstpl (%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX2-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX2-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-32-NEXT: movl %ebp, %esp +; AVX2-32-NEXT: popl %ebp +; AVX2-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: sitofp_v4i64_v4f64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -698,6 +733,41 @@ ; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-64-NEXT: retq ; +; AVX512F-32-LABEL: sitofp_v4i64_v4f64: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; 
AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstpl (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX512F-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: sitofp_v4i64_v4f64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -714,6 +784,41 @@ ; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: sitofp_v4i64_v4f64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $64, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstpl (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX512VL-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: sitofp_v4i64_v4f64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -732,7 +837,7 @@ ; ; AVX512DQ-LABEL: sitofp_v4i64_v4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -748,55 +853,55 @@ } define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { -; AVX-32-LABEL: uitofp_v4i64_v4f64: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $64, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractps $1, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) 
-; AVX-32-NEXT: fstpl (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $1, %xmm1, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm1, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: retl +; AVX1-32-LABEL: uitofp_v4i64_v4f64: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $64, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractps $1, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstpl (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $1, %xmm1, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm1, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v4i64_v4f64: ; AVX1-64: # %bb.0: @@ -832,6 +937,56 @@ ; AVX1-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: uitofp_v4i64_v4f64: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: pushl %ebp +; AVX2-32-NEXT: .cfi_def_cfa_offset 8 +; AVX2-32-NEXT: .cfi_offset %ebp, -8 +; AVX2-32-NEXT: movl %esp, %ebp +; AVX2-32-NEXT: .cfi_def_cfa_register %ebp +; AVX2-32-NEXT: andl $-8, %esp +; AVX2-32-NEXT: subl $64, %esp +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstpl (%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX2-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX2-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-32-NEXT: movl %ebp, %esp +; AVX2-32-NEXT: popl %ebp +; AVX2-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: uitofp_v4i64_v4f64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpsrlq $32, %ymm0, %ymm1 @@ -866,6 +1021,56 @@ ; AVX2-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-64-NEXT: retq ; +; AVX512F-32-LABEL: uitofp_v4i64_v4f64: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstpl (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = 
mem[0],zero +; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX512F-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: uitofp_v4i64_v4f64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -882,6 +1087,56 @@ ; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: uitofp_v4i64_v4f64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $64, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstpl (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $1, %xmm1, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $3, %xmm1, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX512VL-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: uitofp_v4i64_v4f64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -900,7 +1155,7 @@ ; ; AVX512DQ-LABEL: uitofp_v4i64_v4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: ret{{[l|q]}} @@ -916,40 +1171,40 @@ } define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { -; AVX-32-LABEL: sitofp_v4i64_v4f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $48, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: 
vextractf128 $1, %ymm0, %xmm0 -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fstps (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: vzeroupper -; AVX-32-NEXT: retl +; AVX1-32-LABEL: sitofp_v4i64_v4f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $48, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fstps (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: vzeroupper +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: sitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: @@ -968,6 +1223,41 @@ ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: sitofp_v4i64_v4f32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: pushl %ebp +; AVX2-32-NEXT: .cfi_def_cfa_offset 8 +; AVX2-32-NEXT: .cfi_offset %ebp, -8 +; AVX2-32-NEXT: movl %esp, %ebp +; AVX2-32-NEXT: .cfi_def_cfa_register %ebp +; AVX2-32-NEXT: andl $-8, %esp +; AVX2-32-NEXT: subl $48, %esp +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fstps (%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[2,3] +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX2-32-NEXT: movl %ebp, %esp +; AVX2-32-NEXT: popl %ebp +; AVX2-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX2-32-NEXT: vzeroupper +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: sitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax @@ -985,6 +1275,41 @@ ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq ; +; AVX512F-32-LABEL: sitofp_v4i64_v4f32: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $48, %esp +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fstps (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: sitofp_v4i64_v4f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax @@ -1002,6 +1327,41 @@ ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: sitofp_v4i64_v4f32: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $48, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fstps (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512VL-32-NEXT: movl %ebp, 
%esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: vzeroupper +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: sitofp_v4i64_v4f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax @@ -1021,7 +1381,7 @@ ; ; AVX512DQ-LABEL: sitofp_v4i64_v4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -1039,55 +1399,55 @@ } define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { -; AVX-32-LABEL: uitofp_v4i64_v4f32: -; AVX-32: # %bb.0: -; AVX-32-NEXT: pushl %ebp -; AVX-32-NEXT: .cfi_def_cfa_offset 8 -; AVX-32-NEXT: .cfi_offset %ebp, -8 -; AVX-32-NEXT: movl %esp, %ebp -; AVX-32-NEXT: .cfi_def_cfa_register %ebp -; AVX-32-NEXT: andl $-8, %esp -; AVX-32-NEXT: subl $48, %esp -; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vextractps $1, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps (%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm0, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $1, %xmm1, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vextractps $3, %xmm1, %eax -; AVX-32-NEXT: shrl $31, %eax -; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) -; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) -; AVX-32-NEXT: wait -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX-32-NEXT: movl %ebp, %esp -; AVX-32-NEXT: popl %ebp -; AVX-32-NEXT: .cfi_def_cfa %esp, 4 -; AVX-32-NEXT: vzeroupper -; AVX-32-NEXT: retl +; AVX1-32-LABEL: uitofp_v4i64_v4f32: +; AVX1-32: # %bb.0: +; AVX1-32-NEXT: pushl %ebp +; AVX1-32-NEXT: .cfi_def_cfa_offset 8 +; AVX1-32-NEXT: .cfi_offset %ebp, -8 +; AVX1-32-NEXT: movl %esp, %ebp +; AVX1-32-NEXT: .cfi_def_cfa_register %ebp +; AVX1-32-NEXT: andl $-8, %esp +; AVX1-32-NEXT: subl $48, %esp +; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; AVX1-32-NEXT: vextractps $1, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps (%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm0, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds 
{{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $1, %xmm1, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vextractps $3, %xmm1, %eax +; AVX1-32-NEXT: shrl $31, %eax +; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX1-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX1-32-NEXT: wait +; AVX1-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX1-32-NEXT: movl %ebp, %esp +; AVX1-32-NEXT: popl %ebp +; AVX1-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-32-NEXT: vzeroupper +; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: @@ -1116,6 +1476,56 @@ ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; +; AVX2-32-LABEL: uitofp_v4i64_v4f32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: pushl %ebp +; AVX2-32-NEXT: .cfi_def_cfa_offset 8 +; AVX2-32-NEXT: .cfi_offset %ebp, -8 +; AVX2-32-NEXT: movl %esp, %ebp +; AVX2-32-NEXT: .cfi_def_cfa_register %ebp +; AVX2-32-NEXT: andl $-8, %esp +; AVX2-32-NEXT: subl $48, %esp +; AVX2-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; AVX2-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps (%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-32-NEXT: shrl $31, %eax +; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX2-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX2-32-NEXT: wait +; AVX2-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX2-32-NEXT: movl %ebp, %esp +; AVX2-32-NEXT: popl %ebp +; AVX2-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX2-32-NEXT: vzeroupper +; AVX2-32-NEXT: retl +; ; AVX2-64-LABEL: uitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] @@ -1142,6 +1552,56 @@ ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq ; +; AVX512F-32-LABEL: uitofp_v4i64_v4f32: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-8, %esp +; AVX512F-32-NEXT: subl $48, %esp +; AVX512F-32-NEXT: vmovq 
%xmm0, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512F-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps (%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-32-NEXT: shrl $31, %eax +; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: wait +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: vzeroupper +; AVX512F-32-NEXT: retl +; ; AVX512F-64-LABEL: uitofp_v4i64_v4f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax @@ -1159,6 +1619,56 @@ ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq ; +; AVX512VL-32-LABEL: uitofp_v4i64_v4f32: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $48, %esp +; AVX512VL-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512VL-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps (%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $3, %xmm0, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $1, %xmm1, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vpextrd $3, %xmm1, %eax +; AVX512VL-32-NEXT: shrl $31, %eax +; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; AVX512VL-32-NEXT: fstps 
{{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: vzeroupper +; AVX512VL-32-NEXT: retl +; ; AVX512VL-64-LABEL: uitofp_v4i64_v4f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax @@ -1178,7 +1688,7 @@ ; ; AVX512DQ-LABEL: uitofp_v4i64_v4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -271,21 +271,21 @@ ; NODQ-32-NEXT: .cfi_def_cfa_register %ebp ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $128, %esp -; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) @@ -369,64 +369,64 @@ ; NODQ-32-NEXT: .cfi_def_cfa_register %ebp ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $128, %esp -; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; 
NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractps $1, %xmm2, %eax +; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-32-NEXT: vmovq %xmm3, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm4, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpextrd $1, %xmm2, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm2, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm2, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $1, %xmm3, %eax +; NODQ-32-NEXT: vpextrd $1, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm3, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $1, %xmm0, %eax +; NODQ-32-NEXT: vpextrd $1, %xmm0, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstpl (%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm0, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm0, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $1, %xmm1, %eax +; NODQ-32-NEXT: vpextrd $1, %xmm1, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm1, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm1, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) @@ -498,21 +498,21 @@ ; NODQ-32-NEXT: .cfi_def_cfa_register %ebp ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $96, %esp -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: 
vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) @@ -594,64 +594,64 @@ ; NODQ-32-NEXT: .cfi_def_cfa_register %ebp ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $96, %esp -; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm3 -; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractps $1, %xmm0, %eax +; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm3 +; NODQ-32-NEXT: vmovq %xmm3, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vmovq %xmm4, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpextrd $1, %xmm0, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstps (%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm0, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm0, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $1, %xmm3, %eax +; NODQ-32-NEXT: vpextrd $1, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; 
NODQ-32-NEXT: vextractps $3, %xmm3, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $1, %xmm2, %eax +; NODQ-32-NEXT: vpextrd $1, %xmm2, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm2, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm2, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $1, %xmm1, %eax +; NODQ-32-NEXT: vpextrd $1, %xmm1, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm1, %eax +; NODQ-32-NEXT: vpextrd $3, %xmm1, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.LCPI.*}}(,%eax,4) diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -7,15 +7,35 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512VLDQ define <2 x double> @fabs_v2f64(<2 x double> %p) { -; X86-LABEL: fabs_v2f64: -; X86: # %bb.0: -; X86-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-NEXT: retl +; X86-AVX-LABEL: fabs_v2f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; -; X64-LABEL: fabs_v2f64: -; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: retq +; X86-AVX512VL-LABEL: fabs_v2f64: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fabs_v2f64: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX512VLDQ-NEXT: retl +; +; X64-AVX-LABEL: fabs_v2f64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq +; +; X64-AVX512VL-LABEL: fabs_v2f64: +; X64-AVX512VL: # %bb.0: +; X64-AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX512VL-NEXT: retq +; +; X64-AVX512VLDQ-LABEL: fabs_v2f64: +; X64-AVX512VLDQ: # %bb.0: +; X64-AVX512VLDQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX512VLDQ-NEXT: retq %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p) ret <2 x double> %t } @@ -34,7 +54,7 @@ ; ; X86-AVX512VLDQ-LABEL: fabs_v4f32: ; X86-AVX512VLDQ: # %bb.0: -; X86-AVX512VLDQ-NEXT: vandps {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 +; X86-AVX512VLDQ-NEXT: vpandd {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 ; X86-AVX512VLDQ-NEXT: retl ; ; X64-AVX-LABEL: fabs_v4f32: @@ -49,7 +69,7 @@ ; ; X64-AVX512VLDQ-LABEL: fabs_v4f32: ; X64-AVX512VLDQ: # %bb.0: -; X64-AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512VLDQ-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512VLDQ-NEXT: retq %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p) ret <4 x float> %t @@ -69,7 +89,7 @@ ; ; X86-AVX512VLDQ-LABEL: fabs_v4f64: ; X86-AVX512VLDQ: # %bb.0: -; X86-AVX512VLDQ-NEXT: vandpd {{\.LCPI.*}}{1to4}, %ymm0, %ymm0 +; X86-AVX512VLDQ-NEXT: vpandq {{\.LCPI.*}}{1to4}, %ymm0, %ymm0 ; 
X86-AVX512VLDQ-NEXT: retl ; ; X64-AVX-LABEL: fabs_v4f64: @@ -84,7 +104,7 @@ ; ; X64-AVX512VLDQ-LABEL: fabs_v4f64: ; X64-AVX512VLDQ: # %bb.0: -; X64-AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; X64-AVX512VLDQ-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ; X64-AVX512VLDQ-NEXT: retq %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p) ret <4 x double> %t @@ -104,7 +124,7 @@ ; ; X86-AVX512VLDQ-LABEL: fabs_v8f32: ; X86-AVX512VLDQ: # %bb.0: -; X86-AVX512VLDQ-NEXT: vandps {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 +; X86-AVX512VLDQ-NEXT: vpandd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 ; X86-AVX512VLDQ-NEXT: retl ; ; X64-AVX-LABEL: fabs_v8f32: @@ -119,7 +139,7 @@ ; ; X64-AVX512VLDQ-LABEL: fabs_v8f32: ; X64-AVX512VLDQ: # %bb.0: -; X64-AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; X64-AVX512VLDQ-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; X64-AVX512VLDQ-NEXT: retq %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p) ret <8 x float> %t @@ -141,7 +161,7 @@ ; ; X86-AVX512VLDQ-LABEL: fabs_v8f64: ; X86-AVX512VLDQ: # %bb.0: -; X86-AVX512VLDQ-NEXT: vandpd {{\.LCPI.*}}{1to8}, %zmm0, %zmm0 +; X86-AVX512VLDQ-NEXT: vpandq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0 ; X86-AVX512VLDQ-NEXT: retl ; ; X64-AVX-LABEL: fabs_v8f64: @@ -158,7 +178,7 @@ ; ; X64-AVX512VLDQ-LABEL: fabs_v8f64: ; X64-AVX512VLDQ: # %bb.0: -; X64-AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; X64-AVX512VLDQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; X64-AVX512VLDQ-NEXT: retq %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p) ret <8 x double> %t @@ -180,7 +200,7 @@ ; ; X86-AVX512VLDQ-LABEL: fabs_v16f32: ; X86-AVX512VLDQ: # %bb.0: -; X86-AVX512VLDQ-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 +; X86-AVX512VLDQ-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 ; X86-AVX512VLDQ-NEXT: retl ; ; X64-AVX-LABEL: fabs_v16f32: @@ -197,7 +217,7 @@ ; ; X64-AVX512VLDQ-LABEL: fabs_v16f32: ; X64-AVX512VLDQ: # %bb.0: -; X64-AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; X64-AVX512VLDQ-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 ; X64-AVX512VLDQ-NEXT: retq %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p) ret <16 x float> %t diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -713,7 +713,7 @@ ; ; AVX512-LABEL: const_floor_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <2 x double> @llvm.floor.v2f64(<2 x double> ) ret <2 x double> %t @@ -732,7 +732,7 @@ ; ; AVX512-LABEL: const_floor_v4f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <4 x float> @llvm.floor.v4f32(<4 x float> ) ret <4 x float> %t @@ -751,7 +751,7 @@ ; ; AVX512-LABEL: const_ceil_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [-1.0E+0,3.0E+0] ; AVX512-NEXT: retq %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> ) ret <2 x double> %t @@ -770,7 +770,7 @@ ; ; AVX512-LABEL: const_ceil_v4f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0] ; AVX512-NEXT: retq %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> ) ret <4 x float> %t @@ -789,7 +789,7 @@ ; ; AVX512-LABEL: 
const_trunc_v2f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [-1.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> ) ret <2 x double> %t @@ -808,7 +808,7 @@ ; ; AVX512-LABEL: const_trunc_v4f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0] ; AVX512-NEXT: retq %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> ) ret <4 x float> %t diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -522,7 +522,7 @@ ; ; AVX512F-LABEL: fptoui_4f64_to_2i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -530,14 +530,14 @@ ; ; AVX512VL-LABEL: fptoui_4f64_to_2i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_4f64_to_2i32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -545,7 +545,7 @@ ; ; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq @@ -1948,10 +1948,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_2f64_to_2i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_2f64_to_2i64_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_2f64_to_2i64_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,18446744073709551615] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_2f64_to_2i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [1,18446744073709551615] +; AVX512-NEXT: retq %cvt = fptosi <2 x double> to <2 x i64> ret <2 x i64> %cvt } @@ -1962,10 +1972,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u> ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_2f64_to_2i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_2f64_to_2i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_2f64_to_2i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_2f64_to_2i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX512-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -1978,10 +1998,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f64_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = 
[1,18446744073709551615,2,18446744073709551613] -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_4f64_to_4i64_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f64_to_4i64_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_4f64_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] +; AVX512-NEXT: retq %cvt = fptosi <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -1992,10 +2022,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f64_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_4f64_to_4i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f64_to_4i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_4f64_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,1,4294967294,3] +; AVX512-NEXT: retq %cvt = fptosi <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -2006,10 +2046,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_2f64_to_2i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_2f64_to_2i64_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_2i64_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_2f64_to_2i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4] +; AVX512-NEXT: retq %cvt = fptoui <2 x double> to <2 x i64> ret <2 x i64> %cvt } @@ -2020,10 +2070,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u> ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_2f64_to_2i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_2f64_to_2i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_2i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <2,4,u,u> +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_2f64_to_2i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <2,4,u,u> +; AVX512-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> ret <4 x i32> %ext @@ -2036,10 +2096,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f64_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_4f64_to_4i64_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_4i64_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,4,6,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_4f64_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [2,4,6,8] +; AVX512-NEXT: retq %cvt = fptoui <4 x double> to <4 x i64> ret <4 x i64> %cvt } @@ -2050,10 +2120,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8] ; SSE-NEXT: retq ; -; AVX-LABEL: 
fptoui_4f64_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_4f64_to_4i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f64_to_4i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,6,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_4f64_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,6,8] +; AVX512-NEXT: retq %cvt = fptoui <4 x double> to <4 x i32> ret <4 x i32> %cvt } @@ -2064,10 +2144,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f32_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_4f32_to_4i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f32_to_4i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_4f32_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4294967295,2,3] +; AVX512-NEXT: retq %cvt = fptosi <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2079,10 +2169,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_4f32_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_4f32_to_4i64_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_4f32_to_4i64_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_4f32_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1,18446744073709551615,2,3] +; AVX512-NEXT: retq %cvt = fptosi <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2094,10 +2194,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_8f32_to_8i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_8f32_to_8i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_8f32_to_8i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_8f32_to_8i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] +; AVX512-NEXT: retq %cvt = fptosi <8 x float> to <8 x i32> ret <8 x i32> %cvt } @@ -2108,10 +2218,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f32_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_4f32_to_4i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f32_to_4i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,4,6] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_4f32_to_4i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,4,6] +; AVX512-NEXT: retq %cvt = fptoui <4 x float> to <4 x i32> ret <4 x i32> %cvt } @@ -2123,10 +2243,20 @@ ; SSE-NEXT: movaps 
{{.*#+}} xmm1 = [4,8] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_4f32_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_4f32_to_4i64_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_4f32_to_4i64_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_4f32_to_4i64_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8] +; AVX512-NEXT: retq %cvt = fptoui <4 x float> to <4 x i64> ret <4 x i64> %cvt } @@ -2138,10 +2268,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_8f32_to_8i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_8f32_to_8i32_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_8f32_to_8i32_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptoui_8f32_to_8i32_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] +; AVX512-NEXT: retq %cvt = fptoui <8 x float> to <8 x i32> ret <8 x i32> %cvt } @@ -2158,10 +2298,10 @@ ; SSE-NEXT: pushq %rax ; SSE-NEXT: movl %esi, %ebx ; SSE-NEXT: movzwl %di, %edi -; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: callq __gnu_h2f_ieee@PLT ; SSE-NEXT: cvttss2si %xmm0, %ebp ; SSE-NEXT: movzwl %bx, %edi -; SSE-NEXT: callq __gnu_h2f_ieee +; SSE-NEXT: callq __gnu_h2f_ieee@PLT ; SSE-NEXT: cvttss2si %xmm0, %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movd %ebp, %xmm1 @@ -2179,10 +2319,10 @@ ; VEX-NEXT: pushq %rax ; VEX-NEXT: movl %esi, %ebx ; VEX-NEXT: movzwl %di, %edi -; VEX-NEXT: callq __gnu_h2f_ieee +; VEX-NEXT: callq __gnu_h2f_ieee@PLT ; VEX-NEXT: vcvttss2si %xmm0, %ebp ; VEX-NEXT: movzwl %bx, %edi -; VEX-NEXT: callq __gnu_h2f_ieee +; VEX-NEXT: callq __gnu_h2f_ieee@PLT ; VEX-NEXT: vcvttss2si %xmm0, %eax ; VEX-NEXT: vmovd %eax, %xmm0 ; VEX-NEXT: vmovd %ebp, %xmm1 @@ -2260,10 +2400,10 @@ ; SSE-NEXT: pushq %rbx ; SSE-NEXT: subq $16, %rsp ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: callq __fixtfsi@PLT ; SSE-NEXT: movl %eax, %ebx ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: callq __fixtfsi +; SSE-NEXT: callq __fixtfsi@PLT ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movd %ebx, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -2272,22 +2412,56 @@ ; SSE-NEXT: popq %rbx ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_2f128_to_4i32: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX-NEXT: callq __fixtfsi -; AVX-NEXT: movl %eax, %ebx -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: callq __fixtfsi -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vmovd %ebx, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: addq $16, %rsp -; AVX-NEXT: popq %rbx -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_2f128_to_4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $16, %rsp +; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-NEXT: callq __fixtfsi@PLT +; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __fixtfsi@PLT +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vmovd 
%ebx, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX1-NEXT: addq $16, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_2f128_to_4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $16, %rsp +; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-NEXT: callq __fixtfsi@PLT +; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __fixtfsi@PLT +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vmovd %ebx, %xmm1 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: addq $16, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: fptosi_2f128_to_4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $16, %rsp +; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __fixtfsi@PLT +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixtfsi@PLT +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vmovd %ebx, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512-NEXT: addq $16, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %cvt = fptosi <2 x fp128> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext @@ -2776,7 +2950,7 @@ ; ; AVX512DQ-LABEL: fptosi_2f32_to_2i64_load: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -2864,7 +3038,7 @@ ; ; AVX512DQ-LABEL: fptoui_2f32_to_2i64_load: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll --- a/llvm/test/CodeGen/X86/vec_fpext.ll +++ b/llvm/test/CodeGen/X86/vec_fpext.ll @@ -97,7 +97,7 @@ ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0x01] -; X32-AVX512VL-NEXT: vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X32-AVX512VL-NEXT: vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: fpext_frommem: @@ -115,7 +115,7 @@ ; X64-AVX512VL-LABEL: fpext_frommem: ; X64-AVX512VL: # %bb.0: # %entry ; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0x07] -; X64-AVX512VL-NEXT: vmovups %xmm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x06] +; X64-AVX512VL-NEXT: vmovdqu %xmm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = load <2 x float>, <2 x float>* %in, align 8 @@ -149,7 +149,7 @@ ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), 
%ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x01] -; X32-AVX512VL-NEXT: vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00] +; X32-AVX512VL-NEXT: vmovdqu %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x00] ; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -171,7 +171,7 @@ ; X64-AVX512VL-LABEL: fpext_frommem4: ; X64-AVX512VL: # %bb.0: # %entry ; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x07] -; X64-AVX512VL-NEXT: vmovups %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06] +; X64-AVX512VL-NEXT: vmovdqu %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06] ; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: @@ -212,7 +212,7 @@ ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x01] -; X32-AVX512VL-NEXT: vmovups %zmm0, (%eax) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00] +; X32-AVX512VL-NEXT: vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00] ; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -240,7 +240,7 @@ ; X64-AVX512VL-LABEL: fpext_frommem8: ; X64-AVX512VL: # %bb.0: # %entry ; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x07] -; X64-AVX512VL-NEXT: vmovups %zmm0, (%rsi) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06] +; X64-AVX512VL-NEXT: vmovdqu64 %zmm0, (%rsi) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x06] ; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: @@ -267,8 +267,8 @@ ; ; X32-AVX512VL-LABEL: fpext_fromconst: ; X32-AVX512VL: # %bb.0: # %entry -; X32-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] -; X32-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X32-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] +; X32-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X32-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -288,8 +288,8 @@ ; ; X64-AVX512VL-LABEL: fpext_fromconst: ; X64-AVX512VL: # %bb.0: # %entry -; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: @@ -318,7 +318,7 @@ ; X32-AVX512VL-LABEL: PR42079: ; X32-AVX512VL: # %bb.0: ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X32-AVX512VL-NEXT: vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00] +; X32-AVX512VL-NEXT: vmovdqa (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00] ; X32-AVX512VL-NEXT: vcvtps2pd %xmm0, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xf8,0x5a,0xc0] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; @@ -336,7 +336,7 @@ ; ; X64-AVX512VL-LABEL: PR42079: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07] +; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07] ; X64-AVX512VL-NEXT: vcvtps2pd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %a = load volatile <4 x float>, <4 x float>* %x diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2955,7 +2955,7 @@ ; ; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -2992,11 +2992,23 @@ ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: sitofp_volatile_load_4i32_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_volatile_load_4i32_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_volatile_load_4i32_to_2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX512-NEXT: retq %ld = load volatile <4 x i32>, <4 x i32> *%a %b = shufflevector <4 x i32> %ld, <4 x i32> undef, <2 x i32> %cvt = sitofp <2 x i32> %b to <2 x double> @@ -3026,11 +3038,23 @@ ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sitofp_volatile_load_4i32_to_2f64_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX512-NEXT: retq %a = load volatile <4 x i32>, <4 x i32>* %x %b = sitofp <4 x i32> %a to <4 x double> %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> @@ -3189,7 +3213,7 @@ ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -3359,7 +3383,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -3403,7 +3427,7 @@ ; ; AVX512F-LABEL: uitofp_load_2i32_to_2f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovq 
{{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -3416,7 +3440,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -3470,7 +3494,7 @@ ; ; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -3483,7 +3507,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -3538,7 +3562,7 @@ ; ; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -3546,13 +3570,13 @@ ; ; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -3560,7 +3584,7 @@ ; ; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VLDQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %a = load volatile <4 x i32>, <4 x i32>* %x @@ -3733,7 +3757,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -3797,7 +3821,7 @@ ; ; AVX512F-LABEL: uitofp_load_4i32_to_4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -3809,7 +3833,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -3982,7 +4006,7 @@ ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -4564,7 +4588,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; 
AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -4628,7 +4652,7 @@ ; ; AVX512F-LABEL: uitofp_load_4i32_to_4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -4641,7 +4665,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5179,7 +5203,7 @@ ; ; AVX512F-LABEL: uitofp_load_8i32_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -5191,7 +5215,7 @@ ; ; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -5345,14 +5369,41 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0 -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: vmovaps %ymm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq 24(%rdi), %rax +; AVX512F-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512F-NEXT: vmovaps %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movq 24(%rdi), %rax +; AVX512VL-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa %ymm0, (%rax) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: movq 24(%rdi), %rax +; AVX512DQ-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovaps %ymm0, (%rax) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: aggregate_sitofp_8i16_to_8f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: movq 24(%rdi), %rax +; AVX512VLDQ-NEXT: vpmovsxwd 8(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vmovdqa %ymm0, (%rax) +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq %1 = load %Arguments, %Arguments* %a0, align 1 %2 = extractvalue %Arguments %1, 1 %3 = extractvalue %Arguments %1, 2 @@ -5472,12 +5523,40 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1 -; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; VEX-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; VEX: # %bb.0: +; VEX-NEXT: vcvtdq2ps %xmm0, %xmm1 +; VEX-NEXT: vmovss %xmm0, (%rdi) +; VEX-NEXT: vmovaps %xmm1, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm1 +; AVX512F-NEXT: vmovss %xmm0, (%rdi) +; AVX512F-NEXT: vmovaps 
%xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %xmm0, %xmm1 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) +; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm1 +; AVX512DQ-NEXT: vmovss %xmm0, (%rdi) +; AVX512DQ-NEXT: vmovaps %xmm1, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract0_sitofp_v4i32_f32_multiuse2: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm1 +; AVX512VLDQ-NEXT: vmovd %xmm0, (%rdi) +; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 %r = sitofp i32 %e to float store i32 %e, i32* %p @@ -5598,11 +5677,35 @@ ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: extract3_sitofp_v4i32_f32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq +; VEX-LABEL: extract3_sitofp_v4i32_f32: +; VEX: # %bb.0: +; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; VEX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; VEX-NEXT: retq +; +; AVX512F-LABEL: extract3_sitofp_v4i32_f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: extract3_sitofp_v4i32_f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: extract3_sitofp_v4i32_f32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: extract3_sitofp_v4i32_f32: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 %r = sitofp i32 %e to float ret float %r @@ -5624,11 +5727,23 @@ ; SSE41-NEXT: cvtsi2sd %eax, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: extract3_sitofp_v4i32_f64: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: extract3_sitofp_v4i32_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: extract3_sitofp_v4i32_f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: extract3_sitofp_v4i32_f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX512-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 %r = sitofp i32 %e to double ret double %r @@ -5658,7 +5773,7 @@ ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -5666,13 +5781,13 @@ ; ; AVX512VL-LABEL: extract3_uitofp_v4i32_f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: 
extract3_uitofp_v4i32_f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5680,7 +5795,7 @@ ; ; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 @@ -5712,7 +5827,7 @@ ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -5720,13 +5835,13 @@ ; ; AVX512VL-LABEL: extract3_uitofp_v4i32_f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5734,7 +5849,7 @@ ; ; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 diff --git a/llvm/test/CodeGen/X86/vec_minmax_sint.ll b/llvm/test/CodeGen/X86/vec_minmax_sint.ll --- a/llvm/test/CodeGen/X86/vec_minmax_sint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_sint.ll @@ -1553,10 +1553,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sgt <2 x i64> %1, %2 @@ -1571,10 +1581,20 @@ ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> 
, i64 -1, i32 0 %3 = icmp sgt <4 x i64> %1, %2 @@ -1588,10 +1608,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sgt <4 x i32> %1, %2 @@ -1606,10 +1636,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sgt <8 x i32> %1, %2 @@ -1623,10 +1663,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 %3 = icmp sgt <8 x i16> %1, %2 @@ -1641,10 +1691,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 %3 = icmp sgt <16 x i16> %1, %2 @@ -1658,10 +1718,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v16i8c: -; AVX: # %bb.0: -; 
AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 %3 = icmp sgt <16 x i8> %1, %2 @@ -1675,10 +1745,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sge <2 x i64> %1, %2 @@ -1693,10 +1773,20 @@ ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sge <4 x i64> %1, %2 @@ -1710,10 +1800,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sge <4 x i32> %1, %2 @@ -1728,10 +1828,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = 
[4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sge <8 x i32> %1, %2 @@ -1745,10 +1855,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 %3 = icmp sge <8 x i16> %1, %2 @@ -1763,10 +1883,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 %3 = icmp sge <16 x i16> %1, %2 @@ -1780,10 +1910,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v16i8c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 %3 = icmp sge <16 x i8> %1, %2 @@ -1797,10 +1937,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; 
AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp slt <2 x i64> %1, %2 @@ -1815,10 +1965,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp slt <4 x i64> %1, %2 @@ -1832,10 +1992,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp slt <4 x i32> %1, %2 @@ -1850,10 +2020,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp slt <8 x i32> %1, %2 @@ -1867,10 +2047,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 %3 = icmp slt <8 x i16> %1, %2 @@ -1885,10 +2075,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 
{{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 %3 = icmp slt <16 x i16> %1, %2 @@ -1902,10 +2102,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v16i8c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 %3 = icmp slt <16 x i8> %1, %2 @@ -1919,10 +2129,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp sle <2 x i64> %1, %2 @@ -1937,10 +2157,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp sle <4 x i64> %1, %2 @@ -1954,10 +2184,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq 
+; +; AVX2-LABEL: min_le_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp sle <4 x i32> %1, %2 @@ -1972,10 +2212,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp sle <8 x i32> %1, %2 @@ -1989,10 +2239,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 %3 = icmp sle <8 x i16> %1, %2 @@ -2007,10 +2267,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 %3 = icmp sle <16 x i16> %1, %2 @@ -2024,10 +2294,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v16i8c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; 
AVX512-LABEL: min_le_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 %3 = icmp sle <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -1647,10 +1647,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ugt <2 x i64> %1, %2 @@ -1665,10 +1675,20 @@ ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ugt <4 x i64> %1, %2 @@ -1682,10 +1702,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ugt <4 x i32> %1, %2 @@ -1700,10 +1730,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = 
insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ugt <8 x i32> %1, %2 @@ -1717,10 +1757,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 %3 = icmp ugt <8 x i16> %1, %2 @@ -1735,10 +1785,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 %3 = icmp ugt <16 x i16> %1, %2 @@ -1752,10 +1812,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_gt_v16i8c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_gt_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_gt_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_gt_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 %3 = icmp ugt <16 x i8> %1, %2 @@ -1769,10 +1839,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,7] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp uge <2 x i64> %1, %2 @@ -1787,10 +1867,20 @@ ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i64c: -; AVX: # 
%bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp uge <4 x i64> %1, %2 @@ -1804,10 +1894,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,4294967295,7,7] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp uge <4 x i32> %1, %2 @@ -1822,10 +1922,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp uge <8 x i32> %1, %2 @@ -1839,10 +1949,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 %3 = icmp uge <8 x i16> %1, %2 @@ -1857,10 +1977,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq 
+; +; AVX2-LABEL: max_ge_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 %3 = icmp uge <16 x i16> %1, %2 @@ -1874,10 +2004,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] ; SSE-NEXT: retq ; -; AVX-LABEL: max_ge_v16i8c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] -; AVX-NEXT: retq +; AVX1-LABEL: max_ge_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: max_ge_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX2-NEXT: retq +; +; AVX512-LABEL: max_ge_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 %3 = icmp uge <16 x i8> %1, %2 @@ -1891,10 +2031,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ult <2 x i64> %1, %2 @@ -1909,10 +2059,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ult <4 x i64> %1, %2 @@ -1926,10 +2086,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; 
AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ult <4 x i32> %1, %2 @@ -1944,10 +2114,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ult <8 x i32> %1, %2 @@ -1961,10 +2141,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 1, i32 0 %3 = icmp ult <8 x i16> %1, %2 @@ -1979,10 +2169,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 1, i32 0 %3 = icmp ult <16 x i16> %1, %2 @@ -1996,10 +2196,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_lt_v16i8c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_lt_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_lt_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_lt_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 1, i32 0 %3 = icmp ult <16 x i8> %1, %2 @@ -2013,10 +2223,20 @@ ; SSE-NEXT: 
movaps {{.*#+}} xmm0 = [18446744073709551609,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v2i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v2i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v2i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v2i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551609,1] +; AVX512-NEXT: retq %1 = insertelement <2 x i64> , i64 -7, i32 0 %2 = insertelement <2 x i64> , i64 -1, i32 0 %3 = icmp ule <2 x i64> %1, %2 @@ -2031,10 +2251,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i64c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i64c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v4i64c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i64c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i64> , i64 -7, i32 0 %2 = insertelement <4 x i64> , i64 -1, i32 0 %3 = icmp ule <4 x i64> %1, %2 @@ -2048,10 +2278,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v4i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v4i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v4i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v4i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967289,4294967289,1,1] +; AVX512-NEXT: retq %1 = insertelement <4 x i32> , i32 -7, i32 0 %2 = insertelement <4 x i32> , i32 -1, i32 0 %3 = icmp ule <4 x i32> %1, %2 @@ -2066,10 +2306,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v8i32c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v8i32c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v8i32c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v8i32c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i32> , i32 -7, i32 0 %2 = insertelement <8 x i32> , i32 -1, i32 0 %3 = icmp ule <8 x i32> %1, %2 @@ -2083,10 +2333,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v8i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v8i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX1-NEXT: retq 
+; +; AVX2-LABEL: min_le_v8i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v8i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1] +; AVX512-NEXT: retq %1 = insertelement <8 x i16> , i16 -7, i32 0 %2 = insertelement <8 x i16> , i16 -1, i32 0 %3 = icmp ule <8 x i16> %1, %2 @@ -2101,10 +2361,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v16i16c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v16i16c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v16i16c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v16i16c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i16> , i16 -7, i32 0 %2 = insertelement <16 x i16> , i16 -1, i32 0 %3 = icmp ule <16 x i16> %1, %2 @@ -2118,10 +2388,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: min_le_v16i8c: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: min_le_v16i8c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: min_le_v16i8c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: min_le_v16i8c: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0] +; AVX512-NEXT: retq %1 = insertelement <16 x i8> , i8 -7, i32 0 %2 = insertelement <16 x i8> , i8 -1, i32 0 %3 = icmp ule <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll --- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll +++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll @@ -255,15 +255,25 @@ ; X64-NEXT: movaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] ; X64-NEXT: retq ; -; X32_AVX-LABEL: test5: -; X32_AVX: ## %bb.0: ## %entry -; X32_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] -; X32_AVX-NEXT: retl +; X32_AVX1-LABEL: test5: +; X32_AVX1: ## %bb.0: ## %entry +; X32_AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] +; X32_AVX1-NEXT: retl ; -; X64_AVX-LABEL: test5: -; X64_AVX: ## %bb.0: ## %entry -; X64_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] -; X64_AVX-NEXT: retq +; X64_AVX1-LABEL: test5: +; X64_AVX1: ## %bb.0: ## %entry +; X64_AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] +; X64_AVX1-NEXT: retq +; +; X32_AVX512-LABEL: test5: +; X32_AVX512: ## %bb.0: ## %entry +; X32_AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] +; X32_AVX512-NEXT: retl +; +; X64_AVX512-LABEL: test5: +; X64_AVX512: ## %bb.0: ## %entry +; X64_AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [1.28E+2,1.23321E+2] +; X64_AVX512-NEXT: retq entry: %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> , i32 128) nounwind readnone ret <2 x double> %0 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll 
b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -3214,15 +3214,30 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] ; SSE-NEXT: retq ; -; AVX-LABEL: fold_bitreverse_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] -; AVX-NEXT: retq +; AVX1-LABEL: fold_bitreverse_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; AVX1-NEXT: retq ; -; XOP-LABEL: fold_bitreverse_v16i8: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] -; XOP-NEXT: retq +; AVX2-LABEL: fold_bitreverse_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fold_bitreverse_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; AVX512-NEXT: retq +; +; XOPAVX1-LABEL: fold_bitreverse_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: fold_bitreverse_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; XOPAVX2-NEXT: retq ; ; GFNISSE-LABEL: fold_bitreverse_v16i8: ; GFNISSE: # %bb.0: @@ -3236,17 +3251,17 @@ ; ; GFNIAVX2-LABEL: fold_bitreverse_v16i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512F-LABEL: fold_bitreverse_v16i8: ; GFNIAVX512F: # %bb.0: -; GFNIAVX512F-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] ; GFNIAVX512F-NEXT: retq ; ; GFNIAVX512BW-LABEL: fold_bitreverse_v16i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] +; GFNIAVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143] ; GFNIAVX512BW-NEXT: retq %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> ) ret <16 x i8> %b @@ -3259,15 +3274,30 @@ ; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863] ; SSE-NEXT: retq ; -; AVX-LABEL: fold_bitreverse_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] -; AVX-NEXT: retq +; AVX1-LABEL: fold_bitreverse_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; AVX1-NEXT: retq ; -; XOP-LABEL: fold_bitreverse_v16i16: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] -; XOP-NEXT: retq +; AVX2-LABEL: fold_bitreverse_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; AVX2-NEXT: retq +; +; AVX512-LABEL: fold_bitreverse_v16i16: +; 
AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; AVX512-NEXT: retq +; +; XOPAVX1-LABEL: fold_bitreverse_v16i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: fold_bitreverse_v16i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; XOPAVX2-NEXT: retq ; ; GFNISSE-LABEL: fold_bitreverse_v16i16: ; GFNISSE: # %bb.0: @@ -3282,17 +3312,17 @@ ; ; GFNIAVX2-LABEL: fold_bitreverse_v16i16: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512F-LABEL: fold_bitreverse_v16i16: ; GFNIAVX512F: # %bb.0: -; GFNIAVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] ; GFNIAVX512F-NEXT: retq ; ; GFNIAVX512BW-LABEL: fold_bitreverse_v16i16: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] +; GFNIAVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863] ; GFNIAVX512BW-NEXT: retq %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> ) ret <16 x i16> %b @@ -3315,20 +3345,26 @@ ; ; AVX2-LABEL: fold_bitreverse_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; AVX2-NEXT: retq ; ; AVX512-LABEL: fold_bitreverse_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; AVX512-NEXT: retq ; -; XOP-LABEL: fold_bitreverse_v16i32: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] -; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] -; XOP-NEXT: retq +; XOPAVX1-LABEL: fold_bitreverse_v16i32: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm1 = 
[268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: fold_bitreverse_v16i32: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; XOPAVX2-NEXT: retq ; ; GFNISSE-LABEL: fold_bitreverse_v16i32: ; GFNISSE: # %bb.0: @@ -3346,18 +3382,18 @@ ; ; GFNIAVX2-LABEL: fold_bitreverse_v16i32: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] -; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559] +; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512F-LABEL: fold_bitreverse_v16i32: ; GFNIAVX512F: # %bb.0: -; GFNIAVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; GFNIAVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; GFNIAVX512F-NEXT: retq ; ; GFNIAVX512BW-LABEL: fold_bitreverse_v16i32: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] +; GFNIAVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103] ; GFNIAVX512BW-NEXT: retq %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> ) ret <16 x i32> %b diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -25,10 +25,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_float: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_float: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_float: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq entry: %vsel = select <4 x i1> , <4 x float> %v1, <4 x float> %v2 ret <4 x float> %vsel @@ -52,10 +57,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_float2: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_float2: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_float2: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq 
entry: %vsel = select <4 x i1> , <4 x float> %v1, <4 x float> %v2 ret <4 x float> %vsel @@ -145,10 +155,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_i32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq entry: %vsel = select <4 x i1> , <4 x i32> %v1, <4 x i32> %v2 ret <4 x i32> %vsel @@ -170,10 +185,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_double: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_double: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_double: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq entry: %vsel = select <2 x i1> , <2 x double> %v1, <2 x double> %v2 ret <2 x double> %vsel @@ -195,10 +215,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_i64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq entry: %vsel = select <2 x i1> , <2 x i64> %v1, <2 x i64> %v2 ret <2 x i64> %vsel @@ -298,10 +323,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_float8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_float8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_float8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq entry: %vsel = select <8 x i1> , <8 x float> %v1, <8 x float> %v2 ret <8 x float> %vsel @@ -330,10 +360,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_i328: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_i328: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i328: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq entry: %vsel = select <8 x i1> , <8 x i32> %v1, <8 x i32> %v2 ret <8 x i32> %vsel @@ -364,11 +399,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_double8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_double8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_double8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: retq entry: %vsel = select <8 x i1> , <8 x double> %v1, <8 x double> %v2 ret <8 x double> %vsel @@ -399,11 +440,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_i648: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_i648: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i648: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: retq entry: %vsel = select <8 x i1> , <8 x i64> %v1, <8 x i64> %v2 ret <8 x i64> %vsel @@ -428,10 +475,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_double4: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_double4: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_double4: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: retq entry: %vsel = select <4 x i1> , <4 x double> %v1, <4 x double> %v2 ret <4 x double> %vsel @@ -536,10 +588,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: constant_blendvpd_avx: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: retq +; AVX1-LABEL: constant_blendvpd_avx: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_blendvpd_avx: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: retq entry: %select = select <4 x i1> , <4 x double> %xy, <4 x double> %ab ret <4 x double> %select @@ -572,10 +629,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: constant_blendvps_avx: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX-NEXT: retq +; AVX1-LABEL: constant_blendvps_avx: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_blendvps_avx: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-NEXT: retq entry: %select = select <8 x i1> , <8 x float> %xyzw, <8 x float> %abcd ret <8 x float> %select @@ -657,10 +719,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: blend_shufflevector_4xfloat: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: blend_shufflevector_4xfloat: +; AVX1: # %bb.0: # 
%entry +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_shufflevector_4xfloat: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq entry: %select = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %select @@ -691,10 +758,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: blend_shufflevector_8xfloat: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] -; AVX-NEXT: retq +; AVX1-LABEL: blend_shufflevector_8xfloat: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_shufflevector_8xfloat: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq entry: %select = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %select @@ -716,10 +788,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: blend_shufflevector_4xdouble: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: blend_shufflevector_4xdouble: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_shufflevector_4xdouble: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq entry: %select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %select @@ -744,10 +821,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: blend_shufflevector_4xi64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: blend_shufflevector_4xi64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_shufflevector_4xi64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq entry: %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %select diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -152,21 +152,32 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fmodf +; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_frem_v1f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmodf -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_frem_v1f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; 
AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmodf@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_frem_v1f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fmodf@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %rem = call <1 x float> @llvm.experimental.constrained.frem.v1f32( <1 x float> , @@ -183,33 +194,50 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_frem_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_frem_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_frem_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %rem = call <2 x double> @llvm.experimental.constrained.frem.v2f64( <2 x double> , @@ -226,15 +254,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fmodf +; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fmodf +; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fmodf +; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -244,28 +272,51 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_frem_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmodf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmodf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmodf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_frem_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmodf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmodf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmodf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_frem_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fmodf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fmodf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fmodf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %rem = call <3 x float> @llvm.experimental.constrained.frem.v3f32( <3 x float> , @@ -282,15 +333,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -302,29 +353,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_frem_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq fmod -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_frem_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_frem_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte 
Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %rem = call <3 x double> @llvm.experimental.constrained.frem.v3f64( <3 x double> , @@ -341,21 +416,21 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmod +; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -364,33 +439,61 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_frem_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmod -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_frem_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} 
xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmod@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_frem_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmod@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq %rem = call <4 x double> @llvm.experimental.constrained.frem.v4f64( <4 x double> , @@ -942,21 +1045,32 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq powf +; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_pow_v1f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq powf -; AVX-NEXT: popq %rax -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_pow_v1f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq powf@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_pow_v1f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq powf@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %pow = call <1 x float> @llvm.experimental.constrained.pow.v1f32( <1 x float> , @@ -973,33 +1087,50 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: 
callq pow@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_pow_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_pow_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_pow_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %pow = call <2 x double> @llvm.experimental.constrained.pow.v2f64( <2 x double> , @@ -1016,15 +1147,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq powf +; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq powf +; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq powf +; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1034,28 +1165,51 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_pow_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq powf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq powf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq powf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_pow_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq powf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq powf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq powf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_pow_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq powf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq powf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq powf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %pow = call <3 x float> @llvm.experimental.constrained.pow.v3f32( <3 x float> , @@ -1072,15 +1226,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd 
{{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -1092,29 +1246,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_pow_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq pow -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_pow_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_pow_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %pow = call <3 x double> @llvm.experimental.constrained.pow.v3f64( <3 x double> , @@ -1131,21 +1309,21 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: 
movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq pow +; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -1154,33 +1332,61 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_pow_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq pow -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_pow_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq pow@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_pow_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; 
AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq pow@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %pow = call <4 x double> @llvm.experimental.constrained.pow.v4f64( <4 x double> @llvm.experimental.constrained.powi.v1f32( <1 x float> , @@ -1230,33 +1447,50 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_powi_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_powi_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_powi_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %powi = call <2 x double> @llvm.experimental.constrained.powi.v2f64( <2 x 
double> , @@ -1273,15 +1507,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powisf2 +; CHECK-NEXT: callq __powisf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powisf2 +; CHECK-NEXT: callq __powisf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powisf2 +; CHECK-NEXT: callq __powisf2@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1291,28 +1525,51 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_powi_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powisf2 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powisf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powisf2 -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_powi_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powisf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powisf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powisf2@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_powi_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powisf2@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powisf2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powisf2@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; 
AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %powi = call <3 x float> @llvm.experimental.constrained.powi.v3f32( <3 x float> , @@ -1329,15 +1586,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -1349,29 +1606,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_powi_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_powi_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_powi_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq 
{{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %powi = call <3 x double> @llvm.experimental.constrained.powi.v3f64( <3 x double> , @@ -1388,21 +1669,21 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi -; CHECK-NEXT: callq __powidf2 +; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -1411,33 +1692,61 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_powi_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: movl $3, %edi -; AVX-NEXT: callq __powidf2 -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_powi_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: movl $3, %edi +; AVX1-NEXT: callq __powidf2@PLT +; AVX1-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_powi_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: movl $3, %edi +; AVX512-NEXT: callq __powidf2@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %powi = call <4 x double> @llvm.experimental.constrained.powi.v4f64( <4 x double> @llvm.experimental.constrained.sin.v1f32( - <1 x float> , - metadata !"round.dynamic", +; AVX1-LABEL: constrained_vector_sin_v1f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq sinf@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sin_v1f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq sinf@PLT +; AVX512-NEXT: popq %rax +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq +entry: + %sin = call <1 x float> @llvm.experimental.constrained.sin.v1f32( + <1 x float> , + metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret <1 x float> %sin } @@ -1482,30 +1801,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_sin_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_sin_v2f64: +; AVX1: # %bb.0: # %entry 
+; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sin_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64( <2 x double> , @@ -1520,13 +1854,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1536,25 +1870,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_sin_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq sinf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq sinf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq sinf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_sin_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq sinf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq sinf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq sinf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = 
xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sin_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq sinf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq sinf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq sinf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %sin = call <3 x float> @llvm.experimental.constrained.sin.v3f32( <3 x float> , @@ -1569,13 +1923,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -1587,26 +1941,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_sin_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq sin -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_sin_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sin_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; 
AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %sin = call <3 x double> @llvm.experimental.constrained.sin.v3f64( <3 x double> , @@ -1621,18 +1996,18 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -1641,29 +2016,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_sin_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq sin -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_sin_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq sin@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; 
AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_sin_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq sin@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %sin = call <4 x double> @llvm.experimental.constrained.sin.v4f64( <4 x double> @llvm.experimental.constrained.cos.v1f32( <1 x float> , @@ -1707,30 +2116,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_cos_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_cos_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_cos_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %cos = call <2 x double> 
@llvm.experimental.constrained.cos.v2f64( <2 x double> , @@ -1745,13 +2169,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq cosf +; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq cosf +; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq cosf +; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1761,25 +2185,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_cos_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq cosf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq cosf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq cosf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_cos_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq cosf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq cosf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq cosf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_cos_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq cosf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq cosf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq cosf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %cos = call <3 x float> 
@llvm.experimental.constrained.cos.v3f32( <3 x float> , @@ -1794,13 +2238,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -1812,26 +2256,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_cos_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq cos -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_cos_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_cos_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %cos = call <3 x double> @llvm.experimental.constrained.cos.v3f64( <3 x double> , @@ -1846,18 +2311,18 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: 
callq cos@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq cos +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -1866,29 +2331,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_cos_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq cos -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_cos_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq cos@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_cos_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq cos@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; 
AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %cos = call <4 x double> @llvm.experimental.constrained.cos.v4f64( <4 x double> @llvm.experimental.constrained.exp.v1f32( <1 x float> , @@ -1932,30 +2431,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp = call <2 x double> @llvm.experimental.constrained.exp.v2f64( <2 x double> , @@ -1970,13 +2484,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq expf +; CHECK-NEXT: callq expf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq expf +; CHECK-NEXT: callq expf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq expf +; CHECK-NEXT: callq expf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1986,25 +2500,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; AVX-NEXT: callq expf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq expf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq expf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq expf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq expf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq expf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq expf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq expf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq expf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp = call <3 x float> @llvm.experimental.constrained.exp.v3f32( <3 x float> , @@ -2019,13 +2553,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -2037,26 +2571,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; 
AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq exp -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp = call <3 x double> @llvm.experimental.constrained.exp.v3f64( <3 x double> , @@ -2071,18 +2626,18 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -2091,29 +2646,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # 
xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp = call <4 x double> @llvm.experimental.constrained.exp.v4f64( <4 x double> @llvm.experimental.constrained.exp2.v1f32( <1 x float> , @@ -2157,30 +2746,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp2_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 
16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp2_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp2_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp2 = call <2 x double> @llvm.experimental.constrained.exp2.v2f64( <2 x double> , @@ -2195,13 +2799,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq exp2f +; CHECK-NEXT: callq exp2f@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq exp2f +; CHECK-NEXT: callq exp2f@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq exp2f +; CHECK-NEXT: callq exp2f@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2211,25 +2815,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp2_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq exp2f -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq exp2f -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq exp2f -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp2_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq exp2f@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq exp2f@PLT +; AVX1-NEXT: 
vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq exp2f@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp2_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq exp2f@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq exp2f@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq exp2f@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp2 = call <3 x float> @llvm.experimental.constrained.exp2.v3f32( <3 x float> , @@ -2244,13 +2868,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -2262,26 +2886,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp2_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp2_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: 
vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp2_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp2 = call <3 x double> @llvm.experimental.constrained.exp2.v3f64( <3 x double> , @@ -2296,18 +2941,18 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq exp2 +; CHECK-NEXT: callq exp2@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -2316,29 +2961,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_exp2_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq exp2 -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_exp2_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: 
vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq exp2@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_exp2_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq exp2@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %exp2 = call <4 x double> @llvm.experimental.constrained.exp2.v4f64( <4 x double> @llvm.experimental.constrained.log.v1f32( <1 x float> , @@ -2382,30 +3061,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: 
callq log@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log = call <2 x double> @llvm.experimental.constrained.log.v2f64( <2 x double> , @@ -2420,13 +3114,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq logf +; CHECK-NEXT: callq logf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq logf +; CHECK-NEXT: callq logf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq logf +; CHECK-NEXT: callq logf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2436,25 +3130,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq logf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq logf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq logf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq logf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq logf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq logf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq logf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq logf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq logf@PLT +; AVX512-NEXT: vmovaps 
(%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log = call <3 x float> @llvm.experimental.constrained.log.v3f32( <3 x float> , @@ -2469,13 +3183,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -2487,26 +3201,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq log -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log = call <3 x double> 
@llvm.experimental.constrained.log.v3f64( <3 x double> , @@ -2521,18 +3256,18 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log +; CHECK-NEXT: callq log@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -2541,29 +3276,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) 
# 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log = call <4 x double> @llvm.experimental.constrained.log.v4f64( <4 x double> @llvm.experimental.constrained.log10.v1f32( <1 x float> , @@ -2607,30 +3376,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log10_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log10_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log10_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log10 = call <2 x double> @llvm.experimental.constrained.log10.v2f64( <2 x double> , @@ -2645,13 +3429,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq log10f +; CHECK-NEXT: callq log10f@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq log10f +; CHECK-NEXT: callq log10f@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq log10f +; CHECK-NEXT: callq log10f@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 
16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2661,25 +3445,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log10_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq log10f -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq log10f -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq log10f -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log10_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq log10f@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq log10f@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq log10f@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log10_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq log10f@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq log10f@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq log10f@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log10 = call <3 x float> @llvm.experimental.constrained.log10.v3f32( <3 x float> , @@ -2694,13 +3498,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: 
fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -2712,26 +3516,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log10_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq log10 -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log10_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log10_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log10 = call <3 x double> @llvm.experimental.constrained.log10.v3f64( <3 x double> , @@ -2746,18 +3571,18 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log10 +; CHECK-NEXT: callq log10@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = 
xmm1[0],mem[0] @@ -2766,29 +3591,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log10_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log10 -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log10_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log10@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log10_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log10@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log10 = call <4 x double> @llvm.experimental.constrained.log10.v4f64( <4 x double> @llvm.experimental.constrained.log2.v1f32( <1 x float> , @@ -2832,30 +3691,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 
16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log2_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log2_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log2_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log2 = call <2 x double> @llvm.experimental.constrained.log2.v2f64( <2 x double> , @@ -2870,13 +3744,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq log2f +; CHECK-NEXT: callq log2f@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq log2f +; CHECK-NEXT: callq log2f@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq log2f +; CHECK-NEXT: callq log2f@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2886,25 +3760,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log2_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq log2f -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq log2f -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq log2f -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 
16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log2_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq log2f@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq log2f@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq log2f@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log2_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq log2f@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq log2f@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq log2f@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log2 = call <3 x float> @llvm.experimental.constrained.log2.v3f32( <3 x float> , @@ -2919,13 +3813,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -2937,26 +3831,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log2_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq log2 -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log2_v3f64: +; AVX1: # %bb.0: # %entry 
+; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log2_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log2 = call <3 x double> @llvm.experimental.constrained.log2.v3f64( <3 x double> , @@ -2971,18 +3886,18 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq log2 +; CHECK-NEXT: callq log2@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -2991,29 +3906,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_log2_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq log2 -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded 
Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_log2_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq log2@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_log2_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq log2@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %log2 = call <4 x double> @llvm.experimental.constrained.log2.v4f64( <4 x double> @llvm.experimental.constrained.maxnum.v1f32( <1 x float> , <1 x float> , @@ -3401,33 +4351,50 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_maxnum_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; 
AVX1-LABEL: constrained_vector_maxnum_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_maxnum_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %max = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64( <2 x double> , @@ -3443,15 +4410,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fmaxf +; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fmaxf +; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fmaxf +; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3461,28 +4428,51 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_maxnum_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmaxf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmaxf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fmaxf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_maxnum_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: 
vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmaxf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmaxf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fmaxf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_maxnum_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fmaxf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fmaxf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fmaxf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %max = call <3 x float> @llvm.experimental.constrained.maxnum.v3f32( <3 x float> , @@ -3498,15 +4488,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -3518,29 +4508,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_max_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = 
mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq fmax -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_max_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_max_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %max = call <3 x double> @llvm.experimental.constrained.maxnum.v3f64( <3 x double> , @@ -3556,21 +4570,21 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmax +; CHECK-NEXT: callq fmax@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -3579,33 +4593,61 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_maxnum_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: 
.cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmax -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_maxnum_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmax@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_maxnum_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmax@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %max = 
call <4 x double> @llvm.experimental.constrained.maxnum.v4f64( <4 x double> @llvm.experimental.constrained.minnum.v1f32( <1 x float> , <1 x float> , @@ -3652,33 +4705,50 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_minnum_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_minnum_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_minnum_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %min = call <2 x double> @llvm.experimental.constrained.minnum.v2f64( <2 x double> , @@ -3694,15 +4764,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fminf +; CHECK-NEXT: callq fminf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fminf +; CHECK-NEXT: callq fminf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: callq fminf +; CHECK-NEXT: callq fminf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} 
xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3712,28 +4782,51 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_minnum_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fminf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq fminf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: callq fminf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_minnum_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fminf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fminf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: callq fminf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_minnum_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fminf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fminf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: callq fminf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %min = call <3 x float> @llvm.experimental.constrained.minnum.v3f32( <3 x float> , @@ -3749,15 +4842,15 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = 
mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -3769,29 +4862,53 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_min_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq fmin -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_min_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_min_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq 
entry: %min = call <3 x double> @llvm.experimental.constrained.minnum.v3f64( <3 x double> , @@ -3807,21 +4924,21 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: callq fmin +; CHECK-NEXT: callq fmin@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] @@ -3830,33 +4947,61 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_minnum_v4f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: callq fmin -; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_minnum_v4f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: callq fmin@PLT +; AVX1-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte 
Folded Reload +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_minnum_v4f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: callq fmin@PLT +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %min = call <4 x double> @llvm.experimental.constrained.minnum.v4f64( <4 x double> @llvm.experimental.constrained.round.v1f32( <1 x float> , @@ -5922,30 +7077,45 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq round +; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq round +; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_round_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq round -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq round -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_round_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq round@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq round@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_round_v2f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq round@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq round@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload 
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %round = call <2 x double> @llvm.experimental.constrained.round.v2f64( <2 x double> , @@ -5959,13 +7129,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq roundf +; CHECK-NEXT: callq roundf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq roundf +; CHECK-NEXT: callq roundf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq roundf +; CHECK-NEXT: callq roundf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5975,25 +7145,45 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_round_v3f32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq roundf -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq roundf -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: callq roundf -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: addq $40, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_round_v3f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq roundf@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq roundf@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq roundf@PLT +; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX1-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_round_v3f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq roundf@PLT +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq roundf@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: callq roundf@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX512-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte 
Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %round = call <3 x float> @llvm.experimental.constrained.round.v3f32( <3 x float> , @@ -6008,13 +7198,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq round +; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq round +; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq round +; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -6026,26 +7216,47 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_round_v3f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq round -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq round -; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq round -; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_round_v3f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 64 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq round@PLT +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: callq round@PLT +; AVX1-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq round@PLT +; AVX1-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 +; AVX1-NEXT: retq +; +; AVX512-LABEL: constrained_vector_round_v3f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 64 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq round@PLT +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: callq round@PLT +; AVX512-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq round@PLT +; AVX512-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %round = call <3 x double> @llvm.experimental.constrained.round.v3f64( <3 x double> , @@ -6059,7 +7270,7 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: 
movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq truncf +; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -6082,10 +7293,10 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq trunc +; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq trunc +; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: addq $24, %rsp @@ -6109,13 +7320,13 @@ ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq truncf +; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq truncf +; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq truncf +; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6149,13 +7360,13 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq trunc +; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq trunc +; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: callq trunc +; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) ; CHECK-NEXT: wait @@ -6324,7 +7535,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_sitofp_v2f64_v2i64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -6625,7 +7836,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_sitofp_v4f64_v4i64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -6695,7 +7906,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_sitofp_v4f32_v4i64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper @@ -6994,7 +8205,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_uitofp_v2f64_v2i64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -7425,7 +8636,7 @@ ; ; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512-NEXT: 
# kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq @@ -7460,7 +8671,7 @@ ; ; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper @@ -7591,7 +8802,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_uitofp_v4f64_v4i64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512DQ-NEXT: retq @@ -7714,7 +8925,7 @@ ; ; AVX512DQ-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll --- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll +++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll @@ -67,16 +67,16 @@ ; X86-AVX-NEXT: andl $-32, %esp ; X86-AVX-NEXT: subl $288, %esp # imm = 0x120 ; X86-AVX-NEXT: movl 40(%ebp), %ecx -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovaps %ymm1, (%esp) -; X86-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovdqa %ymm1, (%esp) +; X86-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: leal (%ecx,%ecx), %eax ; X86-AVX-NEXT: andl $31, %eax ; X86-AVX-NEXT: movl 128(%esp,%eax,4), %eax @@ -97,10 +97,10 @@ ; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi ; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,3,3,3] ; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovaps %ymm1, (%rsp) +; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vmovdqa %ymm1, (%rsp) ; X64-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $15, %edi ; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -64,9 +64,9 @@ ; AVX2-LABEL: cvt_16i16_to_16f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vcvtph2ps %xmm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 -; AVX2-NEXT: vmovaps %ymm2, 
%ymm0 +; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_16i16_to_16f32: @@ -122,7 +122,7 @@ ; ; AVX2-LABEL: cvt_16i16_to_16f32_constrained: ; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vcvtph2ps %xmm1, %ymm1 ; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0 ; AVX2-NEXT: retq @@ -312,7 +312,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 ; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX2-NEXT: retq ; @@ -363,7 +363,7 @@ ; AVX2-LABEL: cvt_8i16_to_8f64_constrained: ; AVX2: # %bb.0: ; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX2-NEXT: vcvtps2pd %xmm0, %ymm0 ; AVX2-NEXT: retq @@ -449,7 +449,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vcvtph2ps (%rdi), %ymm1 ; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX2-NEXT: retq ; @@ -535,7 +535,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_16f32_to_16i16: @@ -575,11 +575,23 @@ } define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f32_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rdi) -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f32_to_8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f32_to_8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f32_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -588,11 +600,23 @@ } define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f32_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rdi) -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f32_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f32_to_8i16_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f32_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> @@ -652,81 +676,213 @@ } define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { -; ALL-LABEL: cvt_2f64_to_2i16: -; ALL: # %bb.0: -; ALL-NEXT: subq $40, %rsp -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: 
vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $40, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_2f64_to_2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps (%rsp), %xmm0 +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_2f64_to_2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $40, %rsp +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 +; AVX2-NEXT: addq $40, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_2f64_to_2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> ret <2 x i16> %2 } define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_4i16: -; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $88, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, (%rsp) +; 
AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps (%rsp), %xmm0 +; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 +; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> ret <4 x i16> %2 } define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: 
movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $88, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps (%rsp), %xmm0 +; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 +; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq %1 
= fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -734,31 +890,83 @@ } define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: addq $88, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_8i16_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> @@ -776,13 +984,13 @@ ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %r15d ; AVX1-NEXT: orl %ebx, %r15d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -790,24 +998,24 @@ ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %r14d ; AVX1-NEXT: orl %ebx, %r14d ; AVX1-NEXT: shlq $32, %r14 ; AVX1-NEXT: orq %r15, %r14 ; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[1,0] -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %r15d ; AVX1-NEXT: orl %ebx, %r15d ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -815,11 +1023,11 @@ ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %eax ; AVX1-NEXT: orl %ebx, %eax ; AVX1-NEXT: shlq $32, %rax @@ -839,17 +1047,17 @@ ; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: subq $64, %rsp -; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu 
%ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %r15d ; AVX2-NEXT: orl %ebx, %r15d ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -857,24 +1065,24 @@ ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %r14d ; AVX2-NEXT: orl %ebx, %r14d ; AVX2-NEXT: shlq $32, %r14 ; AVX2-NEXT: orq %r15, %r14 ; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[1,0] -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %r15d ; AVX2-NEXT: orl %ebx, %r15d ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload @@ -882,11 +1090,11 @@ ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %eax ; AVX2-NEXT: orl %ebx, %eax ; AVX2-NEXT: shlq $32, %rax @@ -909,13 +1117,13 @@ ; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %r15d ; AVX512-NEXT: orl %ebx, %r15d ; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload @@ -923,11 +1131,11 @@ ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq 
__truncdfhf2 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %r14d ; AVX512-NEXT: orl %ebx, %r14d ; AVX512-NEXT: shlq $32, %r14 @@ -937,13 +1145,13 @@ ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %r15d ; AVX512-NEXT: orl %ebx, %r15d ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload @@ -951,11 +1159,11 @@ ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %eax ; AVX512-NEXT: orl %ebx, %eax ; AVX512-NEXT: shlq $32, %rax @@ -982,7 +1190,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: pushq %rbx ; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rbx) ; ALL-NEXT: popq %rbx ; ALL-NEXT: retq @@ -993,24 +1201,62 @@ } define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_2f64_to_2i16: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $24, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movl %eax, %ebp -; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rbx) -; ALL-NEXT: movw %bp, 2(%rbx) -; ALL-NEXT: addq $24, %rsp -; ALL-NEXT: popq %rbx -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_2f64_to_2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, (%rbx) +; AVX1-NEXT: movw %bp, 2(%rbx) +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_2f64_to_2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $24, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movl %eax, %ebp +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, (%rbx) +; AVX2-NEXT: movw %bp, 2(%rbx) +; AVX2-NEXT: addq $24, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: 
store_cvt_2f64_to_2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, (%rbx) +; AVX512-NEXT: movw %bp, 2(%rbx) +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> store <2 x i16> %2, <2 x i16>* %a1 @@ -1029,22 +1275,22 @@ ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r14d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r15d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebp ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, 4(%rbx) ; AVX1-NEXT: movw %bp, (%rbx) ; AVX1-NEXT: movw %r15w, 6(%rbx) @@ -1067,22 +1313,22 @@ ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r14d ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, 4(%rbx) ; AVX2-NEXT: movw %bp, (%rbx) ; AVX2-NEXT: movw %r15w, 6(%rbx) @@ -1105,22 +1351,22 @@ ; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r14d ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, 4(%rbx) ; AVX512-NEXT: movw %bp, (%rbx) ; AVX512-NEXT: movw %r15w, 6(%rbx) @@ -1138,35 +1384,95 @@ } define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $80, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $80, %rsp -; ALL-NEXT: popq %rbx -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $80, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps (%rsp), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rbx) +; AVX1-NEXT: addq $80, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $80, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; 
AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rbx) +; AVX2-NEXT: addq $80, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $80, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) +; AVX512-NEXT: addq $80, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -1175,35 +1481,95 @@ } define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f64_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $80, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $80, %rsp -; 
ALL-NEXT: popq %rbx -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $80, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovaps %xmm0, (%rbx) +; AVX1-NEXT: addq $80, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $80, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovdqa %xmm0, (%rbx) +; AVX2-NEXT: addq $80, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $80, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: 
vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) +; AVX512-NEXT: addq $80, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> @@ -1226,41 +1592,41 @@ ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[1,0] -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r12d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r13d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebp ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r14d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r15d ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, 12(%rbx) ; AVX1-NEXT: movw %r15w, 8(%rbx) ; AVX1-NEXT: movw %r14w, 4(%rbx) @@ -1290,45 +1656,45 @@ ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: subq $136, %rsp ; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[1,0] -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq 
__truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r12d ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, 12(%rbx) ; AVX2-NEXT: movw %r15w, 8(%rbx) ; AVX2-NEXT: movw %r14w, 4(%rbx) @@ -1361,44 +1727,44 @@ ; AVX512-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r12d ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, 12(%rbx) ; AVX512-NEXT: movw %r15w, 8(%rbx) ; AVX512-NEXT: movw %r14w, 4(%rbx) diff --git a/llvm/test/CodeGen/X86/vector-idiv.ll b/llvm/test/CodeGen/X86/vector-idiv.ll --- a/llvm/test/CodeGen/X86/vector-idiv.ll +++ b/llvm/test/CodeGen/X86/vector-idiv.ll @@ -10,10 +10,15 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_urem_unary_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_urem_unary_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_urem_unary_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0 %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1 %B9 = urem <2 x i16> %I9, %I9 diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -263,11 +263,11 @@ ; ; AVX2-LABEL: splat2_i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,2,1,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vmovups %ymm0, 32(%rsi) -; AVX2-NEXT: vmovups %ymm1, (%rsi) +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[0,2,1,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %ld32 = load <8 x i32>, <8 x i32>* %s, align 1 @@ -306,11 +306,11 @@ ; ; AVX2-LABEL: splat2_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 32(%rsi) -; AVX2-NEXT: vmovups %ymm1, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %ld32 = load <4 x i64>, <4 x i64>* %s, align 1 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX -; RUN: llc < 
%s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=NOBW,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=NOBW,AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=AVX512VLBWDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512VLCD ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512CD @@ -1669,16 +1669,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv2i64: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] @@ -1693,16 +1703,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv2i64u: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv2i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv2i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv2i64u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] @@ -1717,16 +1737,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv4i32: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] @@ -1741,16 +1771,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = 
[23,0,32,24] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv4i32u: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv4i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv4i32u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] @@ -1765,16 +1805,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv8i16: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv8i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] @@ -1789,16 +1839,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv8i16u: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv8i16u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv8i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv8i16u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] @@ -1813,16 +1873,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv16i8: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv16i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = 
[8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] @@ -1837,16 +1907,26 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; NOBW-LABEL: foldv16i8u: -; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; NOBW-NEXT: retq +; AVX1-LABEL: foldv16i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX2-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv16i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; +; AVX512-LABEL: foldv16i8u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -1100,112 +1100,272 @@ } define <4 x i64> @foldv4i64() nounwind { -; X64-LABEL: foldv4i64: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; X64-NEXT: retq +; AVX1-LABEL: foldv4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv4i64: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> , i1 0) ret <4 x i64> %out } define <4 x i64> @foldv4i64u() nounwind { -; X64-LABEL: foldv4i64u: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; X64-NEXT: retq +; AVX1-LABEL: foldv4i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv4i64u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv4i64u: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv4i64u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> , i1 -1) 
ret <4 x i64> %out } define <8 x i32> @foldv8i32() nounwind { -; X64-LABEL: foldv8i32: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; X64-NEXT: retq +; AVX1-LABEL: foldv8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv8i32: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X32-AVX-NEXT: retl %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out } define <8 x i32> @foldv8i32u() nounwind { -; X64-LABEL: foldv8i32u: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; X64-NEXT: retq +; AVX1-LABEL: foldv8i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv8i32u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv8i32u: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv8i32u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X32-AVX-NEXT: retl %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out } define <16 x i16> @foldv16i16() nounwind { -; X64-LABEL: foldv16i16: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; X64-NEXT: retq +; AVX1-LABEL: foldv16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X32-AVX-NEXT: retl %out = call <16 x i16> 
@llvm.ctlz.v16i16(<16 x i16> , i1 0) ret <16 x i16> %out } define <16 x i16> @foldv16i16u() nounwind { -; X64-LABEL: foldv16i16u: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; X64-NEXT: retq +; AVX1-LABEL: foldv16i16u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv16i16u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16u: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv16i16u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X32-AVX-NEXT: retl %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> , i1 -1) ret <16 x i16> %out } define <32 x i8> @foldv32i8() nounwind { -; X64-LABEL: foldv32i8: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; X64-NEXT: retq +; AVX1-LABEL: foldv32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 0) ret <32 x i8> %out } define <32 x i8> @foldv32i8u() nounwind { -; X64-LABEL: foldv32i8u: -; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; X64-NEXT: retq +; AVX1-LABEL: foldv32i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv32i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: foldv32i8u: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] 
+; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8u: +; AVX512VLBWDQ: # %bb.0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq +; +; AVX512-LABEL: foldv32i8u: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 -1) ret <32 x i8> %out diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -1367,11 +1367,23 @@ ; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: ret{{[l|q]}} ; -; X64-AVX-LABEL: mul_v2i64_0_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; X64-AVX-NEXT: retq +; X64-XOP-LABEL: mul_v2i64_0_1: +; X64-XOP: # %bb.0: +; X64-XOP-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; X64-XOP-NEXT: retq +; +; X64-AVX2-LABEL: mul_v2i64_0_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; X64-AVX2-NEXT: retq +; +; X64-AVX512DQ-LABEL: mul_v2i64_0_1: +; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; X64-AVX512DQ-NEXT: retq %1 = mul <2 x i64> %a0, ret <2 x i64> %1 } diff --git a/llvm/test/CodeGen/X86/vector-partial-undef.ll b/llvm/test/CodeGen/X86/vector-partial-undef.ll --- a/llvm/test/CodeGen/X86/vector-partial-undef.ll +++ b/llvm/test/CodeGen/X86/vector-partial-undef.ll @@ -13,7 +13,7 @@ ; ; AVX-LABEL: xor_insert_insert: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> @@ -30,9 +30,9 @@ ; ; AVX-LABEL: xor_insert_insert_high_half: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> @@ -86,8 +86,8 @@ ; AVX-LABEL: and_undef_elts: ; AVX: # %bb.0: ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,1,2] +; AVX-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,1,2] ; AVX-NEXT: retq %extend = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> %bogus_bo = and <4 x i64> %extend, @@ -107,8 +107,8 @@ ; AVX-LABEL: or_undef_elts: ; AVX: # %bb.0: ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,1,2] +; AVX-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[3,0,1,2] ; AVX-NEXT: retq %extend = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> %bogus_bo = or <4 x i64> %extend, @@ -152,10 +152,10 @@ ; AVX-LABEL: xor_undef_elts_alt: ; AVX: # %bb.0: ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [6,1,5,4,3,2,0,7] -; AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,1,5,4,3,2,0,7] +; AVX-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %extend = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> %bogus_bo = xor <8 x i32> %extend, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -591,19 +591,34 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] -; AVX-NEXT: retq +; AVX1-LABEL: foldv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv2i64: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv2i64: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] ; BITALG-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> ) ret <2 x i64> %out @@ -615,19 +630,34 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv4i32: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv4i32: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] ; BITALG-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> ) ret <4 x i32> %out @@ -639,19 +669,34 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: foldv8i16: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv8i16: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv8i16: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; BITALG-NEXT: retq %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> ) ret <8 x i16> %out @@ -663,19 +708,34 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: foldv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv16i8: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv16i8: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; BITALG-NEXT: retq %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> ) ret <16 x i8> %out diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -306,37 +306,137 @@ } define <4 x i64> @foldv4i64() nounwind { -; ALL-LABEL: foldv4i64: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] -; ALL-NEXT: retq +; AVX1-LABEL: foldv4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,64,0,8] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv4i64: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [1,64,0,8] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv4i64: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [1,64,0,8] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv4i64: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [1,64,0,8] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv4i64: +; BITALG: # %bb.0: 
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [1,64,0,8] +; BITALG-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> ) ret <4 x i64> %out } define <8 x i32> @foldv8i32() nounwind { -; ALL-LABEL: foldv8i32: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: foldv8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv8i32: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv8i32: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv8i32: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv8i32: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; BITALG-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> ) ret <8 x i32> %out } define <16 x i16> @foldv16i16() nounwind { -; ALL-LABEL: foldv16i16: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] -; ALL-NEXT: retq +; AVX1-LABEL: foldv16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv16i16: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv16i16: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv16i16: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv16i16: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; BITALG-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> ) ret <16 x i16> %out } define <32 x i8> @foldv32i8() nounwind { -; ALL-LABEL: foldv32i8: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] -; ALL-NEXT: retq +; AVX1-LABEL: foldv32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv32i8: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv32i8: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv32i8: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: 
vmovdqa {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv32i8: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; BITALG-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> ) ret <32 x i8> %out } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll @@ -15,7 +15,7 @@ ; ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v4i64: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] ; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2OR512VL-NEXT: vzeroupper ; AVX2OR512VL-NEXT: retq @@ -35,7 +35,7 @@ ; ; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f64: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] ; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2OR512VL-NEXT: vzeroupper ; AVX2OR512VL-NEXT: retq @@ -48,12 +48,19 @@ ; vpermps requires a constant load for the index op. It's unlikely to be profitable. define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) { -; ALL-LABEL: unpckh_unary_extracted_v8i32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_unary_extracted_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vzeroupper +; AVX2OR512VL-NEXT: retq %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> @@ -61,12 +68,19 @@ } define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) { -; ALL-LABEL: unpckh_unary_extracted_v8f32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_unary_extracted_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vzeroupper +; AVX2OR512VL-NEXT: retq %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> @@ -123,7 +137,7 @@ ; ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v4i64: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; 
AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2OR512VL-NEXT: vzeroupper ; AVX2OR512VL-NEXT: retq @@ -143,7 +157,7 @@ ; ; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f64: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2OR512VL-NEXT: vzeroupper ; AVX2OR512VL-NEXT: retq @@ -156,12 +170,19 @@ ; vpermps requires a constant load for the index op. It's unlikely to be profitable. define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) { -; ALL-LABEL: unpckl_unary_extracted_v8i32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; AVX1-LABEL: unpckl_unary_extracted_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2OR512VL-NEXT: vzeroupper +; AVX2OR512VL-NEXT: retq %extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> %extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> @@ -169,12 +190,19 @@ } define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) { -; ALL-LABEL: unpckl_unary_extracted_v8f32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; AVX1-LABEL: unpckl_unary_extracted_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2OR512VL-NEXT: vzeroupper +; AVX2OR512VL-NEXT: retq %extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> %extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> %r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> @@ -224,11 +252,17 @@ ; This would infinite loop because we did not recognize the unpack shuffle mask in commuted form. 
define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) { -; ALL-LABEL: extract_unpckl_v8i32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; ALL-NEXT: retq +; AVX1-LABEL: extract_unpckl_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: extract_unpckl_v8i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> ret <8 x i32> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -672,10 +672,25 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } @@ -1638,12 +1653,33 @@ ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: constant_gets_selected: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rdi) -; AVX-NEXT: vmovaps %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: constant_gets_selected: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rdi) +; AVX1-NEXT: vmovaps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: constant_gets_selected: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2OR512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: constant_gets_selected: +; XOPAVX1: # %bb.0: # %entry +; XOPAVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vmovaps %xmm0, (%rdi) +; XOPAVX1-NEXT: vmovaps %xmm0, (%rsi) +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: constant_gets_selected: +; XOPAVX2: # %bb.0: # %entry +; XOPAVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vmovdqa %xmm0, (%rdi) +; XOPAVX2-NEXT: vmovdqa %xmm0, (%rsi) +; XOPAVX2-NEXT: retq entry: %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8> %shuffle.i = shufflevector <16 x i8> , <16 x i8> %weird_zero, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -20,12 +20,12 @@ ; ; 
AVX2-LABEL: shuffle_v2i64_00: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v2i64_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX512VL-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -36,10 +36,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_10: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_10: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_10: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -49,10 +59,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_11: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_11: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_11: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_11: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -69,12 +89,12 @@ ; ; AVX2-LABEL: shuffle_v2i64_22: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v2i64_22: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] +; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -85,10 +105,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -98,10 +128,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_33: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_33: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_33: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_33: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> 
%a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -127,10 +167,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_00: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_00: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_00: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_00: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle } @@ -183,10 +233,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_22: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_22: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_22: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_22: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle } @@ -240,10 +300,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_03: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_03: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_03: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_03: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle } @@ -268,10 +338,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_21: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_21: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_21: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_21: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle } @@ -296,10 +376,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_u2: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_u2: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_u2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_u2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle } @@ -324,10 +414,20 @@ ; SSE-NEXT: movlhps 
{{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_02: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_02: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_02: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_02: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -338,10 +438,20 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_02_copy: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm2[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_02_copy: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm2[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_02_copy: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_02_copy: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -366,10 +476,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_03: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_03: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_03: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_03: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -398,10 +518,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_03_copy: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_03_copy: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_03_copy: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_03_copy: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -473,10 +603,20 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_13: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_13: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_13: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_13: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> 
%shuffle } @@ -487,10 +627,20 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_13_copy: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_13_copy: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_13_copy: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_13_copy: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -501,10 +651,20 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_20: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_20: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_20: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -515,10 +675,20 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_20_copy: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_20_copy: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_20_copy: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_20_copy: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -543,10 +713,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_21: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_21: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_21: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_21: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -575,10 +755,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_21_copy: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_21_copy: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_21_copy: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_21_copy: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 
x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -651,10 +841,20 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_31: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_31: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -665,10 +865,20 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_31_copy: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_31_copy: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_31_copy: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_31_copy: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle } @@ -743,11 +953,23 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_z1: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_z1: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_z1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_z1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> ret <2 x i64> %shuffle } @@ -773,11 +995,23 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_1z: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_1z: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_1z: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_1z: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> ret <2 x double> %shuffle } @@ -790,11 +1024,23 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_z0: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_z0: +; AVX1: # 
%bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_z0: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_z0: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> ret <2 x double> %shuffle } @@ -824,11 +1070,23 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_z1: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_z1: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_z1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_z1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> ret <2 x double> %shuffle } @@ -840,11 +1098,23 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v2f64_bitcast_1z: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2f64_bitcast_1z: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2f64_bitcast_1z: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2f64_bitcast_1z: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512VL-NEXT: retq %shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x float> %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> @@ -874,11 +1144,23 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v2i64_bitcast_z123: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v2i64_bitcast_z123: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_bitcast_z123: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v2i64_bitcast_z123: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512VL-NEXT: retq %bitcast32 = bitcast <2 x i64> %x to <4 x float> %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> , <4 x i32> %bitcast64 = bitcast <4 x float> 
%shuffle32 to <2 x i64> @@ -907,10 +1189,20 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: insert_mem_and_zero_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: insert_mem_and_zero_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_and_zero_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_mem_and_zero_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> @@ -938,10 +1230,20 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: insert_mem_and_zero_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: insert_mem_and_zero_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_and_zero_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_mem_and_zero_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-NEXT: retq %a = load double, double* %ptr %v = insertelement <2 x double> undef, double %a, i32 0 %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> @@ -1100,10 +1402,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_reg_lo_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_reg_lo_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_lo_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_reg_lo_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: retq %v = insertelement <2 x double> undef, double %a, i32 0 %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -1132,10 +1444,20 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: insert_reg_hi_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: retq +; AVX1-LABEL: insert_reg_hi_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_hi_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_reg_hi_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: retq %v = insertelement <2 x double> undef, double %a, i32 0 %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -1178,10 +1500,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_dup_reg_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: insert_dup_reg_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_reg_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_reg_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512VL-NEXT: retq %v = insertelement <2 x double> undef, double %a, i32 0 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> ret <2 x double> %shuffle @@ -1209,10 +1541,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_dup_mem_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: insert_dup_mem_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_mem_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX512VL-NEXT: retq %a = load double, double* %ptr %v = insertelement <2 x double> undef, double %a, i32 0 %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> @@ -1241,10 +1583,20 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_dup_mem128_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: insert_dup_mem128_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_mem128_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem128_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX512VL-NEXT: retq %v = load <2 x double>, <2 x double>* %ptr %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> ret <2 x double> %shuffle @@ -1258,10 +1610,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: insert_dup_mem_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: insert_dup_mem_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_mem_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX512VL-NEXT: retq %tmp = load i64, i64* %ptr, align 1 %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer @@ -1306,10 +1668,20 @@ ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_mem_v2f64_02: -; AVX: # %bb.0: -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_mem_v2f64_02: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_mem_v2f64_02: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_mem_v2f64_02: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512VL-NEXT: retq %b = load <2 x double>, <2 x double>* %pb, align 1 %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -1337,10 +1709,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_mem_v2f64_21: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = 
mem[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_mem_v2f64_21: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_mem_v2f64_21: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_mem_v2f64_21: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; AVX512VL-NEXT: retq %b = load <2 x double>, <2 x double>* %pb, align 1 %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -14,10 +14,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0001: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0001: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_0001: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -27,10 +32,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0020: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0020: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_0020: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -40,10 +50,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0112: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0112: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_0112: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -53,10 +68,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0300: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0300: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_0300: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -66,10 +86,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_1000: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_1000: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_1000: +; AVX2OR512VL: # %bb.0: +; 
AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -79,10 +104,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_2200: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_2200: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_2200: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -92,10 +122,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_3330: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_3330: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_3330: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -105,10 +140,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_3210: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_3210: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -119,10 +159,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_2121: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_2121: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_2121: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -133,10 +178,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_0001: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0001: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0001: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -146,10 +196,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_0020: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0020: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0020: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] +; 
AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -159,10 +214,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_0300: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0300: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0300: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -172,10 +232,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_1000: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_1000: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_1000: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -185,10 +250,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_2200: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_2200: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_2200: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -198,10 +268,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_3330: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_3330: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_3330: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -211,10 +286,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_3210: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_3210: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -224,10 +304,15 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_0011: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0011: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0011: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2OR512VL-NEXT: retq 
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -237,10 +322,15 @@ ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_2233: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_2233: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_2233: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -307,10 +397,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_0145: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0145: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0145: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -321,10 +416,15 @@ ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_6723: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_6723: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_6723: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -362,8 +462,8 @@ ; ; AVX2-LABEL: shuffle_v4i32_0124: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0124: @@ -409,9 +509,9 @@ ; ; AVX2-LABEL: shuffle_v4i32_0142: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0142: @@ -460,9 +560,9 @@ ; ; AVX2-LABEL: shuffle_v4i32_0412: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0412: @@ -501,11 +601,17 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v4i32_4012: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_4012: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] +; AVX1-NEXT: vblendps {{.*#+}} 
xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i32_4012: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,2] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_4012: ; AVX512VL: # %bb.0: @@ -521,10 +627,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0145: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0145: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_0145: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -563,9 +674,9 @@ ; ; AVX2-LABEL: shuffle_v4i32_0451: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0451: @@ -583,10 +694,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_4501: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_4501: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_4501: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -625,9 +741,9 @@ ; ; AVX2-LABEL: shuffle_v4i32_4015: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_4015: @@ -667,11 +783,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_4zzz: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_4zzz: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_4zzz: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> ret <4 x float> %shuffle } @@ -812,11 +934,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_zzz7: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_zzz7: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; 
AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_zzz7: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> ret <4 x float> %shuffle } @@ -887,11 +1015,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_0z23: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0z23: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0z23: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle } @@ -924,11 +1058,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_01z3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_01z3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_01z3: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle } @@ -961,11 +1101,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_012z: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_012z: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_012z: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle } @@ -998,11 +1144,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_0zz3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0zz3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0zz3: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle } @@ -1035,11 +1187,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; 
AVX-LABEL: shuffle_v4f32_0z2z: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_0z2z: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_0z2z: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %v, <4 x float> , <4 x i32> ret <4 x float> %shuffle } @@ -1051,10 +1209,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4f32_u051: -; AVX: # %bb.0: -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4f32_u051: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4f32_u051: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle } @@ -1213,11 +1376,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_4zzz: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_4zzz: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_4zzz: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> ret <4 x i32> %shuffle } @@ -1260,9 +1429,9 @@ ; ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz: @@ -1316,9 +1485,9 @@ ; ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z: @@ -1386,9 +1555,9 @@ ; ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i32_z6zz: @@ -1617,10 
+1786,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_40u1: -; AVX: # %bb.0: -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_40u1: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_40u1: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle } @@ -1755,11 +1929,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0z23: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0z23: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_0z23: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1786,11 +1966,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_01z3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_01z3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_01z3: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1817,11 +2003,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_012z: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_012z: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_012z: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1848,11 +2040,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0zz3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_0zz3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_0zz3: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm1[1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1863,10 +2061,15 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_bitcast_0415: -; AVX: # %bb.0: -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_bitcast_0415: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v4i32_bitcast_0415: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2OR512VL-NEXT: retq %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double> %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> @@ -1936,10 +2139,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: mask_v4f32_4127: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: mask_v4f32_4127: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: mask_v4f32_4127: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2OR512VL-NEXT: retq %1 = bitcast <4 x float> %a to <4 x i32> %2 = bitcast <4 x float> %b to <4 x i32> %3 = and <4 x i32> %1, @@ -1976,10 +2184,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: mask_v4f32_0127: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: mask_v4f32_0127: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: mask_v4f32_0127: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %1 = bitcast <4 x float> %a to <2 x i64> %2 = bitcast <4 x float> %b to <2 x i64> %3 = and <2 x i64> %1, @@ -2016,10 +2229,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: mask_v4i32_0127: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: mask_v4i32_0127: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: mask_v4i32_0127: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %1 = bitcast <4 x i32> %a to <2 x i64> %2 = bitcast <4 x i32> %b to <2 x i64> %3 = and <2 x i64> %1, @@ -2051,10 +2269,15 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: broadcast_v4f32_0101_from_v2f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: broadcast_v4f32_0101_from_v2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: broadcast_v4f32_0101_from_v2f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2OR512VL-NEXT: retq %1 = load <2 x float>, <2 x float>* %x, align 1 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> ret <4 x float> %2 @@ -2091,11 +2314,17 @@ ; SSE41-NEXT: pinsrd $0, %eax, %xmm0 ; SSE41-NEXT: retq ; 
-; AVX-LABEL: extract3_insert0_v4i32_7123: -; AVX: # %bb.0: -; AVX-NEXT: vextractps $3, %xmm1, %eax -; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: extract3_insert0_v4i32_7123: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractps $3, %xmm1, %eax +; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: extract3_insert0_v4i32_7123: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpextrd $3, %xmm1, %eax +; AVX2OR512VL-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 +; AVX2OR512VL-NEXT: retq %1 = extractelement <4 x i32> %a1, i32 3 %2 = insertelement <4 x i32> %a0, i32 %1, i32 0 ret <4 x i32> %2 @@ -2125,10 +2354,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: extract3_insert3_v4i32_0127: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: extract3_insert3_v4i32_0127: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: extract3_insert3_v4i32_0127: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2OR512VL-NEXT: retq %1 = extractelement <4 x i32> %a1, i32 3 %2 = insertelement <4 x i32> %a0, i32 %1, i32 3 ret <4 x i32> %2 @@ -2155,10 +2389,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: insert_mem_and_zero_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: insert_mem_and_zero_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_mem_and_zero_v4i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512VL-NEXT: retq %a = load i32, i32* %ptr %v = insertelement <4 x i32> undef, i32 %a, i32 0 %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> @@ -2193,11 +2432,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_reg_and_zero_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_reg_and_zero_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_reg_and_zero_v4f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2OR512VL-NEXT: retq %v = insertelement <4 x float> undef, float %a, i32 0 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle @@ -2209,10 +2454,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: insert_mem_and_zero_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: insert_mem_and_zero_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_mem_and_zero_v4f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512VL-NEXT: retq %a = load float, float* %ptr %v = insertelement <4 x float> undef, float %a, i32 0 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> @@ -2283,11 +2533,17 @@ ; SSE41-NEXT: 
blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_mem_lo_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_mem_lo_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_mem_lo_v4i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2OR512VL-NEXT: retq %a = load <2 x i32>, <2 x i32>* %ptr %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> @@ -2319,11 +2575,17 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: insert_mem_hi_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: insert_mem_hi_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_mem_hi_v4i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2OR512VL-NEXT: retq %a = load <2 x i32>, <2 x i32>* %ptr %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> @@ -2351,10 +2613,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_reg_lo_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: insert_reg_lo_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_reg_lo_v4f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2OR512VL-NEXT: retq %a.cast = bitcast double %a to <2 x float> %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> @@ -2384,10 +2651,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: insert_reg_hi_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: retq +; AVX1-LABEL: insert_reg_hi_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_reg_hi_v4f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2OR512VL-NEXT: retq %a.cast = bitcast double %a to <2 x float> %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> @@ -2418,10 +2690,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_mem_v4f32_3210: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_mem_v4f32_3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_mem_v4f32_3210: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,2,1,0] +; AVX2OR512VL-NEXT: retq %a = load <4 x float>, <4 x float>* %ptr %shuffle = 
shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> ret <4 x float> %shuffle @@ -2434,10 +2711,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: insert_dup_mem_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_dup_mem_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_dup_mem_v4i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2OR512VL-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer @@ -2467,11 +2749,11 @@ ; ; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi) +; AVX2OR512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2OR512VL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX2OR512VL-NEXT: retq %1 = load <2 x float>, <2 x float>* %p0 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> @@ -2519,10 +2801,15 @@ ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_mem_v4f32_0145: -; AVX: # %bb.0: -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_mem_v4f32_0145: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_mem_v4f32_0145: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2OR512VL-NEXT: retq %b = load <4 x float>, <4 x float>* %pb, align 1 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -2550,10 +2837,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_mem_v4f32_4523: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_mem_v4f32_4523: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_mem_v4f32_4523: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; AVX2OR512VL-NEXT: retq %b = load <4 x float>, <4 x float>* %pb, align 1 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -16,10 +16,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_01012323: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_01012323: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i16_01012323: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2OR512VL-NEXT: retq 
+; +; XOPAVX1-LABEL: shuffle_v8i16_01012323: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v8i16_01012323: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -29,10 +44,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_67452301: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_67452301: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i16_67452301: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v8i16_67452301: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v8i16_67452301: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -236,10 +266,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_23016745: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_23016745: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i16_23016745: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v8i16_23016745: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v8i16_23016745: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -1369,10 +1414,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_XXXdXXXX: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,3,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_XXXdXXXX: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i16_XXXdXXXX: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v8i16_XXXdXXXX: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,3,3] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v8i16_XXXdXXXX: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } @@ -3035,10 +3095,25 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: mask_v8i16_012345ef: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: mask_v8i16_012345ef: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: mask_v8i16_012345ef: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq +; +; 
XOPAVX1-LABEL: mask_v8i16_012345ef: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: mask_v8i16_012345ef: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; XOPAVX2-NEXT: retq %1 = bitcast <8 x i16> %a to <2 x i64> %2 = bitcast <8 x i16> %b to <2 x i64> %3 = and <2 x i64> %1, diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1310,19 +1310,49 @@ } define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -1470,10 +1500,25 @@ } define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] 
-; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -5609,10 +5654,25 @@ } define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle } @@ -7179,10 +7239,25 @@ } define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; XOPAVX2-NEXT: retq %alo = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> %shuf = shufflevector <8 x i16> %alo, <8 x i16> %bhi, <16 x i32> @@ -7190,20 +7265,25 @@ } define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x 
i16> %b) { -; AVX1OR2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq ; -; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; XOPAVX1-NEXT: retq ; -; XOP-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: -; XOP: # %bb.0: -; XOP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; XOP-NEXT: retq +; XOPAVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; XOPAVX2-NEXT: retq %ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> %bc0hi = bitcast <8 x i16> %ahi to <16 x i8> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2233,10 +2233,25 @@ } define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) { -; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: -; ALL: # %bb.0: -; ALL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -14,12 +14,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_0000: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_0000: ; AVX512VL: # %bb.0: 
-; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -34,12 +34,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_0001: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_0001: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -54,12 +54,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_0020: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_0020: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -75,12 +75,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_0300: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_0300: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -95,12 +95,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_1000: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_1000: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -115,12 +115,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_2200: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_2200: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -135,12 +135,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_2222: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_2222: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -155,12 +155,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_2222_bc: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_2222_bc: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512VL-NEXT: retq %tmp0 = bitcast <4 x i64> %a to <4 x double> %tmp1 
= bitcast <4 x i64> %b to <4 x double> @@ -177,12 +177,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_2233: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_2233: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -198,12 +198,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_3330: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_3330: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -218,12 +218,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_3210: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_3210: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -304,28 +304,34 @@ ; ; AVX2-LABEL: shuffle_v4f64_0213: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_0213: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) { -; AVX1OR2-LABEL: shuffle_v4f64_0423: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_0423: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0423: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4f64_0423: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4f64_0423: @@ -338,12 +344,19 @@ } define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) { -; AVX1OR2-LABEL: shuffle_v4f64_0462: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_0462: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0462: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_0462: ; AVX512VL: # %bb.0: @@ -355,37 +368,77 @@ } define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_0426: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_0426: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0426: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0426: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_1537: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_1537: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_1537: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1537: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_4062: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_4062: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_4062: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_4062: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_5173: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_5173: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_5173: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_5173: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512VL-NEXT: retq %shuffle = 
shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -400,46 +453,96 @@ } define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_0527: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_0527: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0527: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0527: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_4163: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_4163: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_4163: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_4163: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_0145: -; ALL: # %bb.0: -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_0145: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0145: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0145: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_4501: -; ALL: # %bb.0: -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_4501: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_4501: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_4501: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_0167: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_0167: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0167: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq +; 
+; AVX512VL-LABEL: shuffle_v4f64_0167: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -542,9 +645,9 @@ ; ; AVX2-LABEL: shuffle_v4f64_0415: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_0415: @@ -557,19 +660,39 @@ } define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_u062: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_u062: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_u062: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_u062: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: shuffle_v4f64_15uu: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_15uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_15uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_15uu: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -592,12 +715,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_22uu: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_22uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -612,12 +735,12 @@ ; ; AVX2-LABEL: shuffle_v4f64_3333: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4f64_3333: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle @@ -675,14 +798,14 @@ ; ; AVX2-LABEL: shuffle_v4f64_0044: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: 
shuffle_v4f64_0044: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4f64_0044: @@ -695,12 +818,26 @@ } define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b) { -; ALL-LABEL: shuffle_v4f64_0044_v2f64: -; ALL: # %bb.0: -; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_0044_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0044_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0044_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %2 = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> %3 = shufflevector <2 x double> %1, <2 x double> %2, <4 x i32> @@ -722,11 +859,23 @@ ;PR34359 define <4 x double> @shuffle_v4f64_2345_0567_select(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { -; ALL-LABEL: shuffle_v4f64_2345_0567_select: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4f64_2345_0567_select: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_2345_0567_select: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_2345_0567_select: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %vec3 ret <4 x double> %res @@ -741,12 +890,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_0000: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0000: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -761,12 +910,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_0001: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0001: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-NEXT: retq 
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -781,12 +930,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_0020: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0020: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -801,12 +950,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_0112: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0112: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -822,12 +971,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_0300: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0300: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -842,12 +991,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_1000: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_1000: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -862,12 +1011,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_2200: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_2200: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -883,12 +1032,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_3330: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_3330: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -903,12 +1052,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_3210: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_3210: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -924,12 +1073,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_0213: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0213: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -944,14 +1093,14 @@ ; ; AVX2-LABEL: shuffle_v4i64_0124: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_0124: @@ -972,9 +1121,9 @@ ; ; AVX2-LABEL: shuffle_v4i64_0142: ; AVX2: # %bb.0: -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0142: @@ -996,9 +1145,9 @@ ; ; AVX2-LABEL: shuffle_v4i64_0412: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0412: @@ -1019,14 +1168,14 @@ ; ; AVX2-LABEL: shuffle_v4i64_4012: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_4012: @@ -1039,10 +1188,20 @@ } define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_0145: -; ALL: # %bb.0: -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_0145: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_0145: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0145: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1057,9 +1216,9 @@ ; ; AVX2-LABEL: shuffle_v4i64_0451: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[0,0,1,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0451: @@ -1072,10 +1231,20 @@ } define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_4501: -; ALL: # %bb.0: -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_4501: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_4501: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_4501: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1090,9 +1259,9 @@ ; ; AVX2-LABEL: shuffle_v4i64_4015: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_4015: @@ -1114,14 +1283,14 @@ ; ; AVX2-LABEL: shuffle_v4i64_2u35: ; AVX2: # %bb.0: -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,1] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35: @@ -1143,9 +1312,9 @@ ; ; AVX2-LABEL: shuffle_v4i64_1251: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_1251: @@ -1166,14 +1335,14 @@ ; ; AVX2-LABEL: shuffle_v4i64_1054: ; AVX2: # %bb.0: -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1054: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_1054: @@ -1194,8 +1363,8 @@ ; ; 
AVX2-LABEL: shuffle_v4i64_3254: ; AVX2: # %bb.0: -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_3254: @@ -1222,8 +1391,8 @@ ; ; AVX2-LABEL: shuffle_v4i64_3276: ; AVX2: # %bb.0: -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_3276: @@ -1250,14 +1419,14 @@ ; ; AVX2-LABEL: shuffle_v4i64_1076: ; AVX2: # %bb.0: -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_1076: @@ -1279,9 +1448,9 @@ ; ; AVX2-LABEL: shuffle_v4i64_0415: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0415: @@ -1334,28 +1503,58 @@ } define <4 x i64> @shuffle_v4i64_40u2(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_40u2: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_40u2: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_40u2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_40u2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_15uu: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_15uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_15uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_15uu: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) 
{ -; ALL-LABEL: shuffle_v4i64_11uu: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_11uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_11uu: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_11uu: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -1369,12 +1568,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_22uu: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_22uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1389,12 +1588,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_3333: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_3333: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1421,12 +1620,26 @@ } define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_1032_v2i64: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_1032_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_1032_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> @@ -1455,10 +1668,20 @@ } define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) { -; ALL-LABEL: insert_mem_and_zero_v4i64: -; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: retq +; AVX1-LABEL: insert_mem_and_zero_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_and_zero_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_mem_and_zero_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 
x i64> %v, <4 x i64> zeroinitializer, <4 x i32> @@ -1476,10 +1699,20 @@ } define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) { -; ALL-LABEL: insert_mem_and_zero_v4f64: -; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: retq +; AVX1-LABEL: insert_mem_and_zero_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_and_zero_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_mem_and_zero_v4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-NEXT: retq %a = load double, double* %ptr %v = insertelement <4 x double> undef, double %a, i32 0 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> @@ -1487,10 +1720,20 @@ } define <4 x double> @splat_mem_v4f64(double* %ptr) { -; ALL-LABEL: splat_mem_v4f64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: splat_mem_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %a = load double, double* %ptr %v = insertelement <4 x double> undef, double %a, i32 0 %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> @@ -1498,10 +1741,20 @@ } define <4 x i64> @splat_mem_v4i64(i64* %ptr) { -; ALL-LABEL: splat_mem_v4i64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: splat_mem_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> @@ -1509,10 +1762,20 @@ } define <4 x double> @splat_mem_v4f64_2(double* %p) { -; ALL-LABEL: splat_mem_v4f64_2: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: splat_mem_v4f64_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4f64_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4f64_2: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %1 = load double, double* %p %2 = insertelement <2 x double> undef, double %1, i32 0 %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer @@ -1528,42 +1791,67 @@ ; ; AVX2-LABEL: splat_v4f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: splat_v4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512VL-NEXT: retq %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer ret <4 x double> %1 } define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) { -; ALL-LABEL: splat_mem_v4i64_from_v2i64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: 
splat_mem_v4i64_from_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4i64_from_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4i64_from_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %v = load <2 x i64>, <2 x i64>* %ptr %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> ret <4 x i64> %shuffle } define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) { -; ALL-LABEL: splat_mem_v4f64_from_v2f64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: splat_mem_v4f64_from_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat_mem_v4f64_from_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4f64_from_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %v = load <2 x double>, <2 x double>* %ptr %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> ret <4 x double> %shuffle } define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) { -; AVX1OR2-LABEL: splat128_mem_v4i64_from_v2i64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: splat128_mem_v4i64_from_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat128_mem_v4i64_from_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64: ; AVX512VL: # %bb.0: @@ -1575,10 +1863,20 @@ } define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) { -; ALL-LABEL: splat128_mem_v4f64_from_v2f64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; ALL-NEXT: retq +; AVX1-LABEL: splat128_mem_v4f64_from_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: splat128_mem_v4f64_from_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat128_mem_v4f64_from_v2f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX512VL-NEXT: retq %v = load <2 x double>, <2 x double>* %ptr %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> ret <4 x double> %shuffle @@ -1593,12 +1891,12 @@ ; ; AVX2-LABEL: broadcast_v4f64_0000_from_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: broadcast_v4f64_0000_from_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512VL-NEXT: retq %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> %2 = bitcast <4 x i64> %1 to <4 x double> @@ -1607,10 +1905,20 @@ } define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: bitcast_v4f64_0426: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; ALL-NEXT: retq +; AVX1-LABEL: bitcast_v4f64_0426: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitcast_v4f64_0426: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq 
{{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: bitcast_v4f64_0426: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512VL-NEXT: retq %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float> %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> @@ -1621,10 +1929,20 @@ } define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) { -; ALL-LABEL: concat_v4i64_0167: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: concat_v4i64_0167: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_v4i64_0167: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: concat_v4i64_0167: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-NEXT: retq %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %shuffle64 = shufflevector <2 x i64> %a0lo, <2 x i64> %a1hi, <4 x i32> @@ -1632,10 +1950,20 @@ } define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) { -; ALL-LABEL: concat_v4i64_0145_bc: -; ALL: # %bb.0: -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: concat_v4i64_0145_bc: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_v4i64_0145_bc: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: concat_v4i64_0145_bc: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %bc0lo = bitcast <2 x i64> %a0lo to <4 x i32> @@ -1646,10 +1974,20 @@ } define <4 x i64> @insert_dup_mem_v4i64(i64* %ptr) { -; ALL-LABEL: insert_dup_mem_v4i64: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: insert_dup_mem_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_mem_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %tmp = load i64, i64* %ptr, align 1 %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <4 x i32> zeroinitializer @@ -1686,12 +2024,12 @@ ; ; AVX2-LABEL: shuffle_v4i64_1230: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_1230: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> ret <4 x i64> %shuffle @@ -1706,9 +2044,9 @@ ; ; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3] -; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i64_z0z3: @@ -1718,9 +2056,9 @@ ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3] -; AVX512VL-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3: @@ -1740,9 +2078,9 @@ ; ; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i64_1z2z: @@ -1752,9 +2090,9 @@ ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z: @@ -1968,11 +2306,17 @@ } define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 { -; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_0zzzzzzz_pgso: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_0zzzzzzz_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso: ; AVX512VL: # %bb.0: @@ -1984,11 +2328,17 @@ } define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 { -; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_0zzzzzzz_pgso: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_0zzzzzzz_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso: ; AVX512VL: # %bb.0: @@ -2000,21 +2350,45 @@ } define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) { -; ALL-LABEL: unpckh_v4i64: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vunpckhpd {{.*#+}} 
xmm0 = xmm0[1],xmm1[1] -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: unpckh_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: unpckh_v4i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512VL-NEXT: retq %unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> ret <4 x i64> %unpckh } define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) { -; ALL-LABEL: unpckh_v4f64: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: unpckh_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: unpckh_v4f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512VL-NEXT: retq %unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> ret <4 x double> %unpckh } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -14,7 +14,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -30,26 +30,26 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00000010: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_00000010: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000010: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00000010: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -65,26 +65,26 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00000200: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_00000200: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000200: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00000200: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -100,26 +100,26 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00003000: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_00003000: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00003000: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00003000: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -136,8 +136,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00040000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -153,8 +153,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00500000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, 
<8 x i32> ret <8 x float> %shuffle @@ -170,8 +170,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -187,8 +187,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_70000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -213,8 +213,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_00112233: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -230,36 +230,41 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00001111: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_00001111: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00001111: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_00001111: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_81a3c5e7: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_81a3c5e7: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_81a3c5e7: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -274,14 +279,14 @@ ; ; AVX2-LABEL: shuffle_v8f32_08080808: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_08080808: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_08080808: @@ -343,30 +348,47 @@ } define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_08194c5d: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_08194c5d: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_08194c5d: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_2a3b6e7f: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_2a3b6e7f: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_2a3b6e7f: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_08192a3b: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_08192a3b: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_08192a3b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_08192a3b: ; AVX512VL: # %bb.0: @@ -389,20 +411,20 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_08991abb: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; 
AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_08991abb: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_08991abb: @@ -426,9 +448,9 @@ ; ; AVX2-LABEL: shuffle_v8f32_091b2d3f: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> -; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_091b2d3f: @@ -450,16 +472,16 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_09ab1def: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_09ab1def: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_09ab1def: @@ -473,73 +495,113 @@ } define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_00014445: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_00014445: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_00014445: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_00204464: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_00204464: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: 
shuffle_v8f32_00204464: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_03004744: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_03004744: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_03004744: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_10005444: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_10005444: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_10005444: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_22006644: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_22006644: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_22006644: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_33307774: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_33307774: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_33307774: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_32107654: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_32107654: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_32107654: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_00234467: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_00234467: +; AVX1: # %bb.0: 
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_00234467: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -554,10 +616,15 @@ } define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_10325476: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_10325476: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_10325476: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -572,19 +639,29 @@ } define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_10235467: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_10235467: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_10235467: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_10225466: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_10225466: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_10225466: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -773,22 +850,22 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_c348cda0: @@ -814,20 +891,20 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,1,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_f511235a: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_f511235a: @@ -848,21 +925,21 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_32103210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_32103210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_32103210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_32103210: @@ -884,21 +961,21 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_76547654: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_76547654: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654: ; 
AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654: @@ -920,42 +997,48 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_76543210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8f32_76543210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_3210ba98: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_3210ba98: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_3210ba98: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210ba98: @@ -968,16 +1051,22 @@ } define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_3210fedc: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_3210fedc: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_3210fedc: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufd 
{{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210fedc: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc: @@ -990,11 +1079,17 @@ } define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_7654fedc: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_7654fedc: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_7654fedc: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-SLOW: # %bb.0: @@ -1012,11 +1107,17 @@ } define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_fedc7654: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_fedc7654: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_fedc7654: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-SLOW: # %bb.0: @@ -1064,16 +1165,22 @@ } define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_ba987654: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_ba987654: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_ba987654: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba987654: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654: @@ -1087,16 +1194,22 @@ } define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_ba983210: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; 
AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_ba983210: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8f32_ba983210: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba983210: @@ -1110,19 +1223,29 @@ } define <8 x float> @shuffle_v8f32_80u1c4u5(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_80u1c4u5: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_80u1c4u5: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_80u1c4u5: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_a2u3e6f7: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_a2u3e6f7: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_a2u3e6f7: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1143,14 +1266,14 @@ ; ; AVX2-LABEL: shuffle_v8f32_084c195d: ; AVX2: # %bb.0: -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_084c195d: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_084c195d: @@ -1172,7 +1295,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_01452367: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> ret <8 x float> %shuffle @@ 
-1191,9 +1314,9 @@ ; ; AVX2-LABEL: shuffle_v8f32_089abcde: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_089abcde: @@ -1230,11 +1353,17 @@ } define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_uuuu1111: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uuuu1111: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_uuuu1111: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1248,8 +1377,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8f32_44444444: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1265,11 +1394,17 @@ } define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_uuuu3210: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_uuuu3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_uuuu3210: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -1285,31 +1420,49 @@ } define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_1111uuuu: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_1111uuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_1111uuuu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) { -; ALL-LABEL: shuffle_v8f32_5555uuuu: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_5555uuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_5555uuuu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_32107654_v4f32(<4 x float> %a, <4 x float> %b) { -; ALL-LABEL: shuffle_v8f32_32107654_v4f32: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8f32_32107654_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8f32_32107654_v4f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> %3 = shufflevector <4 x float> %1, <4 x float> %2, <8 x i32> @@ -1348,7 +1501,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1364,26 +1517,26 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00000010: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_00000010: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000010: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00000010: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1399,26 +1552,26 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00000200: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_00000200: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; 
AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000200: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00000200: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1434,26 +1587,26 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00003000: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_00003000: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00003000: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00003000: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1470,8 +1623,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00040000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1487,8 +1640,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00500000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1504,8 +1657,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1521,8 +1674,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_70000000: ; 
AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1536,7 +1689,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_01014545: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1552,26 +1705,26 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00112233: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_00112233: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00112233: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1587,39 +1740,44 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00001111: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_00001111: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00001111: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_00001111: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x 
i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_81a3c5e7: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; ALL-NEXT: retq - %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> - ret <8 x i32> %shuffle -} +; AVX1-LABEL: shuffle_v8i32_81a3c5e7: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_81a3c5e7: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2OR512VL-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> + ret <8 x i32> %shuffle +} define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_08080808: @@ -1631,14 +1789,14 @@ ; ; AVX2-LABEL: shuffle_v8i32_08080808: ; AVX2: # %bb.0: -; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_08080808: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_08080808: @@ -1659,14 +1817,14 @@ ; ; AVX2-LABEL: shuffle_v8i32_08084c4c: ; AVX2: # %bb.0: -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_08084c4c: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c: @@ -1706,30 +1864,47 @@ } define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_08194c5d: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_08194c5d: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_08194c5d: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_2a3b6e7f: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_2a3b6e7f: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_2a3b6e7f: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) { -; AVX1OR2-LABEL: shuffle_v8i32_08192a3b: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_08192a3b: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08192a3b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_08192a3b: ; AVX512VL: # %bb.0: @@ -1761,11 +1936,11 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_08991abb: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_08991abb: @@ -1825,9 +2000,9 @@ ; ; AVX2-FAST-LABEL: shuffle_v8i32_09ab1def: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_09ab1def: @@ -1841,73 +2016,113 @@ } define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_00014445: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_00014445: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_00014445: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_00204464: -; ALL: # %bb.0: -; ALL-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_00204464: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_00204464: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_03004744: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_03004744: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_03004744: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_10005444: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_10005444: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_10005444: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_22006644: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_22006644: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_22006644: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_33307774: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_33307774: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_33307774: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_32107654: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_32107654: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_32107654: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: 
shuffle_v8i32_00234467: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_00234467: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_00234467: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1920,17 +2135,22 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00224466: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_10325476: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_10325476: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_10325476: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1943,26 +2163,36 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_11335577: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_10235467: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_10235467: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_10235467: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_10225466: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_10225466: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_10225466: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -1975,8 +2205,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00015444: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1990,8 +2220,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00204644: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} 
ymm1 = [0,0,2,0,4,6,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2005,8 +2235,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_03004474: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2020,8 +2250,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_10004444: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2035,8 +2265,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_22006446: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2050,8 +2280,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_33307474: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2065,8 +2295,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_32104567: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2080,8 +2310,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00236744: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2095,8 +2325,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00226644: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2110,8 +2340,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_10324567: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] 
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2125,8 +2355,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_11334567: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2140,8 +2370,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_01235467: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2155,8 +2385,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_01235466: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2170,8 +2400,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2185,8 +2415,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2200,8 +2430,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2215,8 +2445,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2230,8 +2460,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> 
ret <8 x i32> %shuffle @@ -2245,8 +2475,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = -; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2264,18 +2494,18 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,0,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_6caa87e5: @@ -2297,21 +2527,21 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_32103210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_32103210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_32103210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_32103210: @@ -2333,21 +2563,21 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_76547654: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_76547654: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654: ; AVX512VL-SLOW: 
# %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654: @@ -2369,42 +2599,48 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8i32_76543210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v8i32_76543210: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210: ; AVX512VL-FAST: # %bb.0: -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] -; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) { -; AVX1OR2-LABEL: shuffle_v8i32_3210ba98: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_3210ba98: +; AVX1: # %bb.0: +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_3210ba98: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210ba98: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_3210ba98: @@ -2417,16 +2653,22 @@ } define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) { -; AVX1OR2-LABEL: shuffle_v8i32_3210fedc: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_3210fedc: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_3210fedc: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210fedc: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc: @@ -2439,11 +2681,17 @@ } define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) { -; AVX1OR2-LABEL: shuffle_v8i32_7654fedc: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_7654fedc: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_7654fedc: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_7654fedc: ; AVX512VL-SLOW: # %bb.0: @@ -2461,11 +2709,17 @@ } define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) { -; AVX1OR2-LABEL: shuffle_v8i32_fedc7654: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_fedc7654: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_fedc7654: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_fedc7654: ; AVX512VL-SLOW: # %bb.0: @@ -2484,16 +2738,22 @@ } define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) { -; AVX1OR2-LABEL: shuffle_v8i32_ba987654: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_ba987654: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_ba987654: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba987654: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654: @@ -2507,16 +2767,22 @@ } define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) { -; AVX1OR2-LABEL: shuffle_v8i32_ba983210: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq +; 
AVX1-LABEL: shuffle_v8i32_ba983210: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_ba983210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba983210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210: @@ -2541,9 +2807,9 @@ ; ; AVX2-LABEL: shuffle_v8i32_089abcde: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_089abcde: @@ -2571,14 +2837,14 @@ ; ; AVX2-LABEL: shuffle_v8i32_0189abcd: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,2] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_0189abcd: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,2] -; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,2] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_0189abcd: @@ -2623,38 +2889,59 @@ } define <8 x i32> @shuffle_v8i32_80u1b4uu(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_80u1b4uu: -; ALL: # %bb.0: -; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_80u1b4uu: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_80u1b4uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_uuuu1111: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_uuuu1111: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_uuuu1111: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> 
@shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_2222uuuu: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_2222uuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_2222uuuu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_2A3Buuuu: -; ALL: # %bb.0: -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_2A3Buuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_2A3Buuuu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2668,8 +2955,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_44444444: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2684,8 +2971,8 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_44444444_bc: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2OR512VL-NEXT: retq %tmp0 = bitcast <8 x float> %a to <8 x i32> %tmp1 = bitcast <8 x float> %b to <8 x i32> @@ -2694,11 +2981,17 @@ } define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_5555uuuu: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_5555uuuu: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_5555uuuu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2717,10 +3010,10 @@ ; ; AVX2-LABEL: shuffle_v8i32_0dcd3f14: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,3,u,1,4> -; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,3,u,1,4> +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_0dcd3f14: @@ -2742,19 +3035,26 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_uuuuuu7u: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[3,3,3,3,7,7,7,7] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_32107654_v4i32(<4 x i32> %a, <4 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_32107654_v4i32: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_32107654_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_32107654_v4i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> @@ -2762,10 +3062,15 @@ } define <8 x float> @splat_mem_v8f32_2(float* %p) { -; ALL-LABEL: splat_mem_v8f32_2: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: splat_mem_v8f32_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: splat_mem_v8f32_2: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2OR512VL-NEXT: retq %1 = load float, float* %p %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer @@ -2781,7 +3086,7 @@ ; ; AVX2OR512VL-LABEL: splat_v8f32: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX2OR512VL-NEXT: retq %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer ret <8 x float> %1 @@ -2884,19 +3189,29 @@ } define <8 x i32> @shuffle_v8i32_30127456(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_30127456: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_30127456: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_30127456: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } define <8 x i32> @shuffle_v8i32_12305674(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: shuffle_v8i32_12305674: -; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_12305674: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v8i32_12305674: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -2944,10 +3259,15 @@ } define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) { -; ALL-LABEL: insert_mem_and_zero_v8i32: -; ALL: # %bb.0: -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: retq +; AVX1-LABEL: insert_mem_and_zero_v8i32: 
+; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_mem_and_zero_v8i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2OR512VL-NEXT: retq %a = load i32, i32* %ptr %v = insertelement <8 x i32> undef, i32 %a, i32 0 %shuffle = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer, <8 x i32> @@ -2955,10 +3275,15 @@ } define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: concat_v8i32_0123CDEF: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: retq +; AVX1-LABEL: concat_v8i32_0123CDEF: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: concat_v8i32_0123CDEF: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2OR512VL-NEXT: retq %alo = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> %bhi = shufflevector <8 x i32> %b, <8 x i32> undef, <4 x i32> %shuf = shufflevector <4 x i32> %alo, <4 x i32> %bhi, <8 x i32> @@ -2966,15 +3291,15 @@ } define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) { -; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: concat_v8i32_4567CDEF_bc: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq ; -; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: concat_v8i32_4567CDEF_bc: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2OR512VL-NEXT: retq %a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> %a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> %bc0hi = bitcast <4 x i32> %a0hi to <2 x i64> @@ -2985,10 +3310,20 @@ } define <8 x float> @concat_v8f32_4567CDEF_bc(<8 x float> %f0, <8 x float> %f1) { -; ALL-LABEL: concat_v8f32_4567CDEF_bc: -; ALL: # %bb.0: -; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: retq +; AVX1-LABEL: concat_v8f32_4567CDEF_bc: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_v8f32_4567CDEF_bc: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: concat_v8f32_4567CDEF_bc: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: retq %a0 = bitcast <8 x float> %f0 to <4 x i64> %a1 = bitcast <8 x float> %f1 to <8 x i32> %a0hi = shufflevector <4 x i64> %a0, <4 x i64> undef, <2 x i32> @@ -3001,10 +3336,15 @@ } define <8 x i32> @insert_dup_mem_v8i32(i32* %ptr) { -; ALL-LABEL: insert_dup_mem_v8i32: -; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss (%rdi), %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: insert_dup_mem_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: insert_dup_mem_v8i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2OR512VL-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <8 x i32> zeroinitializer @@ -3043,8 +3383,8 @@ ; ; AVX2-LABEL: 
shuffle_v8i32_12345670: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_12345670: @@ -3195,11 +3535,17 @@ ; PR40434: https://bugs.llvm.org/show_bug.cgi?id=40434 define <8 x i32> @unpckh_v8i32(<8 x i32> %x, <8 x i32> %y) { -; ALL-LABEL: unpckh_v8i32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckh_v8i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq %unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> ret <8 x i32> %unpckh } @@ -3207,11 +3553,17 @@ ; Same as above but with floats. define <8 x float> @unpckh_v8f32(<8 x float> %x, <8 x float> %y) { -; ALL-LABEL: unpckh_v8f32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckh_v8f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq %unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> ret <8 x float> %unpckh } @@ -3219,11 +3571,17 @@ ; Alternate form of the above - make sure we don't have conflicting transforms. define <8 x i32> @blend_perm_v8i32(<8 x i32> %x, <8 x i32> %y) { -; ALL-LABEL: blend_perm_v8i32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: blend_perm_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: blend_perm_v8i32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq %unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> %r = shufflevector <8 x i32> %unpckh, <8 x i32> undef, <8 x i32> ret <8 x i32> %r @@ -3232,11 +3590,17 @@ ; Same as above but with floats. 
define <8 x float> @blend_perm_v8f32(<8 x float> %x, <8 x float> %y) { -; ALL-LABEL: blend_perm_v8f32: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: blend_perm_v8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: blend_perm_v8f32: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq %unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> %r = shufflevector <8 x float> %unpckh, <8 x float> undef, <8 x i32> ret <8 x float> %r @@ -3245,11 +3609,17 @@ ; Another variation of the above - make sure we don't have conflicting transforms. define <8 x i32> @unpckh_v8i32_unary(<8 x i32> %x) { -; ALL-LABEL: unpckh_v8i32_unary: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_v8i32_unary: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckh_v8i32_unary: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> ret <8 x i32> %r } @@ -3257,11 +3627,17 @@ ; Same as above but with floats. define <8 x float> @unpckh_v8f32_unary(<8 x float> %x) { -; ALL-LABEL: unpckh_v8f32_unary: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; ALL-NEXT: retq +; AVX1-LABEL: unpckh_v8f32_unary: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: unpckh_v8f32_unary: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> ret <8 x float> %r } @@ -3279,10 +3655,10 @@ ; ; AVX2-LABEL: lowhalf_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] ; AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: lowhalf_v8i32: @@ -3307,10 +3683,10 @@ ; ; AVX2-LABEL: lowhalf_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6] ; AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: lowhalf_v8f32: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll 
b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -7,7 +7,7 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -16,8 +16,8 @@ define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -26,8 +26,8 @@ define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %tmp0 = bitcast <16 x i32> %a to <16 x float> %tmp1 = bitcast <16 x i32> %b to <16 x float> @@ -38,7 +38,7 @@ define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d: ; ALL: # %bb.0: -; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -47,8 +47,8 @@ define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32> ret <16 x float> %shuffle @@ -57,7 +57,7 @@ define <16 x float> @shuffle_v16f32_vunpcklps_swap(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_vunpcklps_swap: ; ALL: # %bb.0: -; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] +; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -76,7 +76,7 @@ define <16 x i32> @shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d: ; ALL: # %bb.0: -; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %shuffle @@ -85,8 +85,8 @@ define <16 x i32> @shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %b, <16 x i32> ret <16 x i32> %shuffle @@ -95,7 +95,7 @@ define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f: ; ALL: # %bb.0: -; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; ALL-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -104,8 +104,8 @@ define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> zeroinitializer, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -141,7 +141,7 @@ define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12] +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = 
zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -150,7 +150,7 @@ define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12] +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -160,7 +160,7 @@ define <16 x float> @shuffle_v16f32_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; ALL: # %bb.0: -; ALL-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %tmp1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> %tmp2 = shufflevector <16 x float> %tmp1, <16 x float> , <16 x i32> @@ -180,7 +180,7 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %shuffle @@ -189,8 +189,8 @@ define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %shuffle @@ -199,7 +199,7 @@ define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f: ; ALL: # %bb.0: -; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; ALL-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %shuffle @@ -208,8 +208,8 @@ define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32> ret <16 x i32> %shuffle @@ -233,8 +233,8 @@ define <16 x float> 
@shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> -; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> +; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> ret <16 x float> %c @@ -243,8 +243,8 @@ define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a) { ; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> -; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> +; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> ret <16 x i32> %c @@ -327,7 +327,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u: ; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c @@ -349,10 +349,10 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) { ; ALL-LABEL: test_v16i32_0_1_2_12: ; ALL: # %bb.0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vbroadcastss %xmm1, %xmm1 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; ALL-NEXT: vpbroadcastd %xmm1, %xmm1 +; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> @@ -364,8 +364,8 @@ define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) { ; ALL-LABEL: test_v16i32_0_4_8_12: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,4,8,12] -; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12] +; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq @@ -376,7 +376,7 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) { ; ALL-LABEL: shuffle_v16f32_extract_256: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 32(%rsi), %ymm0 +; ALL-NEXT: vmovdqu 32(%rsi), %ymm0 ; ALL-NEXT: retq %ptr_a = bitcast float* %a to <16 x float>* %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4 @@ -414,7 +414,7 @@ define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c @@ -423,7 +423,7 @@ define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; ALL-NEXT: vpshufd 
{{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c @@ -441,7 +441,7 @@ define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) { ; ALL-LABEL: insert_mem_and_zero_v16i32: ; ALL: # %bb.0: -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: retq %a = load i32, i32* %ptr %v = insertelement <16 x i32> undef, i32 %a, i32 0 @@ -453,8 +453,8 @@ define <16 x i32> @shuffle_v16i32_0zzzzzzzzzzzzzzz(<16 x i32> %a) { ; ALL-LABEL: shuffle_v16i32_0zzzzzzzzzzzzzzz: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32> ret <16 x i32> %shuffle @@ -463,8 +463,8 @@ define <16 x float> @shuffle_v16f32_0zzzzzzzzzzzzzzz(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_0zzzzzzzzzzzzzzz: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32> ret <16 x float> %shuffle @@ -543,8 +543,8 @@ define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) { ; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vbroadcastss %xmm0, %zmm0 +; ALL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpbroadcastd %xmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> ret <16 x float> %shuffle @@ -772,8 +772,8 @@ ; ALL-LABEL: mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -783,8 +783,8 @@ ; ALL-LABEL: mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: retq %res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> ret <16 x float> %res diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -197,17 +197,11 @@ } define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) { -; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; KNL: ## %bb.0: -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; SKX: ## %bb.0: -; SKX-NEXT: vmovaps {{.*#+}} xmm1 = 
[65535,0,0,0] -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; ALL: ## %bb.0: +; ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -107,29 +107,11 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { -; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] -; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0] -; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] -; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VBMI-NEXT: retq +; ALL-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; ALL: # %bb.0: +; ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; ALL-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> ret <64 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -5,7 +5,7 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00000000: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -14,8 +14,8 @@ define <8 x double> @shuffle_v8f64_22222222(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_22222222: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -24,8 +24,8 @@ define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_44444444: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -34,8 +34,8 @@ define <8 x double> @shuffle_v8f64_44444444_bc(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8f64_44444444_bc: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; 
ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %tmp0 = bitcast <8 x i64> %a to <8 x double> %tmp1 = bitcast <8 x i64> %b to <8 x double> @@ -46,14 +46,14 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00000010: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00000010: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -62,14 +62,14 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00000200: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00000200: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -78,14 +78,14 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00003000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00003000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -94,14 +94,14 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00040000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00040000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -110,14 +110,14 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x 
double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00500000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00500000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -126,14 +126,14 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_06000000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_06000000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -142,8 +142,8 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -152,7 +152,7 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01014545: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -161,14 +161,14 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00112233: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00112233: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -177,14 +177,14 @@ define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00001111: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00001111: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -390,7 +390,7 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00014445: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -399,7 +399,7 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00204464: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -408,7 +408,7 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_03004744: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -417,7 +417,7 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10005444: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -426,7 +426,7 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_22006644: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -435,7 +435,7 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_33307774: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -444,7 +444,7 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_32107654: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -507,14 +507,14 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00015444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00015444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -523,14 +523,14 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00204644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00204644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -539,14 +539,14 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_03004474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_03004474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -555,14 +555,14 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_10004444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_10004444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -571,14 +571,14 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_22006446: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_22006446: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: 
vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -587,14 +587,14 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_33307474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_33307474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -603,14 +603,14 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_32104567: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_32104567: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -619,14 +619,14 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00236744: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00236744: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -635,14 +635,14 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00226644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00226644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq 
%zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -687,14 +687,14 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_002u6u44: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_002u6u44: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -703,14 +703,14 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -799,7 +799,7 @@ define <8 x double> @shuffle_v8f64_23uuuuuu(<8 x double> %a0, <8 x double> %a1) { ; ALL-LABEL: shuffle_v8f64_23uuuuuu: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> ret <8 x double> %1 @@ -808,7 +808,7 @@ define <8 x double> @shuffle_v8f64_67zzzzzz(<8 x double> %a0, <8 x double> %a1) { ; ALL-LABEL: shuffle_v8f64_67zzzzzz: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x double> %a0, <8 x double> zeroinitializer, <8 x i32> ret <8 x double> %1 @@ -817,7 +817,7 @@ define <8 x double> @shuffle_v8f64_4567uuuu(<8 x double> %a0, <8 x double> %a1) { ; ALL-LABEL: shuffle_v8f64_4567uuuu: ; ALL: # %bb.0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> ret <8 x double> %1 @@ -826,7 +826,7 @@ define <8 x double> @shuffle_v8f64_4567zzzz(<8 x double> %a0, <8 x double> %a1) { ; ALL-LABEL: shuffle_v8f64_4567zzzz: ; ALL: # %bb.0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x double> %a0, <8 x double> zeroinitializer, <8 x i32> ret <8 x double> %1 @@ -835,7 +835,7 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00000000: ; ALL: # %bb.0: -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
ret <8 x i64> %shuffle @@ -844,8 +844,8 @@ define <8 x i64> @shuffle_v8i64_44444444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_44444444: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -854,8 +854,8 @@ define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_66666666: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -864,14 +864,14 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00000010: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00000010: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -880,14 +880,14 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00000200: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00000200: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -896,14 +896,14 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00003000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00003000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -912,14 +912,14 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00040000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] -; AVX512F-NEXT: vpermpd 
%zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00040000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -928,14 +928,14 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00500000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00500000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -944,14 +944,14 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_06000000: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_06000000: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -960,8 +960,8 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] -; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -970,7 +970,7 @@ define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01014545: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -980,13 +980,13 @@ define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_01014545_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] +; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem: ; AVX512F-32: # %bb.0: ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] +; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5] ; 
AVX512F-32-NEXT: retl %a = load <8 x i64>, <8 x i64>* %ptr @@ -997,14 +997,14 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00112233: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00112233: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1013,14 +1013,14 @@ define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00001111: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00001111: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1228,7 +1228,7 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00014445: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1237,7 +1237,7 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00204464: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1246,7 +1246,7 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_03004744: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1255,7 +1255,7 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10005444: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1264,7 +1264,7 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_22006644: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> 
%shuffle @@ -1273,7 +1273,7 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_33307774: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1282,7 +1282,7 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_32107654: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1291,7 +1291,7 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00234467: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1300,7 +1300,7 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00224466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1309,7 +1309,7 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1318,7 +1318,7 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_11335577: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1327,7 +1327,7 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10235467: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1336,7 +1336,7 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10225466: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6] +; ALL-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1345,14 +1345,14 @@ define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00015444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: 
shuffle_v8i64_00015444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1361,14 +1361,14 @@ define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00204644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00204644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1377,14 +1377,14 @@ define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_03004474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_03004474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1393,14 +1393,14 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_10004444: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_10004444: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1409,14 +1409,14 @@ define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_22006446: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_22006446: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0] +; AVX512F-32-NEXT: vpermq 
%zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1425,14 +1425,14 @@ define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_33307474: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_33307474: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1441,14 +1441,14 @@ define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_32104567: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_32104567: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1457,14 +1457,14 @@ define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00236744: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00236744: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1473,14 +1473,14 @@ define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00226644: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00226644: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1489,14 +1489,14 @@ define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_10324567: ; AVX512F: # 
%bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_10324567: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1505,14 +1505,14 @@ define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_11334567: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_11334567: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1521,14 +1521,14 @@ define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_01235467: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_01235467: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1537,14 +1537,14 @@ define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_01235466: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_01235466: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0] +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1553,14 +1553,14 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_002u6u44: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_002u6u44: 
; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1569,14 +1569,14 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00uu66uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00uu66uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1585,14 +1585,14 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_103245uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u> -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_103245uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u> -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u> +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1601,14 +1601,14 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_1133uu67: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7> -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_1133uu67: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0> -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0> +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1617,14 +1617,14 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_0uu354uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u> -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u> +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_0uu354uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u> -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u> +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; 
AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1633,14 +1633,14 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_uuu3uu66: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_uuu3uu66: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1667,7 +1667,7 @@ define <8 x double> @shuffle_v8f64_082a4c6e(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_082a4c6e: ; ALL: # %bb.0: -; ALL-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; ALL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -1686,7 +1686,7 @@ define <8 x i64> @shuffle_v8i64_082a4c6e(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_082a4c6e: ; ALL: # %bb.0: -; ALL-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; ALL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1695,8 +1695,8 @@ define <8 x i64> @shuffle_v8i64_z8zazcze(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_z8zazcze: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; ALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> zeroinitializer, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1705,7 +1705,7 @@ define <8 x double> @shuffle_v8f64_193b5d7f(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_193b5d7f: ; ALL: # %bb.0: -; ALL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; ALL-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -1724,7 +1724,7 @@ define <8 x i64> @shuffle_v8i64_193b5d7f(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_193b5d7f: ; ALL: # %bb.0: -; ALL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; ALL-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1733,8 +1733,8 @@ define <8 x i64> @shuffle_v8i64_1z3z5z7z(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_1z3z5z7z: ; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vunpckhpd {{.*#+}} zmm0 = 
zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> ret <8 x i64> %shuffle @@ -1858,7 +1858,7 @@ define <8 x double> @shuffle_v8f64_2301uuuu(<8 x double> %a0, <8 x double> %a1) { ; ALL-LABEL: shuffle_v8f64_2301uuuu: ; ALL: # %bb.0: -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,3,0,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> ret <8 x double> %1 @@ -1992,7 +1992,7 @@ define <8 x double> @shuffle_v8f64_012389AB(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_012389AB: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -2001,7 +2001,7 @@ define <8 x double> @shuffle_v8f64_89AB0123(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_89AB0123: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -2010,7 +2010,7 @@ define <8 x double> @shuffle_v8f64_01230123(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01230123: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -2019,7 +2019,7 @@ define <8 x i64> @shuffle_v8i64_012389AB(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_012389AB: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2028,7 +2028,7 @@ define <8 x i64> @shuffle_v8i64_89AB0123(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_89AB0123: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2037,7 +2037,7 @@ define <8 x i64> @shuffle_v8i64_01230123(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01230123: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2046,7 +2046,7 @@ define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_89234567: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -2055,7 +2055,7 @@ define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01894567: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} 
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -2064,7 +2064,7 @@ define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01238967: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -2073,7 +2073,7 @@ define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01234589: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -2082,7 +2082,7 @@ define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_89234567: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2091,7 +2091,7 @@ define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01894567: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2100,7 +2100,7 @@ define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01238967: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2109,7 +2109,7 @@ define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01234589: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; ALL-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -2118,7 +2118,7 @@ define <8 x i64> @shuffle_v8i64_23uuuuuu(<8 x i64> %a0, <8 x i64> %a1) { ; ALL-LABEL: shuffle_v8i64_23uuuuuu: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> ret <8 x i64> %1 @@ -2127,7 +2127,7 @@ define <8 x i64> @shuffle_v8i64_45zzzzzz(<8 x i64> %a0, <8 x i64> %a1) { ; ALL-LABEL: shuffle_v8i64_45zzzzzz: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i64> %a0, <8 x i64> zeroinitializer, <8 x i32> ret <8 x i64> %1 @@ -2136,7 +2136,7 @@ define <8 x i64> @shuffle_v8i64_4567uuuu(<8 x i64> %a0, <8 x i64> %a1) { ; ALL-LABEL: shuffle_v8i64_4567uuuu: ; ALL: # %bb.0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> ret <8 x i64> %1 @@ -2145,7 +2145,7 @@ define <8 x i64> @shuffle_v8i64_uu67zzzz(<8 x i64> %a0, <8 x i64> %a1) { ; ALL-LABEL: shuffle_v8i64_uu67zzzz: ; ALL: # %bb.0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; ALL-NEXT: 
ret{{[l|q]}} %1 = shufflevector <8 x i64> %a0, <8 x i64> zeroinitializer, <8 x i32> ret <8 x i64> %1 @@ -2154,8 +2154,8 @@ define <8 x double> @shuffle_v4f64_v8f64_22222222(<4 x double> %a) { ; ALL-LABEL: shuffle_v4f64_v8f64_22222222: ; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 +; ALL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> ret <8 x double> %shuffle @@ -2165,8 +2165,8 @@ ; ALL-LABEL: shuffle_v2i64_v8i64_01010101: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <2 x i64> %a, <2 x i64> undef, <8 x i32> ret <8 x i64> %shuffle @@ -2176,8 +2176,8 @@ ; ALL-LABEL: shuffle_v2f64_v8f64_01010101: ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <2 x double> %a, <2 x double> undef, <8 x i32> ret <8 x double> %shuffle @@ -2241,9 +2241,9 @@ define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) { ; ALL-LABEL: test_v8i64_2_5: ; ALL: # %bb.0: -; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; ALL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; ALL-NEXT: vzeroupper ; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> @@ -2264,7 +2264,7 @@ define <8 x i64> @test_v8i64_insert_zero_256(<8 x i64> %a) { ; ALL-LABEL: test_v8i64_insert_zero_256: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps %ymm0, %ymm0 +; ALL-NEXT: vmovdqa %ymm0, %ymm0 ; ALL-NEXT: ret{{[l|q]}} %res = shufflevector <8 x i64> %a, <8 x i64> , <8 x i32> ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -16,9 +16,9 @@ ; ; KNL-LABEL: expand: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> ret <8 x float> %res @@ -50,9 +50,9 @@ ; CHECK-LABEL: expand2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] -; CHECK-NEXT: vmovaps %xmm0, %xmm0 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a, <2 x double> 
zeroinitializer, <4 x i32> ret <4 x double> %res @@ -70,9 +70,9 @@ ; ; KNL-LABEL: expand3: ; KNL: # %bb.0: -; KNL-NEXT: vbroadcastsd %xmm0, %ymm0 -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7] +; KNL-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> ret <8 x i32> %res @@ -91,9 +91,9 @@ ; KNL-LABEL: expand4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; KNL-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] -; KNL-NEXT: vmovaps %xmm0, %xmm0 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; KNL-NEXT: vperm2i128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] +; KNL-NEXT: vmovdqa %xmm0, %xmm0 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> ret <4 x i64> %res @@ -111,9 +111,9 @@ ; ; KNL-LABEL: expand5: ; KNL: # %bb.0: -; KNL-NEXT: vbroadcastss %xmm0, %ymm0 -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; KNL-NEXT: vpbroadcastd %xmm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> ret <8 x float> %res @@ -123,8 +123,8 @@ define <8 x float> @expand6(<4 x float> %a ) { ; CHECK-LABEL: expand6: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> ret <8 x float> %res @@ -248,8 +248,8 @@ define <16 x float> @expand13(<8 x float> %a ) { ; CHECK-LABEL: expand13: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> ret <16 x float> %res @@ -284,16 +284,16 @@ ; SKX-LABEL: expand15: ; SKX: # %bb.0: ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; SKX-NEXT: vmovaps {{.*#+}} ymm1 = -; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; SKX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = +; SKX-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; SKX-NEXT: ret{{[l|q]}} ; ; KNL-LABEL: expand15: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; KNL-NEXT: ret{{[l|q]}} %addV = fadd <4 x float> , %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> @@ -617,30 +617,30 @@ define void @PR43170(<16 x float>* %a0) { ; SKX64-LABEL: PR43170: ; 
SKX64: # %bb.0: # %entry -; SKX64-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; SKX64-NEXT: vmovaps %zmm0, (%rdi) +; SKX64-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; SKX64-NEXT: vmovdqa64 %zmm0, (%rdi) ; SKX64-NEXT: vzeroupper ; SKX64-NEXT: retq ; ; KNL64-LABEL: PR43170: ; KNL64: # %bb.0: # %entry -; KNL64-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; KNL64-NEXT: vmovaps %zmm0, (%rdi) +; KNL64-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; KNL64-NEXT: vmovdqa64 %zmm0, (%rdi) ; KNL64-NEXT: retq ; ; SKX32-LABEL: PR43170: ; SKX32: # %bb.0: # %entry ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX32-NEXT: vmovaps src1, %ymm0 -; SKX32-NEXT: vmovaps %zmm0, (%eax) +; SKX32-NEXT: vmovdqa src1, %ymm0 +; SKX32-NEXT: vmovdqa64 %zmm0, (%eax) ; SKX32-NEXT: vzeroupper ; SKX32-NEXT: retl ; ; KNL32-LABEL: PR43170: ; KNL32: # %bb.0: # %entry ; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL32-NEXT: vmovaps src1, %ymm0 -; KNL32-NEXT: vmovaps %zmm0, (%eax) +; KNL32-NEXT: vmovdqa src1, %ymm0 +; KNL32-NEXT: vmovdqa64 %zmm0, (%eax) ; KNL32-NEXT: retl entry: %0 = load <8 x float>, <8 x float>* bitcast (%union1* @src1 to <8 x float>*), align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -32,24 +32,56 @@ } define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) { -; CHECK-LABEL: combine_vpermilvar_4f32_movddup: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_4f32_movddup: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_4f32_movddup: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_4f32_movddup: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 } define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) { -; X86-LABEL: combine_vpermilvar_4f32_movddup_load: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: retl +; X86-AVX1-LABEL: combine_vpermilvar_4f32_movddup_load: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX1-NEXT: retl ; -; X64-LABEL: combine_vpermilvar_4f32_movddup_load: -; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X64-NEXT: retq +; X86-AVX2-LABEL: combine_vpermilvar_4f32_movddup_load: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq (%eax), %xmm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: combine_vpermilvar_4f32_movddup_load: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpbroadcastq (%eax), %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: combine_vpermilvar_4f32_movddup_load: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermilvar_4f32_movddup_load: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: combine_vpermilvar_4f32_movddup_load: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; 
X64-AVX512-NEXT: retq %1 = load <4 x float>, <4 x float> *%a0 %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> ) ret <4 x float> %2 @@ -74,19 +106,39 @@ } define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) { -; CHECK-LABEL: combine_vpermilvar_4f32_unpckh: -; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_4f32_unpckh: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_4f32_unpckh: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_4f32_unpckh: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 } define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) { -; CHECK-LABEL: combine_vpermilvar_4f32_unpckl: -; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_4f32_unpckl: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_4f32_unpckl: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_4f32_unpckl: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 } @@ -118,12 +170,12 @@ ; ; AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> @@ -132,10 +184,15 @@ } define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { -; AVX-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; AVX-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: ; AVX512: # %bb.0: @@ -152,10 +209,20 @@ } define <4 x double> @combine_vperm2f128_vpermilvar_as_vperm2f128(<4 x double> %a0) { -; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: -; CHECK: # %bb.0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = 
zero,zero,ymm0[0,1] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: +; AVX512: # %bb.0: +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> ) @@ -163,10 +230,20 @@ } define <4 x double> @combine_vperm2f128_vpermilvar_as_vmovaps(<4 x double> %a0) { -; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vmovaps: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vperm2f128_vpermilvar_as_vmovaps: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm0, %xmm0 +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vperm2f128_vpermilvar_as_vmovaps: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm0, %xmm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vperm2f128_vpermilvar_as_vmovaps: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> ) @@ -245,10 +322,20 @@ } define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) { -; CHECK-LABEL: combine_vpermilvar_2f64_movddup: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_2f64_movddup: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_2f64_movddup: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_2f64_movddup: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> ) ret <2 x double> %1 } @@ -272,10 +359,20 @@ } define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) { -; CHECK-LABEL: combine_vpermilvar_4f32_4stage: -; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_4f32_4stage: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_4f32_4stage: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_4f32_4stage: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> ) %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %2, <4 x i32> ) @@ -284,10 +381,20 @@ } define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) { -; CHECK-LABEL: combine_vpermilvar_8f32_4stage: -; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_8f32_4stage: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: 
combine_vpermilvar_8f32_4stage: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_8f32_4stage: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> ) %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> ) @@ -306,37 +413,77 @@ } define <2 x double> @constant_fold_vpermilvar_pd() { -; CHECK-LABEL: constant_fold_vpermilvar_pd: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: constant_fold_vpermilvar_pd: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: constant_fold_vpermilvar_pd: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2.0E+0,1.0E+0] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: constant_fold_vpermilvar_pd: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [2.0E+0,1.0E+0] +; AVX512-NEXT: ret{{[l|q]}} %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> , <2 x i64> ) ret <2 x double> %1 } define <4 x double> @constant_fold_vpermilvar_pd_256() { -; CHECK-LABEL: constant_fold_vpermilvar_pd_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: constant_fold_vpermilvar_pd_256: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: constant_fold_vpermilvar_pd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: constant_fold_vpermilvar_pd_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] +; AVX512-NEXT: ret{{[l|q]}} %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> , <4 x i64> ) ret <4 x double> %1 } define <4 x float> @constant_fold_vpermilvar_ps() { -; CHECK-LABEL: constant_fold_vpermilvar_ps: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: constant_fold_vpermilvar_ps: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: constant_fold_vpermilvar_ps: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: constant_fold_vpermilvar_ps: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] +; AVX512-NEXT: ret{{[l|q]}} %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> , <4 x i32> ) ret <4 x float> %1 } define <8 x float> @constant_fold_vpermilvar_ps_256() { -; CHECK-LABEL: constant_fold_vpermilvar_ps_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: constant_fold_vpermilvar_ps_256: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: constant_fold_vpermilvar_ps_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] +; 
AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: constant_fold_vpermilvar_ps_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] +; AVX512-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> , <8 x i32> ) ret <8 x float> %1 } @@ -447,13 +594,13 @@ ; AVX2-LABEL: concat_self_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: concat_self_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512-NEXT: ret{{[l|q]}} %cat = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> %s = shufflevector <4 x i64> %cat, <4 x i64> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -12,7 +12,7 @@ define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_pslldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> @@ -22,7 +22,7 @@ define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_psrldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> @@ -116,8 +116,8 @@ define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { ; CHECK-LABEL: combine_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) @@ -128,8 +128,8 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) { ; CHECK-LABEL: combine_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) @@ -140,7 +140,7 @@ define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) { ; CHECK-LABEL: combine_permq_pshufb_as_vmovaps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> %2 = bitcast <4 x i64> %1 to <32 x i8> @@ -151,8 +151,8 @@ define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) { ; CHECK-LABEL: combine_permq_pshufb_as_vpblendd: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> %2 = bitcast <4 x i64> %1 to <32 x i8> @@ -244,7 +244,7 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) { ; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> ) ret <16 x i8> %1 @@ -271,7 +271,7 @@ define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) { ; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = bitcast <4 x float> %a to <16 x i8> %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) @@ -282,7 +282,7 @@ define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) { ; CHECK-LABEL: combine_permps_as_vpbroadcastss256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer) @@ -292,7 +292,7 @@ define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) { ; CHECK-LABEL: combine_permps_as_vpbroadcastsd256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> %2 = bitcast <4 x double> %1 to <8 x float> @@ -324,7 +324,7 @@ define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) { ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer %2 = bitcast <4 x float> %1 to <16 x i8> @@ -336,7 +336,7 @@ define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) { ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer) @@ -346,7 +346,7 @@ define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) { ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer %2 = bitcast <4 x double> %1 to <8 x float> @@ -358,7 +358,7 @@ define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) { ; CHECK-LABEL: combine_permd_as_permq: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> ) ret <8 x i32> %1 @@ -367,7 +367,7 @@ define <8 x float> @combine_permps_as_permpd(<8 x float> %a) { ; 
CHECK-LABEL: combine_permps_as_permpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,0,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> ) ret <8 x float> %1 @@ -422,8 +422,8 @@ define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) { ; CHECK-LABEL: combine_pshufb_as_vzmovl_32: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: ret{{[l|q]}} %1 = bitcast <8 x float> %a0 to <32 x i8> %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> ) @@ -543,12 +543,12 @@ define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) { ; X86-LABEL: combine_psrlw_pshufb: ; X86: # %bb.0: -; X86-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_psrlw_pshufb: ; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq %1 = lshr <16 x i16> %a0, %2 = bitcast <16 x i16> %1 to <32 x i8> @@ -559,12 +559,12 @@ define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) { ; X86-LABEL: combine_pslld_pshufb: ; X86: # %bb.0: -; X86-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_pslld_pshufb: ; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq %1 = shl <8 x i32> %a0, %2 = bitcast <8 x i32> %1 to <32 x i8> @@ -686,7 +686,7 @@ define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) { ; X86-LABEL: combine_pshufb_insertion_as_broadcast_v2i64: ; X86: # %bb.0: -; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64: @@ -703,7 +703,7 @@ define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) { ; X86-LABEL: combine_permd_insertion_as_broadcast_v4i64: ; X86: # %bb.0: -; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64: @@ -720,7 +720,7 @@ define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: combine_pshufb_pshufb_or_as_blend: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> ) @@ -742,7 +742,7 @@ define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) @@ -754,7 +754,7 @@ define <8 x i32> @constant_fold_permd() { ; CHECK-LABEL: constant_fold_permd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = 
[5,7,3,2,8,2,6,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> , <8 x i32> ) ret <8 x i32> %1 @@ -763,7 +763,7 @@ define <8 x float> @constant_fold_permps() { ; CHECK-LABEL: constant_fold_permps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> , <8 x i32> ) ret <8 x float> %1 @@ -772,7 +772,7 @@ define <32 x i8> @constant_fold_pshufb_256() { ; CHECK-LABEL: constant_fold_pshufb_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> , <32 x i8> ) ret <32 x i8> %1 @@ -782,9 +782,9 @@ ; X86-LABEL: broadcast_v2i64_multiuse: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; X86-NEXT: vextractps $2, %xmm0, %eax +; X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-NEXT: vpextrd $2, %xmm0, %eax ; X86-NEXT: addl (%ecx), %eax ; X86-NEXT: retl ; @@ -819,12 +819,12 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) { ; AVX2-LABEL: PR34577: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: PR34577: @@ -848,15 +848,15 @@ define void @packss_zext_v8i1() { ; X86-LABEL: packss_zext_v8i1: ; X86: # %bb.0: -; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86-NEXT: vmovups %ymm0, (%eax) +; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %ymm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: packss_zext_v8i1: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovups %ymm0, (%rax) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqu %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %tmp0 = icmp sgt <8 x i32> undef, undef diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -519,7 +519,7 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> 
@llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) ret <16 x float> %res0 @@ -528,12 +528,12 @@ ; X86-LABEL: combine_vpermt2var_16f32_vpermilps_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X86-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X64-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; X64-NEXT: retq %x0 = load <16 x float>, <16 x float> *%p0 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) @@ -627,7 +627,7 @@ define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) { ; CHECK-LABEL: combine_permvar_as_vpbroadcastd512: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer) ret <16 x i32> %1 @@ -636,7 +636,7 @@ define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) { ; CHECK-LABEL: combine_permvar_as_vpbroadcastq512: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer) ret <8 x i64> %1 @@ -645,7 +645,7 @@ define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) { ; CHECK-LABEL: combine_permvar_8i64_as_permq: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> ) ret <8 x i64> %1 @@ -689,7 +689,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) { ; CHECK-LABEL: combine_permvar_8f64_as_permpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> ) ret <8 x double> %1 @@ -788,7 +788,7 @@ define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> , <16 x float> %a1, i16 -1) ret <16 x float> %res0 @@ -797,7 +797,7 @@ define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; 
CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> , <16 x i32> %a1, i16 -1) ret <16 x i32> %res0 @@ -806,14 +806,14 @@ define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) { ; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1) %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> , <8 x double> %res0, i8 -1) @@ -823,14 +823,14 @@ define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) { ; X86-LABEL: combine_vpermt2var_8i64_as_vpermq: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_8i64_as_vpermq: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1) %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1) @@ -840,8 +840,8 @@ define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x1, i16 -1) %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> , <16 x float> %res0, i16 -1) @@ -851,8 +851,8 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1) %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x 
i32> %res0, <16 x i32> %res0, i16 -1) @@ -909,7 +909,7 @@ ; ; X64-LABEL: combine_vpermi2var_8f64_as_permpd: ; X64: # %bb.0: -; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6] +; X64-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6] ; X64-NEXT: retq %res0 = insertelement <8 x i64> , i64 %a2, i32 0 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %res0, <8 x double> %x1, i8 -1) @@ -962,7 +962,7 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) { ; X86-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64: ; X86: # %bb.0: -; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -15,10 +15,20 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vpshufb_as_zero: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vpshufb_as_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vpshufb_as_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_vpshufb_as_zero: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> ) %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> ) @@ -51,10 +61,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_as_movsd: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_as_movsd: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_as_movsd: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_as_movsd: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: retq %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = bitcast <2 x double> %1 to <16 x i8> %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> ) @@ -73,10 +93,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_as_movss: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_as_movss: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_as_movss: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_as_movss: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512F-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = bitcast <4 x float> %1 to <16 x i8> %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> ) @@ -134,11 +164,23 @@ ; SSE41-NEXT: blendps 
{{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_as_vzmovl_32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_as_vzmovl_32: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_as_vzmovl_32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_as_vzmovl_32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512F-NEXT: retq %1 = bitcast <4 x float> %a0 to <16 x i8> %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) %3 = bitcast <16 x i8> %2 to <4 x float> @@ -202,10 +244,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_pshufb_palignr: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_palignr: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_palignr: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_palignr: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512F-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) ret <16 x i8> %2 @@ -217,10 +269,20 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_pshufb_pslldq: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_pslldq: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_pslldq: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_pslldq: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %2 @@ -232,10 +294,20 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_pshufb_psrldq: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_psrldq: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_psrldq: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_psrldq: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %2 @@ -495,11 +567,23 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_pshufb_as_unpacklo_zero: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_as_unpacklo_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: 
vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_as_unpacklo_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_as_unpacklo_zero: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) ret <16 x i8> %1 } @@ -689,10 +773,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_pshufb_or_as_blend: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_pshufb_or_as_blend: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_pshufb_or_as_blend: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_pshufb_or_as_blend: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> ) %3 = or <16 x i8> %1, %2 @@ -728,12 +822,12 @@ ; ; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_pshufb_pshufb_or_pshufb: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512F-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) @@ -748,10 +842,20 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> ; SSE-NEXT: retq ; -; AVX-LABEL: constant_fold_pshufb: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> -; AVX-NEXT: retq +; AVX1-LABEL: constant_fold_pshufb: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_fold_pshufb: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> +; AVX2-NEXT: retq +; +; AVX512F-LABEL: constant_fold_pshufb: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> +; AVX512F-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,X86 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,X86,X86-AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+xop | FileCheck %s 
--check-prefixes=CHECK,X86,X86-AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX2 @@ -13,20 +13,50 @@ declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: combine_vpermil2pd_identity: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpermil2pd_identity: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps %xmm1, %xmm0 +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpermil2pd_identity: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpermil2pd_identity: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %xmm1, %xmm0 +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermil2pd_identity: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; X64-AVX2-NEXT: retq %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> , i8 0) %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x i64> , i8 0) ret <2 x double> %res1 } define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) { -; CHECK-LABEL: combine_vpermil2pd256_identity: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpermil2pd256_identity: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps %ymm1, %ymm0 +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpermil2pd256_identity: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpermil2pd256_identity: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %ymm1, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermil2pd256_identity: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; X64-AVX2-NEXT: retq %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> , i8 0) %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> , i8 0) ret <4 x double> %res1 @@ -43,10 +73,25 @@ } define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: combine_vpermil2ps_identity: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpermil2ps_identity: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps %xmm1, %xmm0 +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpermil2ps_identity: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpermil2ps_identity: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %xmm1, %xmm0 +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermil2ps_identity: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; X64-AVX2-NEXT: retq %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> , i8 0) %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> , i8 0) ret <4 x float> %res1 @@ -74,10 +119,25 @@ } define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: combine_vpermil2ps256_identity: -; CHECK: # %bb.0: -; CHECK-NEXT: 
vmovaps %ymm1, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpermil2ps256_identity: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps %ymm1, %ymm0 +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpermil2ps256_identity: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpermil2ps256_identity: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %ymm1, %ymm0 +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermil2ps256_identity: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; X64-AVX2-NEXT: retq %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> , i8 0) %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> , i8 0) ret <8 x float> %res1 @@ -94,20 +154,53 @@ } define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1) { -; CHECK-LABEL: combine_vpermil2ps256_zero: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpermil2ps256_zero: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpermil2ps256_zero: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpermil2ps256_zero: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermil2ps256_zero: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> , i8 2) ret <8 x float> %res0 } define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: combine_vpermil2ps_blend_with_zero: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpermil2ps_blend_with_zero: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpermil2ps_blend_with_zero: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpermil2ps_blend_with_zero: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermil2ps_blend_with_zero: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX2-NEXT: retq %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> , i8 2) ret <4 x float> %res0 } @@ -151,20 +244,50 @@ } define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: combine_vpperm_identity: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpperm_identity: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps %xmm1, %xmm0 +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpperm_identity: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpperm_identity: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: 
vmovaps %xmm1, %xmm0 +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpperm_identity: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; X64-AVX2-NEXT: retq %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> ) %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> ) ret <16 x i8> %res1 } define <16 x i8> @combine_vpperm_zero(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: combine_vpperm_zero: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: combine_vpperm_zero: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: combine_vpperm_zero: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: combine_vpperm_zero: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpperm_zero: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> ) %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> ) %res2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res1, <16 x i8> undef, <16 x i8> ) @@ -273,12 +396,19 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) { -; X86-LABEL: buildvector_v4f32_0404: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: retl +; X86-AVX-LABEL: buildvector_v4f32_0404: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: buildvector_v4f32_0404: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: retl ; ; X64-AVX-LABEL: buildvector_v4f32_0404: ; X64-AVX: # %bb.0: @@ -325,46 +455,121 @@ } define <2 x double> @constant_fold_vpermil2pd() { -; CHECK-LABEL: constant_fold_vpermil2pd: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: constant_fold_vpermil2pd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: constant_fold_vpermil2pd: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: constant_fold_vpermil2pd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: constant_fold_vpermil2pd: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; X64-AVX2-NEXT: retq %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> , <2 x double> , <2 x i64> , i8 2) ret <2 x double> %1 } define <4 x double> @constant_fold_vpermil2pd_256() { -; CHECK-LABEL: constant_fold_vpermil2pd_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: constant_fold_vpermil2pd_256: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] +; X86-AVX-NEXT: retl +; +; 
X86-AVX2-LABEL: constant_fold_vpermil2pd_256: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: constant_fold_vpermil2pd_256: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: constant_fold_vpermil2pd_256: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] +; X64-AVX2-NEXT: retq %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> , <4 x double> , <4 x i64> , i8 2) ret <4 x double> %1 } define <4 x float> @constant_fold_vpermil2ps() { -; CHECK-LABEL: constant_fold_vpermil2ps: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: constant_fold_vpermil2ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: constant_fold_vpermil2ps: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: constant_fold_vpermil2ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: constant_fold_vpermil2ps: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] +; X64-AVX2-NEXT: retq %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> , <4 x float> , <4 x i32> , i8 2) ret <4 x float> %1 } define <8 x float> @constant_fold_vpermil2ps_256() { -; CHECK-LABEL: constant_fold_vpermil2ps_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: constant_fold_vpermil2ps_256: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: constant_fold_vpermil2ps_256: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: constant_fold_vpermil2ps_256: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: constant_fold_vpermil2ps_256: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] +; X64-AVX2-NEXT: retq %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> , <8 x float> , <8 x i32> , i8 2) ret <8 x float> %1 } define <16 x i8> @constant_fold_vpperm() { -; CHECK-LABEL: constant_fold_vpperm: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; CHECK-NEXT: ret{{[l|q]}} +; X86-AVX-LABEL: constant_fold_vpperm: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: constant_fold_vpperm: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X86-AVX2-NEXT: retl +; +; X64-AVX-LABEL: constant_fold_vpperm: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X64-AVX-NEXT: retq +; +; X64-AVX2-LABEL: constant_fold_vpperm: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 
= [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X64-AVX2-NEXT: retq %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> , <16 x i8> , <16 x i8> ) ret <16 x i8> %1 } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -104,7 +104,7 @@ ; ; AVX2-LABEL: combine_pshufd6: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq entry: %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) @@ -174,11 +174,17 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test1: -; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test1: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %and = and <4 x i32> %shuf1, %shuf2 @@ -192,11 +198,17 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test2: -; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test2: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %or = or <4 x i32> %shuf1, %shuf2 @@ -210,11 +222,17 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test3: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test3: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %xor = xor <4 x i32> %shuf1, %shuf2 @@ -228,11 +246,17 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test4: -; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test4: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: retq %shuf1 = shufflevector 
<4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %and = and <4 x i32> %shuf1, %shuf2 @@ -246,11 +270,17 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test5: -; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test5: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test5: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %or = or <4 x i32> %shuf1, %shuf2 @@ -264,11 +294,17 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test6: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test6: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %xor = xor <4 x i32> %shuf1, %shuf2 @@ -302,11 +338,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test1b: -; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test1b: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test1b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %and = and <4 x i32> %shuf1, %shuf2 @@ -336,11 +378,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test2b: -; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test2b: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test2b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %or = or <4 x i32> %shuf1, %shuf2 @@ -367,12 +415,19 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test3b: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test3b: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test3b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %xor = xor <4 x i32> %shuf1, %shuf2 @@ -402,11 +457,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test4b: -; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test4b: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test4b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %and = and <4 x i32> %shuf1, %shuf2 @@ -436,11 +497,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test5b: -; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test5b: +; AVX1: # %bb.0: +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test5b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %or = or <4 x i32> %shuf1, %shuf2 @@ -467,12 +534,19 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test6b: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test6b: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test6b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %xor = xor <4 x i32> %shuf1, %shuf2 @@ -625,10 +699,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test1: -; AVX: # %bb.0: -; 
AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -640,10 +719,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test2: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -655,10 +739,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test3: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test3: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -677,7 +766,7 @@ ; ; AVX2-LABEL: combine_nested_undef_test4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -690,10 +779,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test5: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test5: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test5: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -705,10 +799,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test6: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test6: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -720,10 +819,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test7: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test7: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -735,10 +839,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test8: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -750,10 +859,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test9: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test9: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test9: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -765,10 +879,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test10: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test10: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -780,10 +899,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test11: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test11: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test11: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -802,7 +926,7 @@ ; ; AVX2-LABEL: combine_nested_undef_test12: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -826,10 +950,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test14: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test14: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test14: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq 
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -873,9 +1002,9 @@ ; ; AVX2-LABEL: combine_nested_undef_test15: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -903,11 +1032,17 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test16: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -932,11 +1067,17 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test17: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test17: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -948,10 +1089,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test18: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test18: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test18: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -976,11 +1122,17 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test19: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test19: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test19: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1007,11 +1159,17 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test20: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test20: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1044,8 +1202,8 @@ ; ; AVX2-LABEL: combine_nested_undef_test21: ; AVX2: # %bb.0: -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1062,10 +1220,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test22: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test22: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test22: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1077,10 +1240,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test23: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test23: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test23: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1092,10 +1260,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test24: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test24: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1114,7 +1287,7 @@ ; ; AVX2-LABEL: combine_nested_undef_test25: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; 
AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1127,10 +1300,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test26: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test26: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test26: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1149,7 +1327,7 @@ ; ; AVX2-LABEL: combine_nested_undef_test27: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1162,10 +1340,15 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test28: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test28: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test28: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1177,10 +1360,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test1: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1204,10 +1392,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test2: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test2: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1219,10 +1412,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test3: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test3: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1234,10 +1432,15 @@ ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE-NEXT: retq ; -; AVX-LABEL: 
combine_test4: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test4: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1261,10 +1464,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test5: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test5: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test5: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1276,10 +1484,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test6: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test6: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test6: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1303,10 +1516,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test7: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test7: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1318,10 +1536,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test8: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1334,10 +1557,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test9: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test9: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test9: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1361,10 +1589,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; 
AVX-LABEL: combine_test10: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test10: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1397,10 +1630,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test12: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test12: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test12: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -1412,10 +1650,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test13: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test13: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test13: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -1427,10 +1670,15 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test14: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test14: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test14: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -1454,10 +1702,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test15: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test15: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test15: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -1490,10 +1743,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test17: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test17: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
%2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> ret <4 x i32> %2 @@ -1505,10 +1763,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test18: -; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test18: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test18: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> ret <4 x i32> %2 @@ -1520,10 +1783,15 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test19: -; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test19: +; AVX1: # %bb.0: +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test19: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> ret <4 x i32> %2 @@ -1547,10 +1815,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test20: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test20: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> ret <4 x i32> %2 @@ -1565,14 +1838,23 @@ ; SSE-NEXT: movaps %xmm2, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test21: -; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: vmovaps %xmm2, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: combine_test21: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: vmovaps %xmm2, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test21: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: vmovdqa %xmm2, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> store <4 x i32> %1, <4 x i32>* %ptr, align 16 @@ -1605,11 +1887,17 @@ ; SSE-NEXT: movups %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test23: -; AVX: # %bb.0: -; AVX-NEXT: vmovups %xmm0, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: combine_test23: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovups %xmm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test23: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1 %shuffle0 = shufflevector 
<8 x float> %v, <8 x float> undef, <2 x i32> %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> @@ -1628,10 +1916,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test1b: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test1b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test1b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,2,0] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1654,10 +1947,15 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test2b: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test2b: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test2b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1682,11 +1980,17 @@ ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test3b: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test3b: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test3b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1699,10 +2003,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test4b: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test4b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1885,10 +2194,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_blend_01: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_blend_01: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_blend_01: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle6 @@ -1914,10 +2228,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_blend_02: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = 
xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_blend_02:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_blend_02:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32>
 ret <4 x float> %shuffle6
@@ -1941,10 +2260,15 @@
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_blend_123:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_blend_123:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_blend_123:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32>
 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32>
@@ -1958,10 +2282,15 @@
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_test_movhl_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_test_movhl_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test_movhl_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32>
 ret <4 x i32> %2
@@ -1974,10 +2303,15 @@
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_test_movhl_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_test_movhl_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test_movhl_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32>
 ret <4 x i32> %2
@@ -1990,10 +2324,15 @@
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_test_movhl_3:
-; AVX: # %bb.0:
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_test_movhl_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test_movhl_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32>
 ret <4 x i32> %2
@@ -2019,10 +2358,15 @@
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test1:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32>
 ret <4 x float> %2
@@ -2034,10 +2378,15 @@
 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32>
 ret <4 x float> %2
@@ -2049,10 +2398,15 @@
 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test3:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32>
 ret <4 x float> %2
@@ -2064,10 +2418,15 @@
 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test4:
-; AVX: # %bb.0:
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32>
 ret <4 x float> %2
@@ -2089,10 +2448,15 @@
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test5:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test5:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test5:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32>
 ret <4 x float> %2
@@ -2127,10 +2491,15 @@
 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test7:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test7:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test7:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32>
 ret <4 x float> %2
@@ -2152,10 +2521,15 @@
 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32>
 ret <4 x float> %2
@@ -2201,10 +2575,15 @@
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test11:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test11:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test11:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32>
 ret <4 x float> %2
@@ -2216,10 +2595,15 @@
 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test12:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test12:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32>
 ret <4 x float> %2
@@ -2231,10 +2615,15 @@
 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test13:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test13:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test13:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32>
 ret <4 x float> %2
@@ -2246,10 +2635,15 @@
 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test14:
-; AVX: # %bb.0:
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test14:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test14:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32>
 ret <4 x float> %2
@@ -2271,10 +2665,15 @@
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: combine_undef_input_test15:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_undef_input_test15:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_undef_input_test15:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: retq
 %1 = shufflevector <4 x float> %a, <4 x float> undef,
<4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> ret <4 x float> %2 @@ -2315,10 +2714,15 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test17: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test17: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> ret <4 x float> %2 @@ -2340,10 +2744,15 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test18: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test18: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test18: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> ret <4 x float> %2 @@ -2565,11 +2974,17 @@ ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: combine_scalar_load_with_blend_with_zero: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovaps %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: combine_scalar_load_with_blend_with_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovaps %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_scalar_load_with_blend_with_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq %1 = load double, double* %a0, align 8 %2 = insertelement <2 x double> undef, double %1, i32 0 %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1 @@ -2600,10 +3015,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_constant_insertion_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constant_insertion_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_constant_insertion_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX2-NEXT: retq %a0 = insertelement <4 x float> undef, float %f, i32 0 %ret = shufflevector <4 x float> %a0, <4 x float> , <4 x i32> ret <4 x float> %ret diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; ; Unary shuffle indices from registers @@ -19,14 +19,23 @@ ; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64: -; AVX: # %bb.0: -; AVX-NEXT: andl $1, %esi -; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v2f64_v2f64_xx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v2f64_v2f64_xx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX2-NEXT: retq %x0 = extractelement <2 x double> %x, i64 %i0 %x1 = extractelement <2 x double> %x, i64 %i1 %r0 = insertelement <2 x double> undef, double %x0, i32 0 @@ -47,17 +56,29 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $1, %esi -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v2i64_v2i64_xx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v2i64_v2i64_xx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: retq %x0 = extractelement <2 x i64> %x, i32 %i0 %x1 = extractelement <2 x i64> %x, i32 %i1 %r0 = insertelement <2 x i64> undef, i64 %x0, i32 0 @@ -123,22 +144,39 @@ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX-NEXT: # kill: def $edx killed $edx def $rdx -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: andl $3, %edi -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %ecx -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX2-NEXT: retq %x0 = extractelement <4 x float> %x, i32 %i0 %x1 = extractelement <4 x float> %x, i32 %i1 %x2 = extractelement <4 x float> %x, i32 %i2 @@ -208,22 +246,39 @@ ; SSE41-NEXT: pinsrd $3, -24(%rsp,%rcx,4), %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX-NEXT: # kill: def $edx killed $edx def $rdx -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: andl $3, %edi -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %ecx -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0 +; AVX2-NEXT: retq %x0 = extractelement <4 x i32> %x, i32 %i0 %x1 = extractelement <4 x i32> %x, i32 %i1 %x2 = extractelement <4 x i32> %x, i32 %i2 @@ -354,35 +409,65 @@ ; SSE41-NEXT: pinsrw $7, -24(%rsp,%r10,2), %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX-NEXT: # kill: def $edx killed $edx def $rdx -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; AVX-NEXT: andl $7, %r10d -; AVX-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $7, %eax -; AVX-NEXT: andl $7, %edi -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: andl $7, %edx -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: andl $7, %r8d -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $7, %r9d -; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %edi -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, -24(%rsp,%r10,2), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; AVX1-NEXT: andl $7, %r10d +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $7, %eax +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: andl $7, %r8d +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $7, -24(%rsp,%r10,2), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; AVX2-NEXT: andl $7, %r10d +; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $7, %eax +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: andl $7, %r8d +; 
AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %edi +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $7, -24(%rsp,%r10,2), %xmm0, %xmm0 +; AVX2-NEXT: retq %x0 = extractelement <8 x i16> %x, i16 %i0 %x1 = extractelement <8 x i16> %x, i16 %i1 %x2 = extractelement <8 x i16> %x, i16 %i2 @@ -625,59 +710,113 @@ ; SSE41-NEXT: pinsrb $15, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX-NEXT: # kill: def $edx killed $edx def $rdx -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movzbl -24(%rsp,%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzbl -24(%rsp,%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: andl $15, 
%esi +; AVX1-NEXT: vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r9d +; AVX1-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzbl -24(%rsp,%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, 
%xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: retq %x0 = extractelement <16 x i8> %x, i8 %i0 %x1 = extractelement <16 x i8> %x, i8 %i1 %x2 = extractelement <16 x i8> %x, i8 %i2 @@ -775,22 +914,39 @@ ; SSE41-NEXT: pinsrd $3, -24(%rsp,%rsi,4), %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: -; AVX: # %bb.0: -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: movl 4(%rdi), %ecx -; AVX-NEXT: andl $3, %eax -; AVX-NEXT: andl $3, %ecx -; AVX-NEXT: movl 8(%rdi), %edx -; AVX-NEXT: andl $3, %edx -; AVX-NEXT: movl 12(%rdi), %esi -; AVX-NEXT: andl $3, %esi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: movl 4(%rdi), %ecx +; AVX1-NEXT: andl $3, %eax +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: movl 8(%rdi), %edx +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: movl 12(%rdi), %esi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: movl 4(%rdi), %ecx +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: movl 8(%rdi), %edx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: movl 12(%rdi), %esi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0 +; AVX2-NEXT: retq %p0 = getelementptr inbounds i32, i32* %i, i64 0 %p1 = getelementptr inbounds i32, i32* %i, i64 1 %p2 = getelementptr inbounds i32, i32* %i, i64 2 @@ -1075,73 +1231,141 @@ ; SSE41-NEXT: popq %rbp ; SSE41-NEXT: retq ; -; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r15 -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %r13 -; AVX-NEXT: pushq %r12 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movzbl (%rdi), %r9d -; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: movzbl 1(%rdi), %ebx -; AVX-NEXT: movzbl 2(%rdi), %eax -; AVX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: movzbl 3(%rdi), %r11d -; AVX-NEXT: movzbl 4(%rdi), %r14d -; AVX-NEXT: movzbl 5(%rdi), %r15d -; AVX-NEXT: movzbl 6(%rdi), %r12d -; AVX-NEXT: movzbl 7(%rdi), %r13d -; AVX-NEXT: movzbl 8(%rdi), %r10d -; AVX-NEXT: movzbl 9(%rdi), %r8d -; AVX-NEXT: movzbl 10(%rdi), %ecx -; AVX-NEXT: movzbl 11(%rdi), %edx -; AVX-NEXT: movzbl 12(%rdi), %esi -; AVX-NEXT: movzbl 13(%rdi), %ebp -; AVX-NEXT: movzbl 14(%rdi), %eax -; AVX-NEXT: movzbl 15(%rdi), %edi -; AVX-NEXT: vmovaps %xmm0, 
-{{[0-9]+}}(%rsp) -; AVX-NEXT: movzbl -24(%rsp,%r9), %r9d -; AVX-NEXT: vmovd %r9d, %xmm0 -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: vpinsrb $1, -24(%rsp,%rbx), %xmm0, %xmm0 -; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: vpinsrb $2, -24(%rsp,%rbx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: vpinsrb $3, -24(%rsp,%r11), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r14d -; AVX-NEXT: vpinsrb $4, -24(%rsp,%r14), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r15d -; AVX-NEXT: vpinsrb $5, -24(%rsp,%r15), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r12d -; AVX-NEXT: vpinsrb $6, -24(%rsp,%r12), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r13d -; AVX-NEXT: vpinsrb $7, -24(%rsp,%r13), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: vpinsrb $8, -24(%rsp,%r10), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: vpinsrb $9, -24(%rsp,%r8), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $10, -24(%rsp,%rcx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: vpinsrb $11, -24(%rsp,%rdx), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: vpinsrb $12, -24(%rsp,%rsi), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %ebp -; AVX-NEXT: vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vpinsrb $15, -24(%rsp,%rdi), %xmm0, %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r12 -; AVX-NEXT: popq %r13 -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %r15 -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVX1-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: movzbl (%rdi), %r9d +; AVX1-NEXT: andl $15, %r9d +; AVX1-NEXT: movzbl 1(%rdi), %ebx +; AVX1-NEXT: movzbl 2(%rdi), %eax +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movzbl 3(%rdi), %r11d +; AVX1-NEXT: movzbl 4(%rdi), %r14d +; AVX1-NEXT: movzbl 5(%rdi), %r15d +; AVX1-NEXT: movzbl 6(%rdi), %r12d +; AVX1-NEXT: movzbl 7(%rdi), %r13d +; AVX1-NEXT: movzbl 8(%rdi), %r10d +; AVX1-NEXT: movzbl 9(%rdi), %r8d +; AVX1-NEXT: movzbl 10(%rdi), %ecx +; AVX1-NEXT: movzbl 11(%rdi), %edx +; AVX1-NEXT: movzbl 12(%rdi), %esi +; AVX1-NEXT: movzbl 13(%rdi), %ebp +; AVX1-NEXT: movzbl 14(%rdi), %eax +; AVX1-NEXT: movzbl 15(%rdi), %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzbl -24(%rsp,%r9), %r9d +; AVX1-NEXT: vmovd %r9d, %xmm0 +; AVX1-NEXT: andl $15, %ebx +; AVX1-NEXT: vpinsrb $1, -24(%rsp,%rbx), %xmm0, %xmm0 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: andl $15, %ebx +; AVX1-NEXT: vpinsrb $2, -24(%rsp,%rbx), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r11d +; AVX1-NEXT: vpinsrb $3, -24(%rsp,%r11), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r14d +; AVX1-NEXT: vpinsrb $4, -24(%rsp,%r14), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r15d +; AVX1-NEXT: vpinsrb $5, -24(%rsp,%r15), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r12d +; AVX1-NEXT: vpinsrb $6, -24(%rsp,%r12), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r13d +; AVX1-NEXT: vpinsrb $7, -24(%rsp,%r13), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r10d +; AVX1-NEXT: vpinsrb $8, -24(%rsp,%r10), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: vpinsrb $9, -24(%rsp,%r8), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: vpinsrb $10, -24(%rsp,%rcx), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: vpinsrb $11, -24(%rsp,%rdx), %xmm0, 
%xmm0 +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: vpinsrb $12, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %ebp +; AVX1-NEXT: vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: vpinsrb $15, -24(%rsp,%rdi), %xmm0, %xmm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movzbl (%rdi), %r9d +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: movzbl 1(%rdi), %ebx +; AVX2-NEXT: movzbl 2(%rdi), %eax +; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movzbl 3(%rdi), %r11d +; AVX2-NEXT: movzbl 4(%rdi), %r14d +; AVX2-NEXT: movzbl 5(%rdi), %r15d +; AVX2-NEXT: movzbl 6(%rdi), %r12d +; AVX2-NEXT: movzbl 7(%rdi), %r13d +; AVX2-NEXT: movzbl 8(%rdi), %r10d +; AVX2-NEXT: movzbl 9(%rdi), %r8d +; AVX2-NEXT: movzbl 10(%rdi), %ecx +; AVX2-NEXT: movzbl 11(%rdi), %edx +; AVX2-NEXT: movzbl 12(%rdi), %esi +; AVX2-NEXT: movzbl 13(%rdi), %ebp +; AVX2-NEXT: movzbl 14(%rdi), %eax +; AVX2-NEXT: movzbl 15(%rdi), %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzbl -24(%rsp,%r9), %r9d +; AVX2-NEXT: vmovd %r9d, %xmm0 +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: vpinsrb $1, -24(%rsp,%rbx), %xmm0, %xmm0 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: vpinsrb $2, -24(%rsp,%rbx), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r11d +; AVX2-NEXT: vpinsrb $3, -24(%rsp,%r11), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r14d +; AVX2-NEXT: vpinsrb $4, -24(%rsp,%r14), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r15d +; AVX2-NEXT: vpinsrb $5, -24(%rsp,%r15), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r12d +; AVX2-NEXT: vpinsrb $6, -24(%rsp,%r12), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r13d +; AVX2-NEXT: vpinsrb $7, -24(%rsp,%r13), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r10d +; AVX2-NEXT: vpinsrb $8, -24(%rsp,%r10), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpinsrb $9, -24(%rsp,%r8), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpinsrb $10, -24(%rsp,%rcx), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpinsrb $11, -24(%rsp,%rdx), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpinsrb $12, -24(%rsp,%rsi), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %ebp +; AVX2-NEXT: vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: vpinsrb $15, -24(%rsp,%rdi), %xmm0, %xmm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %p0 = getelementptr inbounds i8, i8* %i, i64 0 %p1 = getelementptr inbounds i8, i8* %i, i64 1 %p2 = getelementptr inbounds i8, i8* %i, i64 2 @@ -1264,21 +1488,37 @@ ; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX-NEXT: # kill: def $edx killed $edx def $rdx -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovaps %xmm1, 
-{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $3, %ecx -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq %x0 = extractelement <4 x float> %x, i32 %i0 %x1 = extractelement <4 x float> %x, i32 %i1 %y2 = extractelement <4 x float> %y, i32 %i2 @@ -1390,30 +1630,55 @@ ; SSE41-NEXT: pinsrw $5, -24(%rsp,%r9,2), %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $r9d killed $r9d def $r9 -; AVX-NEXT: # kill: def $r8d killed $r8d def $r8 -; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX-NEXT: # kill: def $edx killed $edx def $rdx -; AVX-NEXT: # kill: def $esi killed $esi def $rsi -; AVX-NEXT: # kill: def $edi killed $edi def $rdi -; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movzwl -40(%rsp,%rdi,2), %eax -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: andl $7, %edx -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: andl $7, %r8d -; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $7, %r9d -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: andl $7, %r8d +; AVX1-NEXT: vmovaps %xmm1, 
-{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl -40(%rsp,%rdi,2), %eax +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 +; AVX2-NEXT: retq %x0 = extractelement <8 x i16> %x, i16 %i0 %y1 = extractelement <8 x i16> %y, i16 %i1 %x2 = extractelement <8 x i16> %x, i16 %i2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -7,25 +7,45 @@ ; define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: andl $3, %edi -; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; 
AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %x0 = extractelement <4 x double> %x, i64 %i0 %x1 = extractelement <4 x double> %x, i64 %i1 %x2 = extractelement <4 x double> %x, i64 %i2 @@ -38,21 +58,37 @@ } define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: vpbroadcastq (%rsp,%rsi,8), %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %x0 = extractelement <4 x double> %x, i64 %i0 %x1 = extractelement <4 x double> %x, i64 %i1 %x2 = extractelement <4 x double> %x, i64 %i2 @@ -65,19 +101,33 @@ } define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: andl $1, %esi -; ALL-NEXT: andl $1, %edi -; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: andl $1, %edx -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: 
vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq %x0 = extractelement <2 x double> %x, i64 %i0 %x1 = extractelement <2 x double> %x, i64 %i1 %x2 = extractelement <2 x double> %x, i64 %i2 @@ -90,27 +140,49 @@ } define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: andl $3, %edi -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %x0 = extractelement <4 x i64> %x, i64 %i0 %x1 = extractelement <4 x i64> %x, i64 %i1 %x2 = extractelement <4 x i64> %x, i64 %i2 @@ -123,21 +195,37 @@ } define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: andl $3, %edi -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: movq %rbp, %rsp 
-; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %x0 = extractelement <4 x i64> %x, i64 %i0 %x1 = extractelement <4 x i64> %x, i64 %i1 %x2 = extractelement <4 x i64> %x, i64 %i2 @@ -150,21 +238,37 @@ } define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: andl $1, %edi -; ALL-NEXT: andl $1, %esi -; ALL-NEXT: andl $1, %edx -; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq %x0 = extractelement <2 x i64> %x, i64 %i0 %x1 = extractelement <2 x i64> %x, i64 %i1 %x2 = extractelement <2 x i64> %x, i64 %i2 @@ -177,41 +281,77 @@ } define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind { -; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq 
$-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: # kill: def $r9d killed $r9d def $r9 -; ALL-NEXT: # kill: def $r8d killed $r8d def $r8 -; ALL-NEXT: # kill: def $ecx killed $ecx def $rcx -; ALL-NEXT: # kill: def $edx killed $edx def $rdx -; ALL-NEXT: # kill: def $esi killed $esi def $rsi -; ALL-NEXT: # kill: def $edi killed $edi def $rdi -; ALL-NEXT: movl 24(%rbp), %r10d -; ALL-NEXT: andl $7, %r10d -; ALL-NEXT: movl 16(%rbp), %eax -; ALL-NEXT: andl $7, %eax -; ALL-NEXT: andl $7, %edi -; ALL-NEXT: andl $7, %esi -; ALL-NEXT: andl $7, %edx -; ALL-NEXT: andl $7, %ecx -; ALL-NEXT: andl $7, %r8d -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: andl $7, %r9d -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: movl 24(%rbp), %r10d +; AVX1-NEXT: andl $7, %r10d +; AVX1-NEXT: movl 16(%rbp), %eax +; AVX1-NEXT: andl $7, %eax +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: andl $7, %r8d +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: movl 24(%rbp), %r10d +; AVX2-NEXT: andl $7, %r10d +; AVX2-NEXT: movl 16(%rbp), %eax +; AVX2-NEXT: andl $7, %eax +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: 
vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %x0 = extractelement <8 x float> %x, i32 %i0 %x1 = extractelement <8 x float> %x, i32 %i1 %x2 = extractelement <8 x float> %x, i32 %i2 @@ -232,35 +372,65 @@ } define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind { -; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $r9d killed $r9d def $r9 -; ALL-NEXT: # kill: def $r8d killed $r8d def $r8 -; ALL-NEXT: # kill: def $ecx killed $ecx def $rcx -; ALL-NEXT: # kill: def $edx killed $edx def $rdx -; ALL-NEXT: # kill: def $esi killed $esi def $rsi -; ALL-NEXT: # kill: def $edi killed $edi def $rdi -; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; ALL-NEXT: andl $3, %r10d -; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax -; ALL-NEXT: andl $3, %eax -; ALL-NEXT: andl $3, %edi -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: andl $3, %r8d -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: andl $3, %r9d -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX1-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX1-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX1-NEXT: # kill: def $edx killed $edx def $rdx +; AVX1-NEXT: # kill: def $esi killed $esi def $rsi +; AVX1-NEXT: # kill: def $edi killed $edi def $rdi +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; AVX1-NEXT: andl $3, %r10d +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: andl $3, %eax +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: andl $3, %r8d +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andl $3, %r9d +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32: 
+; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $r9d killed $r9d def $r9 +; AVX2-NEXT: # kill: def $r8d killed $r8d def $r8 +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: # kill: def $edx killed $edx def $rdx +; AVX2-NEXT: # kill: def $esi killed $esi def $rsi +; AVX2-NEXT: # kill: def $edi killed $edi def $rdi +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; AVX2-NEXT: andl $3, %r10d +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: andl $3, %r8d +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andl $3, %r9d +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq %x0 = extractelement <4 x float> %x, i32 %i0 %x1 = extractelement <4 x float> %x, i32 %i1 %x2 = extractelement <4 x float> %x, i32 %i2 @@ -356,7 +526,7 @@ ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: andl $15, %edi -; AVX2-NEXT: vmovaps %ymm0, (%rsp) +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) ; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: andl $15, %esi @@ -505,7 +675,7 @@ ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: andl $7, %edi -; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: andl $7, %esi @@ -591,31 +761,57 @@ ; define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind { -; ALL-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: andl $3, %eax -; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: movq 24(%rdi), %rsi -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: movq 8(%rdi), %rcx +; AVX1-NEXT: andl $3, %eax +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: 
vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %p0 = getelementptr inbounds i64, i64* %i, i32 0 %p1 = getelementptr inbounds i64, i64* %i, i32 1 %p2 = getelementptr inbounds i64, i64* %i, i32 2 @@ -636,25 +832,45 @@ } define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind { -; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: andl $1, %eax -; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx -; ALL-NEXT: andl $1, %edx -; ALL-NEXT: movq 24(%rdi), %rsi -; ALL-NEXT: andl $1, %esi -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: movq 8(%rdi), %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq 
{{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq %p0 = getelementptr inbounds i64, i64* %i, i32 0 %p1 = getelementptr inbounds i64, i64* %i, i32 1 %p2 = getelementptr inbounds i64, i64* %i, i32 2 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2792,9 +2792,9 @@ ; ; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -3190,9 +3190,9 @@ ; ; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3555,9 +3555,9 @@ ; ; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -3953,9 +3953,9 @@ ; ; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4318,9 +4318,9 @@ ; ; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -4716,9 +4716,9 @@ ; ; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- 
a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -34,10 +34,10 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32: @@ -73,10 +73,10 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_ashr: @@ -116,10 +116,10 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_lshr: @@ -1315,10 +1315,10 @@ ; ; AVX2-FAST-LABEL: trunc2x4i64_8i32: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc2x4i64_8i32: @@ -1808,14 +1808,19 @@ ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: trunc16i64_16i8_const: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc16i64_16i8_const: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc16i64_16i8_const: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc16i64_16i8_const: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -1579,29 +1579,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm0 = [8,0,0,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv2i64: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv2i64: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: @@ -1618,29 +1633,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv2i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv2i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv2i64u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv2i64u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: @@ -1657,29 +1687,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i32: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i32: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] 
; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: @@ -1696,29 +1741,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i32u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i32u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: @@ -1735,29 +1795,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] -; AVX-NEXT: retq +; AVX1-LABEL: foldv8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv8i16: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv8i16: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv8i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv8i16: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; 
BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: @@ -1774,29 +1849,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] -; AVX-NEXT: retq +; AVX1-LABEL: foldv8i16u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv8i16u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv8i16u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv8i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv8i16u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: @@ -1813,29 +1903,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] -; AVX-NEXT: retq +; AVX1-LABEL: foldv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv16i8: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv16i8: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv16i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv16i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = 
[8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: @@ -1852,29 +1957,44 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] -; AVX-NEXT: retq +; AVX1-LABEL: foldv16i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv16i8u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv16i8u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv16i8u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv16i8u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -1112,103 +1112,393 @@ } define <4 x i64> @foldv4i64() nounwind { -; AVX-LABEL: foldv4i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i64: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i64: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv4i64: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv4i64: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; 
BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> , i1 0) ret <4 x i64> %out } define <4 x i64> @foldv4i64u() nounwind { -; AVX-LABEL: foldv4i64u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] -; AVX-NEXT: retq +; AVX1-LABEL: foldv4i64u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv4i64u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv4i64u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv4i64u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv4i64u: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv4i64u: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] +; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> , i1 -1) ret <4 x i64> %out } define <8 x i32> @foldv8i32() nounwind { -; ALL-LABEL: foldv8i32: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; ALL-NEXT: ret{{[l|q]}} +; AVX1-LABEL: foldv8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv8i32: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv8i32: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv8i32: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv8i32: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv8i32: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv8i32: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG-NEXT: retq +; +; X32-AVX-LABEL: foldv8i32: +; X32-AVX: # %bb.0: +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; X32-AVX-NEXT: retl 
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out } define <8 x i32> @foldv8i32u() nounwind { -; ALL-LABEL: foldv8i32u: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; ALL-NEXT: ret{{[l|q]}} +; AVX1-LABEL: foldv8i32u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv8i32u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv8i32u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv8i32u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv8i32u: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv8i32u: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv8i32u: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv8i32u: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; BITALG-NEXT: retq +; +; X32-AVX-LABEL: foldv8i32u: +; X32-AVX: # %bb.0: +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; X32-AVX-NEXT: retl %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out } define <16 x i16> @foldv16i16() nounwind { -; ALL-LABEL: foldv16i16: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; ALL-NEXT: ret{{[l|q]}} +; AVX1-LABEL: foldv16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv16i16: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv16i16: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv16i16: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv16i16: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv16i16: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv16i16: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: retq +; +; X32-AVX-LABEL: foldv16i16: +; X32-AVX: # %bb.0: +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; X32-AVX-NEXT: retl %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 0) ret <16 x i16> %out } define <16 x i16> @foldv16i16u() nounwind { -; ALL-LABEL: foldv16i16u: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; ALL-NEXT: ret{{[l|q]}} +; 
AVX1-LABEL: foldv16i16u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv16i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv16i16u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv16i16u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv16i16u: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv16i16u: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv16i16u: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv16i16u: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: retq +; +; X32-AVX-LABEL: foldv16i16u: +; X32-AVX: # %bb.0: +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; X32-AVX-NEXT: retl %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 -1) ret <16 x i16> %out } define <32 x i8> @foldv32i8() nounwind { -; ALL-LABEL: foldv32i8: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; ALL-NEXT: ret{{[l|q]}} +; AVX1-LABEL: foldv32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv32i8: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv32i8: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv32i8: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv32i8: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv32i8: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv32i8: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; BITALG-NEXT: retq +; +; X32-AVX-LABEL: foldv32i8: +; X32-AVX: # %bb.0: +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 0) ret <32 x i8> %out } define 
<32 x i8> @foldv32i8u() nounwind { -; ALL-LABEL: foldv32i8u: -; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; ALL-NEXT: ret{{[l|q]}} +; AVX1-LABEL: foldv32i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: foldv32i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: foldv32i8u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: foldv32i8u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512CD-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: foldv32i8u: +; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512VPOPCNTDQ-NEXT: retq +; +; AVX512VPOPCNTDQVL-LABEL: foldv32i8u: +; AVX512VPOPCNTDQVL: # %bb.0: +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: retq +; +; BITALG_NOVLX-LABEL: foldv32i8u: +; BITALG_NOVLX: # %bb.0: +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; BITALG_NOVLX-NEXT: retq +; +; BITALG-LABEL: foldv32i8u: +; BITALG: # %bb.0: +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; BITALG-NEXT: retq +; +; X32-AVX-LABEL: foldv32i8u: +; X32-AVX: # %bb.0: +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 -1) ret <32 x i8> %out } diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll --- a/llvm/test/CodeGen/X86/vector-width-store-merge.ll +++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll @@ -8,10 +8,10 @@ define weak_odr dso_local void @A(i8* %src, i8* %dst) local_unnamed_addr #0 { ; CHECK-LABEL: A: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %xmm0 -; CHECK-NEXT: vmovups 16(%rdi), %xmm1 -; CHECK-NEXT: vmovups %xmm1, 16(%rsi) -; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu 16(%rdi), %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 16(%rsi) +; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ; CHECK-NEXT: retq entry: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) @@ -22,14 +22,14 @@ define weak_odr dso_local void @B(i8* %src, i8* %dst) local_unnamed_addr #0 { ; CHECK-LABEL: B: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %xmm0 -; CHECK-NEXT: vmovups 16(%rdi), %xmm1 -; CHECK-NEXT: vmovups 32(%rdi), %xmm2 -; CHECK-NEXT: vmovups 48(%rdi), %xmm3 -; CHECK-NEXT: vmovups %xmm3, 48(%rsi) -; CHECK-NEXT: vmovups %xmm2, 32(%rsi) -; CHECK-NEXT: vmovups %xmm1, 16(%rsi) -; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu 16(%rdi), %xmm1 +; CHECK-NEXT: vmovdqu 32(%rdi), %xmm2 +; CHECK-NEXT: vmovdqu 48(%rdi), %xmm3 +; CHECK-NEXT: vmovdqu %xmm3, 48(%rsi) +; CHECK-NEXT: vmovdqu %xmm2, 32(%rsi) +; CHECK-NEXT: vmovdqu %xmm1, 16(%rsi) +; CHECK-NEXT: vmovdqu %xmm0, 
(%rsi) ; CHECK-NEXT: retq entry: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false) @@ -40,8 +40,8 @@ define weak_odr dso_local void @C(i8* %src, i8* %dst) local_unnamed_addr #2 { ; CHECK-LABEL: C: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %ymm0 -; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -53,10 +53,10 @@ define weak_odr dso_local void @D(i8* %src, i8* %dst) local_unnamed_addr #2 { ; CHECK-LABEL: D: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %ymm0 -; CHECK-NEXT: vmovups 32(%rdi), %ymm1 -; CHECK-NEXT: vmovups %ymm1, 32(%rsi) -; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu 32(%rdi), %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 32(%rsi) +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2078,11 +2078,23 @@ ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuf_zext_4i32_to_2i64_offset2: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuf_zext_4i32_to_2i64_offset2: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: shuf_zext_4i32_to_2i64_offset2: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: retq entry: %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> %Z = bitcast <4 x i32> %B to <2 x i64> diff --git a/llvm/test/CodeGen/X86/vector-zmov.ll b/llvm/test/CodeGen/X86/vector-zmov.ll --- a/llvm/test/CodeGen/X86/vector-zmov.ll +++ b/llvm/test/CodeGen/X86/vector-zmov.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) { ; SSE-LABEL: load_zmov_4i32_to_0zzz: @@ -11,10 +11,15 @@ ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; -; AVX-LABEL: load_zmov_4i32_to_0zzz: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: retq +; AVX1-LABEL: load_zmov_4i32_to_0zzz: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: 
vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zmov_4i32_to_0zzz: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: retq entry: %X = load <4 x i32>, <4 x i32>* %ptr %Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> @@ -27,10 +32,15 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: retq ; -; AVX-LABEL: load_zmov_2i64_to_0z: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: retq +; AVX1-LABEL: load_zmov_2i64_to_0z: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zmov_2i64_to_0z: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: retq entry: %X = load <2 x i64>, <2 x i64>* %ptr %Y = shufflevector <2 x i64> %X, <2 x i64> zeroinitializer, <2 x i32> @@ -59,12 +69,19 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: load_zmov_4i32_to_0zzz_volatile: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_zmov_4i32_to_0zzz_volatile: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: retq entry: %X = load volatile <4 x i32>, <4 x i32>* %ptr %Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll --- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -14,9 +14,9 @@ ; X86-NEXT: andl $-64, %esp ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl 456(%ebp), %esi -; X86-NEXT: vmovaps 328(%ebp), %zmm3 -; X86-NEXT: vmovaps 200(%ebp), %zmm4 -; X86-NEXT: vmovaps 72(%ebp), %zmm5 +; X86-NEXT: vmovdqa64 328(%ebp), %zmm3 +; X86-NEXT: vmovdqa64 200(%ebp), %zmm4 +; X86-NEXT: vmovdqa64 72(%ebp), %zmm5 ; X86-NEXT: vp2intersectd %zmm1, %zmm0, %k0 ; X86-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; X86-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill @@ -33,7 +33,7 @@ ; X86-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; X86-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; X86-NEXT: vzeroupper -; X86-NEXT: calll dummy +; X86-NEXT: calll dummy@PLT ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X86-NEXT: kmovw %k0, %eax @@ -71,7 +71,7 @@ ; X64-NEXT: andq $-64, %rsp ; X64-NEXT: subq $64, %rsp ; X64-NEXT: movq %rdi, %r14 -; X64-NEXT: vmovaps 16(%rbp), %zmm8 +; X64-NEXT: vmovdqa64 16(%rbp), %zmm8 ; X64-NEXT: vp2intersectd %zmm1, %zmm0, %k0 ; X64-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; X64-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -88,7 +88,7 @@ ; X64-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; X64-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; X64-NEXT: vzeroupper -; X64-NEXT: callq dummy +; X64-NEXT: callq dummy@PLT ; X64-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; X64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; X64-NEXT: kmovw %k0, %eax diff --git a/llvm/test/CodeGen/X86/vselect-2.ll b/llvm/test/CodeGen/X86/vselect-2.ll --- a/llvm/test/CodeGen/X86/vselect-2.ll +++ b/llvm/test/CodeGen/X86/vselect-2.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { ; SSE2-LABEL: test1: @@ -15,10 +15,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test1: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test1: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %select = select <4 x i1>, <4 x i32> %A, <4 x i32> %B ret <4 x i32> %select } @@ -34,10 +39,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test2: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test2: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %select = select <4 x i1>, <4 x i32> %A, <4 x i32> %B ret <4 x i32> %select } @@ -53,10 +63,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test3: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test3: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %select = select <4 x i1>, <4 x float> %A, <4 x float> %B ret <4 x float> %select } @@ -72,10 +87,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test4: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test4: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %select = select <4 x i1>, <4 x float> %A, <4 x float> %B ret <4 x float> %select } diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; 
RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; Verify that we don't emit packed vector shifts instructions if the ; condition used by the vector select is a vector of constants. @@ -19,10 +19,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test1: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: test1: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 } @@ -38,10 +43,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test2: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test2: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 } @@ -57,10 +67,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test3: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test3: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 } @@ -71,10 +86,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test4: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 } @@ -114,10 +134,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test7: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test7: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } @@ -133,10 +158,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test8: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test8: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: test8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } @@ -147,10 +177,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test9: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test9: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test9: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } @@ -195,10 +230,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test12: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test12: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test12: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } @@ -209,10 +249,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test13: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test13: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test13: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } @@ -249,10 +294,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 } @@ -263,10 +313,15 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test17: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test17: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test17: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 } @@ -282,10 +337,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test18: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test18: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test18: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 } @@ -301,10 +361,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test19: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test19: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test19: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x i32> %a, <4 x i32> %b ret <4 x i32> %1 } @@ -320,10 +385,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test20: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test20: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = select <2 x i1> , <2 x double> %a, <2 x double> %b ret <2 x double> %1 } @@ -339,10 +409,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test21: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test21: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test21: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = select <2 x i1> , <2 x i64> %a, <2 x i64> %b ret <2 x i64> %1 } @@ -359,10 +434,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test22: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test22: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test22: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 } @@ -379,10 +459,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test23: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test23: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test23: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = select <4 x i1> , <4 x i32> %a, <4 x i32> %b ret <4 x i32> %1 } @@ -398,10 +483,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test24: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test24: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %1 = select <2 x i1> , <2 x double> %a, <2 x double> %b ret <2 x double> %1 } @@ -417,10 +507,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: test25: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: test25: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test25: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %1 = select <2 x i1> , <2 x i64> %a, <2 x i64> %b ret <2 x i64> %1 } @@ -468,11 +563,17 @@ ; SSE-NEXT: movaps %xmm0, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: select_illegal: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps %ymm7, %ymm3 -; AVX-NEXT: vmovaps %ymm6, %ymm2 -; AVX-NEXT: retq +; AVX1-LABEL: select_illegal: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps %ymm7, %ymm3 +; AVX1-NEXT: vmovaps %ymm6, %ymm2 +; AVX1-NEXT: retq +; +; AVX2-LABEL: select_illegal: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-NEXT: vmovdqa %ymm6, %ymm2 +; AVX2-NEXT: retq %sel = select <16 x i1> , <16 x double> %a, <16 x double> %b ret 
<16 x double> %sel } diff --git a/llvm/test/CodeGen/X86/vzero-excess.ll b/llvm/test/CodeGen/X86/vzero-excess.ll --- a/llvm/test/CodeGen/X86/vzero-excess.ll +++ b/llvm/test/CodeGen/X86/vzero-excess.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -37,7 +37,7 @@ ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: retq @@ -54,7 +54,7 @@ ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vzeroall -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -79,7 +79,7 @@ ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vzeroall -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/widen_load-3.ll b/llvm/test/CodeGen/X86/widen_load-3.ll --- a/llvm/test/CodeGen/X86/widen_load-3.ll +++ b/llvm/test/CodeGen/X86/widen_load-3.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=X86-SSE -; RUN: llc < %s -mtriple=i686-linux -mattr=+avx | FileCheck %s --check-prefix=X86-AVX -; RUN: llc < %s -mtriple=i686-linux -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX +; RUN: llc < %s -mtriple=i686-linux -mattr=+avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1 +; RUN: llc < %s -mtriple=i686-linux -mattr=+avx2 | FileCheck %s --check-prefixes=X86-AVX,X86-AVX2 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE -; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefix=X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2 ; PR27708 @@ -25,19 +25,33 @@ ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl $4 ; -; X86-AVX-LABEL: load7_aligned: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %ymm0 -; X86-AVX-NEXT: vmovaps 48(%ecx), %xmm1 -; X86-AVX-NEXT: vextractps $1, %xmm1, 52(%eax) -; X86-AVX-NEXT: vmovss %xmm1, 48(%eax) -; X86-AVX-NEXT: vmovaps 32(%ecx), %xmm1 -; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax) -; X86-AVX-NEXT: vmovaps %ymm0, (%eax) -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl $4 +; X86-AVX1-LABEL: load7_aligned: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovaps (%ecx), %ymm0 +; X86-AVX1-NEXT: vmovaps 48(%ecx), %xmm1 +; X86-AVX1-NEXT: vextractps $1, %xmm1, 52(%eax) +; X86-AVX1-NEXT: vmovss %xmm1, 
48(%eax) +; X86-AVX1-NEXT: vmovaps 32(%ecx), %xmm1 +; X86-AVX1-NEXT: vmovaps %xmm1, 32(%eax) +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl $4 +; +; X86-AVX2-LABEL: load7_aligned: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqa (%ecx), %ymm0 +; X86-AVX2-NEXT: vmovdqa 48(%ecx), %xmm1 +; X86-AVX2-NEXT: vpextrd $1, %xmm1, 52(%eax) +; X86-AVX2-NEXT: vmovd %xmm1, 48(%eax) +; X86-AVX2-NEXT: vmovdqa 32(%ecx), %xmm1 +; X86-AVX2-NEXT: vmovdqa %xmm1, 32(%eax) +; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl $4 ; ; X64-SSE-LABEL: load7_aligned: ; X64-SSE: # %bb.0: @@ -52,17 +66,29 @@ ; X64-SSE-NEXT: movaps %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load7_aligned: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movq %rdi, %rax -; X64-AVX-NEXT: vmovaps (%rsi), %ymm0 -; X64-AVX-NEXT: movq 48(%rsi), %rcx -; X64-AVX-NEXT: movq %rcx, 48(%rdi) -; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm1 -; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi) -; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load7_aligned: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: movq %rdi, %rax +; X64-AVX1-NEXT: vmovaps (%rsi), %ymm0 +; X64-AVX1-NEXT: movq 48(%rsi), %rcx +; X64-AVX1-NEXT: movq %rcx, 48(%rdi) +; X64-AVX1-NEXT: vmovaps 32(%rsi), %xmm1 +; X64-AVX1-NEXT: vmovaps %xmm1, 32(%rdi) +; X64-AVX1-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: load7_aligned: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movq %rdi, %rax +; X64-AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; X64-AVX2-NEXT: movq 48(%rsi), %rcx +; X64-AVX2-NEXT: movq %rcx, 48(%rdi) +; X64-AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 +; X64-AVX2-NEXT: vmovdqa %xmm1, 32(%rdi) +; X64-AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %x1 = load <7 x i64>, <7 x i64>* %x ret <7 x i64> %x1 } @@ -84,20 +110,35 @@ ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl $4 ; -; X86-AVX-LABEL: load7_unaligned: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovups (%ecx), %ymm0 -; X86-AVX-NEXT: vmovups 32(%ecx), %xmm1 -; X86-AVX-NEXT: movl 48(%ecx), %edx -; X86-AVX-NEXT: movl 52(%ecx), %ecx -; X86-AVX-NEXT: movl %ecx, 52(%eax) -; X86-AVX-NEXT: movl %edx, 48(%eax) -; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax) -; X86-AVX-NEXT: vmovaps %ymm0, (%eax) -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl $4 +; X86-AVX1-LABEL: load7_unaligned: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-AVX1-NEXT: vmovups 32(%ecx), %xmm1 +; X86-AVX1-NEXT: movl 48(%ecx), %edx +; X86-AVX1-NEXT: movl 52(%ecx), %ecx +; X86-AVX1-NEXT: movl %ecx, 52(%eax) +; X86-AVX1-NEXT: movl %edx, 48(%eax) +; X86-AVX1-NEXT: vmovaps %xmm1, 32(%eax) +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl $4 +; +; X86-AVX2-LABEL: load7_unaligned: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: vmovdqu (%ecx), %ymm0 +; X86-AVX2-NEXT: vmovdqu 32(%ecx), %xmm1 +; X86-AVX2-NEXT: movl 48(%ecx), %edx +; X86-AVX2-NEXT: movl 52(%ecx), %ecx +; X86-AVX2-NEXT: movl %ecx, 52(%eax) +; X86-AVX2-NEXT: movl %edx, 48(%eax) +; X86-AVX2-NEXT: vmovdqa %xmm1, 32(%eax) 
+; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl $4 ; ; X64-SSE-LABEL: load7_unaligned: ; X64-SSE: # %bb.0: @@ -112,17 +153,29 @@ ; X64-SSE-NEXT: movaps %xmm0, (%rdi) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load7_unaligned: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movq %rdi, %rax -; X64-AVX-NEXT: vmovups (%rsi), %ymm0 -; X64-AVX-NEXT: vmovups 32(%rsi), %xmm1 -; X64-AVX-NEXT: movq 48(%rsi), %rcx -; X64-AVX-NEXT: movq %rcx, 48(%rdi) -; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi) -; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load7_unaligned: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: movq %rdi, %rax +; X64-AVX1-NEXT: vmovups (%rsi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rsi), %xmm1 +; X64-AVX1-NEXT: movq 48(%rsi), %rcx +; X64-AVX1-NEXT: movq %rcx, 48(%rdi) +; X64-AVX1-NEXT: vmovaps %xmm1, 32(%rdi) +; X64-AVX1-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: load7_unaligned: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movq %rdi, %rax +; X64-AVX2-NEXT: vmovdqu (%rsi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rsi), %xmm1 +; X64-AVX2-NEXT: movq 48(%rsi), %rcx +; X64-AVX2-NEXT: movq %rcx, 48(%rdi) +; X64-AVX2-NEXT: vmovdqa %xmm1, 32(%rdi) +; X64-AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %x1 = load <7 x i64>, <7 x i64>* %x, align 1 ret <7 x i64> %x1 } @@ -141,16 +194,27 @@ ; X86-SSE-NEXT: movups %xmm1, (%eax) ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: load_split: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: vmovups (%edx), %ymm0 -; X86-AVX-NEXT: vmovups %xmm0, (%ecx) -; X86-AVX-NEXT: vextractf128 $1, %ymm0, (%eax) -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: load_split: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: vmovups (%edx), %ymm0 +; X86-AVX1-NEXT: vmovups %xmm0, (%ecx) +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, (%eax) +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: load_split: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: vmovdqu (%edx), %ymm0 +; X86-AVX2-NEXT: vmovdqu %xmm0, (%ecx) +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, (%eax) +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE-LABEL: load_split: ; X64-SSE: # %bb.0: @@ -160,13 +224,21 @@ ; X64-SSE-NEXT: movups %xmm1, (%rdx) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load_split: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups %xmm0, (%rsi) -; X64-AVX-NEXT: vextractf128 $1, %ymm0, (%rdx) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load_split: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups %xmm0, (%rsi) +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, (%rdx) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: load_split: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu %xmm0, (%rsi) +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, (%rdx) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %t256 = load <8 x float>, <8 x float>* %ld, align 1 %b128 = shufflevector <8 x float> 
%t256, <8 x float> undef, <4 x i32> store <4 x float> %b128, <4 x float>* %st1, align 1 @@ -189,18 +261,31 @@ ; X86-SSE-NEXT: movups %xmm1, (%eax,%ecx,4) ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: load_split_more: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: vmovups (%edx), %ymm0 -; X86-AVX-NEXT: movl (%ecx), %edx -; X86-AVX-NEXT: vmovups %xmm0, (%eax,%edx,4) -; X86-AVX-NEXT: movl 4(%ecx), %ecx -; X86-AVX-NEXT: vextractf128 $1, %ymm0, (%eax,%ecx,4) -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: load_split_more: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: vmovups (%edx), %ymm0 +; X86-AVX1-NEXT: movl (%ecx), %edx +; X86-AVX1-NEXT: vmovups %xmm0, (%eax,%edx,4) +; X86-AVX1-NEXT: movl 4(%ecx), %ecx +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, (%eax,%ecx,4) +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: load_split_more: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: vmovdqu (%edx), %ymm0 +; X86-AVX2-NEXT: movl (%ecx), %edx +; X86-AVX2-NEXT: vmovdqu %xmm0, (%eax,%edx,4) +; X86-AVX2-NEXT: movl 4(%ecx), %ecx +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, (%eax,%ecx,4) +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE-LABEL: load_split_more: ; X64-SSE: # %bb.0: @@ -212,15 +297,25 @@ ; X64-SSE-NEXT: movups %xmm1, (%rdx,%rax,4) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: load_split_more: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: movslq (%rsi), %rax -; X64-AVX-NEXT: vmovups %xmm0, (%rdx,%rax,4) -; X64-AVX-NEXT: movslq 4(%rsi), %rax -; X64-AVX-NEXT: vextractf128 $1, %ymm0, (%rdx,%rax,4) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: load_split_more: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: movslq (%rsi), %rax +; X64-AVX1-NEXT: vmovups %xmm0, (%rdx,%rax,4) +; X64-AVX1-NEXT: movslq 4(%rsi), %rax +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, (%rdx,%rax,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: load_split_more: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: movslq (%rsi), %rax +; X64-AVX2-NEXT: vmovdqu %xmm0, (%rdx,%rax,4) +; X64-AVX2-NEXT: movslq 4(%rsi), %rax +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, (%rdx,%rax,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %v.i = bitcast float* %src to <8 x float>* %tmp = load <8 x float>, <8 x float>* %v.i, align 1 %tmp1 = load i32, i32* %idx, align 4 diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll --- a/llvm/test/CodeGen/X86/widened-broadcast.ll +++ b/llvm/test/CodeGen/X86/widened-broadcast.ll @@ -19,10 +19,20 @@ ; SSE42-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] ; SSE42-NEXT: retq ; -; AVX-LABEL: load_splat_4f32_4f32_0101: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_4f32_4f32_0101: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_4f32_4f32_0101: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_4f32_4f32_0101: 
+; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX512-NEXT: retq entry: %ld = load <4 x float>, <4 x float>* %ptr %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> @@ -43,10 +53,20 @@ ; SSE42-NEXT: movapd %xmm0, %xmm1 ; SSE42-NEXT: retq ; -; AVX-LABEL: load_splat_8f32_4f32_01010101: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_8f32_4f32_01010101: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_8f32_4f32_01010101: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_8f32_4f32_01010101: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <4 x float>, <4 x float>* %ptr %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> @@ -67,10 +87,20 @@ ; SSE42-NEXT: movapd %xmm0, %xmm1 ; SSE42-NEXT: retq ; -; AVX-LABEL: load_splat_8f32_8f32_01010101: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_8f32_8f32_01010101: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_8f32_8f32_01010101: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_8f32_8f32_01010101: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <8 x float>, <8 x float>* %ptr %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> @@ -90,12 +120,12 @@ ; ; AVX2-LABEL: load_splat_4i32_4i32_0101: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_splat_4i32_4i32_0101: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %ld = load <4 x i32>, <4 x i32>* %ptr @@ -110,10 +140,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_8i32_4i32_01010101: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_8i32_4i32_01010101: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_8i32_4i32_01010101: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_8i32_4i32_01010101: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <4 x i32>, <4 x i32>* %ptr %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> @@ -127,10 +167,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_8i32_8i32_01010101: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_8i32_8i32_01010101: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_8i32_8i32_01010101: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_8i32_8i32_01010101: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <8 x i32>, <8 x i32>* %ptr %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> @@ 
-150,12 +200,12 @@ ; ; AVX2-LABEL: load_splat_8i16_8i16_01010101: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_splat_8i16_8i16_01010101: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %ld = load <8 x i16>, <8 x i16>* %ptr @@ -176,12 +226,12 @@ ; ; AVX2-LABEL: load_splat_8i16_8i16_01230123: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_splat_8i16_8i16_01230123: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %ld = load <8 x i16>, <8 x i16>* %ptr @@ -196,10 +246,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_16i16_8i16_0101010101010101: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_16i16_8i16_0101010101010101: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <8 x i16>, <8 x i16>* %ptr %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> @@ -213,10 +273,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_16i16_8i16_0123012301230123: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_16i16_8i16_0123012301230123: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <8 x i16>, <8 x i16>* %ptr %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> @@ -230,10 +300,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_16i16_16i16_0101010101010101: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <16 x i16>, <16 x i16>* %ptr %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> @@ -247,10 +327,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_16i16_16i16_0123012301230123: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
load_splat_16i16_16i16_0123012301230123: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_16i16_16i16_0123012301230123: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <16 x i16>, <16 x i16>* %ptr %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> @@ -298,12 +388,12 @@ ; ; AVX2-LABEL: load_splat_16i8_16i8_0123012301230123: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_splat_16i8_16i8_0123012301230123: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %ld = load <16 x i8>, <16 x i8>* %ptr @@ -324,12 +414,12 @@ ; ; AVX2-LABEL: load_splat_16i8_16i8_0123456701234567: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_splat_16i8_16i8_0123456701234567: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %ld = load <16 x i8>, <16 x i8>* %ptr @@ -374,10 +464,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <16 x i8>, <16 x i8>* %ptr %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> @@ -391,10 +491,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <16 x i8>, <16 x i8>* %ptr %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> @@ -438,10 +548,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123: +; 
AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <32 x i8>, <32 x i8>* %ptr %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> @@ -455,10 +575,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <32 x i8>, <32 x i8>* %ptr %ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> @@ -472,10 +602,20 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_4f32_8f32_0000: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_4f32_8f32_0000: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_4f32_8f32_0000: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_4f32_8f32_0000: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX512-NEXT: retq entry: %ld = load <8 x float>, <8 x float>* %ptr %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer @@ -496,10 +636,20 @@ ; SSE42-NEXT: movapd %xmm0, %xmm1 ; SSE42-NEXT: retq ; -; AVX-LABEL: load_splat_8f32_16f32_89898989: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastsd 32(%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_8f32_16f32_89898989: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vbroadcastsd 32(%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_8f32_16f32_89898989: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastq 32(%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_8f32_16f32_89898989: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastq 32(%rdi), %ymm0 +; AVX512-NEXT: retq entry: %ld = load <16 x float>, <16 x float>* %ptr %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32> @@ -514,10 +664,20 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_4i32_2i32_0101: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_4i32_2i32_0101: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_splat_4i32_2i32_0101: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_4i32_2i32_0101: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -531,10 +691,20 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: load_splat_8i32_2i32_0101: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: load_splat_8i32_2i32_0101: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: load_splat_8i32_2i32_0101: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_splat_8i32_2i32_0101: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -558,13 +728,13 @@ ; ; AVX2-LABEL: load_splat_16i32_2i32_0101: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_splat_16i32_2i32_0101: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %zmm0 +; AVX512-NEXT: vpbroadcastq (%rdi), %zmm0 ; AVX512-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -147,35 +147,35 @@ ; ; AVX2-LABEL: store_factorf64_4: ; AVX2: # %bb.0: -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm3, 64(%rdi) -; AVX2-NEXT: vmovups %ymm4, 32(%rdi) -; AVX2-NEXT: vmovups %ymm2, (%rdi) +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_factorf64_4: ; AVX512: # %bb.0: -; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512-NEXT: vmovups %zmm1, (%rdi) +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] 
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> @@ -205,35 +205,35 @@ ; ; AVX2-LABEL: store_factori64_4: ; AVX2: # %bb.0: -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm3, 64(%rdi) -; AVX2-NEXT: vmovups %ymm4, 32(%rdi) -; AVX2-NEXT: vmovups %ymm2, (%rdi) +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_factori64_4: ; AVX512: # %bb.0: -; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512-NEXT: vmovups %zmm1, (%rdi) +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) ; AVX512-NEXT: 
vzeroupper ; AVX512-NEXT: retq %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> @@ -1659,20 +1659,20 @@ ; ; AVX2-LABEL: splat2_v4f64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 32(%rsi) -; AVX2-NEXT: vmovups %ymm1, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat2_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, <4 x double>* %s, align 8 @@ -1697,20 +1697,20 @@ ; ; AVX2-LABEL: splat2_v4i64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 32(%rsi) -; AVX2-NEXT: vmovups %ymm1, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat2_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, <4 x i64>* %s, align 8 @@ -1743,20 +1743,20 @@ ; ; AVX2-LABEL: splat4_v8f32_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups (%rdi), %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 96(%rsi) -; AVX2-NEXT: vmovups %ymm3, 64(%rsi) -; AVX2-NEXT: vmovups %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups %ymm2, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu (%rdi), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi) +; AVX2-NEXT: 
vmovdqu %ymm1, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1803,20 +1803,20 @@ ; ; AVX2-LABEL: splat4_v8i32_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups (%rdi), %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 96(%rsi) -; AVX2-NEXT: vmovups %ymm3, 64(%rsi) -; AVX2-NEXT: vmovups %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups %ymm2, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu (%rdi), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1855,27 +1855,27 @@ ; ; AVX2-LABEL: splat4_v4f64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, 96(%rsi) -; AVX2-NEXT: vmovups %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups %ymm2, 32(%rsi) -; AVX2-NEXT: vmovups %ymm0, (%rsi) +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 16(%rdi), %ymm1 +; AVX2-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat4_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: vpbroadcastq 16(%rdi), %ymm1 +; AVX512-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX512-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, <4 x double>* %s, align 8 @@ -1902,27 +1902,27 @@ ; ; AVX2-LABEL: splat4_v4i64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, 96(%rsi) -; AVX2-NEXT: vmovups %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups %ymm2, 32(%rsi) 
-; AVX2-NEXT: vmovups %ymm0, (%rsi) +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 16(%rdi), %ymm1 +; AVX2-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat4_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: vpbroadcastq 16(%rdi), %ymm1 +; AVX512-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX512-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, <4 x i64>* %s, align 8 diff --git a/llvm/test/CodeGen/X86/x86-interrupt_cc.ll b/llvm/test/CodeGen/X86/x86-interrupt_cc.ll --- a/llvm/test/CodeGen/X86/x86-interrupt_cc.ll +++ b/llvm/test/CodeGen/X86/x86-interrupt_cc.ll @@ -29,86 +29,86 @@ ; CHECK64-KNL-NEXT: .cfi_def_cfa_offset 80 ; CHECK64-KNL-NEXT: subq $2096, %rsp ## encoding: [0x48,0x81,0xec,0x30,0x08,0x00,0x00] ; CHECK64-KNL-NEXT: ## imm = 0x830 -; CHECK64-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xbc,0x24,0x2e,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xb4,0x24,0x2c,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xac,0x24,0x2a,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xa4,0x24,0x28,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k3, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x9c,0x24,0x26,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k2, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x94,0x24,0x24,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x8c,0x24,0x22,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x84,0x24,0x20,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x7c,0x24,0x1f] -; 
CHECK64-KNL-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x74,0x24,0x1e] -; CHECK64-KNL-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x6c,0x24,0x1d] -; CHECK64-KNL-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x64,0x24,0x1c] -; CHECK64-KNL-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x5c,0x24,0x1b] -; CHECK64-KNL-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x54,0x24,0x1a] -; CHECK64-KNL-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x4c,0x24,0x19] -; CHECK64-KNL-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x44,0x24,0x18] -; CHECK64-KNL-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x7c,0x24,0x17] -; CHECK64-KNL-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x74,0x24,0x16] -; CHECK64-KNL-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x6c,0x24,0x15] -; CHECK64-KNL-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x64,0x24,0x14] -; CHECK64-KNL-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x5c,0x24,0x13] -; CHECK64-KNL-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x54,0x24,0x12] -; CHECK64-KNL-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x4c,0x24,0x11] -; CHECK64-KNL-NEXT: vmovups %zmm16, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x44,0x24,0x10] -; CHECK64-KNL-NEXT: vmovups %zmm15, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x7c,0x24,0x0f] -; CHECK64-KNL-NEXT: vmovups %zmm14, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x74,0x24,0x0e] -; CHECK64-KNL-NEXT: vmovups %zmm13, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x6c,0x24,0x0d] -; CHECK64-KNL-NEXT: vmovups %zmm12, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x64,0x24,0x0c] -; CHECK64-KNL-NEXT: vmovups %zmm11, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x5c,0x24,0x0b] -; CHECK64-KNL-NEXT: vmovups %zmm10, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x54,0x24,0x0a] -; CHECK64-KNL-NEXT: vmovups %zmm9, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x4c,0x24,0x09] -; CHECK64-KNL-NEXT: vmovups %zmm8, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x44,0x24,0x08] -; CHECK64-KNL-NEXT: vmovups %zmm7, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK64-KNL-NEXT: vmovups 
%zmm6, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK64-KNL-NEXT: vmovups %zmm5, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK64-KNL-NEXT: vmovups %zmm4, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK64-KNL-NEXT: vmovups %zmm3, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK64-KNL-NEXT: vmovups %zmm2, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK64-KNL-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK64-KNL-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x7c,0x24,0x1f] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x74,0x24,0x1e] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x6c,0x24,0x1d] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x64,0x24,0x1c] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x5c,0x24,0x1b] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x54,0x24,0x1a] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x4c,0x24,0x19] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x44,0x24,0x18] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x7c,0x24,0x17] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x74,0x24,0x16] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x6c,0x24,0x15] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x64,0x24,0x14] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x5c,0x24,0x13] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x54,0x24,0x12] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x4c,0x24,0x11] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x44,0x24,0x10] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) 
## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x7c,0x24,0x0f] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x74,0x24,0x0e] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x6c,0x24,0x0d] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x64,0x24,0x0c] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x5c,0x24,0x0b] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x54,0x24,0x0a] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x4c,0x24,0x09] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x44,0x24,0x08] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm0, (%rsp) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK64-KNL-NEXT: .cfi_def_cfa_offset 2176 ; CHECK64-KNL-NEXT: .cfi_offset %rcx, -80 ; CHECK64-KNL-NEXT: .cfi_offset %rdx, -72 @@ -162,85 +162,85 @@ ; CHECK64-KNL-NEXT: cld ## encoding: [0xfc] ; CHECK64-KNL-NEXT: callq _bar ## encoding: [0xe8,A,A,A,A] ; CHECK64-KNL-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: reloc_branch_4byte_pcrel -; CHECK64-KNL-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm1 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm2 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm3 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm4 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm5 ## 64-byte Reload -; 
CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm6 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm7 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm8 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x44,0x24,0x08] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm9 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x4c,0x24,0x09] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm10 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x54,0x24,0x0a] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm11 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x5c,0x24,0x0b] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm12 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x64,0x24,0x0c] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm13 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x6c,0x24,0x0d] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm14 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x74,0x24,0x0e] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm15 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x7c,0x24,0x0f] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm16 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x44,0x24,0x10] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x4c,0x24,0x11] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x54,0x24,0x12] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x5c,0x24,0x13] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x64,0x24,0x14] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x6c,0x24,0x15] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x74,0x24,0x16] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x7c,0x24,0x17] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x44,0x24,0x18] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x4c,0x24,0x19] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x54,0x24,0x1a] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x5c,0x24,0x1b] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x64,0x24,0x1c] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; 
CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x6c,0x24,0x1d] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x74,0x24,0x1e] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x7c,0x24,0x1f] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK64-KNL-NEXT: vmovdqu64 (%rsp), %zmm0 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x44,0x24,0x08] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x4c,0x24,0x09] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x54,0x24,0x0a] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x5c,0x24,0x0b] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x64,0x24,0x0c] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x6c,0x24,0x0d] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x74,0x24,0x0e] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x7c,0x24,0x0f] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x44,0x24,0x10] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x4c,0x24,0x11] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x54,0x24,0x12] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: 
[0x62,0xe1,0xfe,0x48,0x6f,0x5c,0x24,0x13] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x64,0x24,0x14] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x6c,0x24,0x15] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x74,0x24,0x16] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x7c,0x24,0x17] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x44,0x24,0x18] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x4c,0x24,0x19] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x54,0x24,0x1a] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x5c,0x24,0x1b] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x64,0x24,0x1c] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x6c,0x24,0x1d] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x74,0x24,0x1e] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x7c,0x24,0x1f] +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x84,0x24,0x20,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x8c,0x24,0x22,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x94,0x24,0x24,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k3 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x9c,0x24,0x26,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xa4,0x24,0x28,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xac,0x24,0x2a,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xb4,0x24,0x2c,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload ; 
CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xbc,0x24,0x2e,0x08,0x00,0x00] ; CHECK64-KNL-NEXT: addq $2096, %rsp ## encoding: [0x48,0x81,0xc4,0x30,0x08,0x00,0x00] ; CHECK64-KNL-NEXT: ## imm = 0x830 @@ -277,86 +277,86 @@ ; CHECK64-SKX-NEXT: .cfi_def_cfa_offset 80 ; CHECK64-SKX-NEXT: subq $2160, %rsp ## encoding: [0x48,0x81,0xec,0x70,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: ## imm = 0x870 -; CHECK64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xb4,0x24,0x60,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xac,0x24,0x58,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xa4,0x24,0x50,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k3, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x9c,0x24,0x48,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k2, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x94,0x24,0x40,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k1, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x8c,0x24,0x38,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x7c,0x24,0x1f] -; CHECK64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x74,0x24,0x1e] -; CHECK64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x6c,0x24,0x1d] -; CHECK64-SKX-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x64,0x24,0x1c] -; CHECK64-SKX-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x5c,0x24,0x1b] -; CHECK64-SKX-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x54,0x24,0x1a] -; CHECK64-SKX-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x4c,0x24,0x19] -; CHECK64-SKX-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x44,0x24,0x18] -; CHECK64-SKX-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x7c,0x24,0x17] -; CHECK64-SKX-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 
64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x74,0x24,0x16] -; CHECK64-SKX-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x6c,0x24,0x15] -; CHECK64-SKX-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x64,0x24,0x14] -; CHECK64-SKX-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x5c,0x24,0x13] -; CHECK64-SKX-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x54,0x24,0x12] -; CHECK64-SKX-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x4c,0x24,0x11] -; CHECK64-SKX-NEXT: vmovups %zmm16, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x44,0x24,0x10] -; CHECK64-SKX-NEXT: vmovups %zmm15, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x7c,0x24,0x0f] -; CHECK64-SKX-NEXT: vmovups %zmm14, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x74,0x24,0x0e] -; CHECK64-SKX-NEXT: vmovups %zmm13, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x6c,0x24,0x0d] -; CHECK64-SKX-NEXT: vmovups %zmm12, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x64,0x24,0x0c] -; CHECK64-SKX-NEXT: vmovups %zmm11, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x5c,0x24,0x0b] -; CHECK64-SKX-NEXT: vmovups %zmm10, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x54,0x24,0x0a] -; CHECK64-SKX-NEXT: vmovups %zmm9, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x4c,0x24,0x09] -; CHECK64-SKX-NEXT: vmovups %zmm8, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x44,0x24,0x08] -; CHECK64-SKX-NEXT: vmovups %zmm7, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK64-SKX-NEXT: vmovups %zmm6, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK64-SKX-NEXT: vmovups %zmm5, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK64-SKX-NEXT: vmovups %zmm4, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK64-SKX-NEXT: vmovups %zmm3, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK64-SKX-NEXT: vmovups %zmm2, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK64-SKX-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK64-SKX-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x7c,0x24,0x1f] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; 
CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x74,0x24,0x1e] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x6c,0x24,0x1d] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x64,0x24,0x1c] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x5c,0x24,0x1b] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x54,0x24,0x1a] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x4c,0x24,0x19] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x44,0x24,0x18] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x7c,0x24,0x17] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x74,0x24,0x16] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x6c,0x24,0x15] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x64,0x24,0x14] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x5c,0x24,0x13] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x54,0x24,0x12] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x4c,0x24,0x11] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x44,0x24,0x10] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x7c,0x24,0x0f] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x74,0x24,0x0e] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x6c,0x24,0x0d] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x64,0x24,0x0c] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x5c,0x24,0x0b] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x54,0x24,0x0a] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x4c,0x24,0x09] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x44,0x24,0x08] +; CHECK64-SKX-NEXT: 
vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm0, (%rsp) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK64-SKX-NEXT: .cfi_def_cfa_offset 2240 ; CHECK64-SKX-NEXT: .cfi_offset %rcx, -80 ; CHECK64-SKX-NEXT: .cfi_offset %rdx, -72 @@ -411,85 +411,85 @@ ; CHECK64-SKX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK64-SKX-NEXT: callq _bar ## encoding: [0xe8,A,A,A,A] ; CHECK64-SKX-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: reloc_branch_4byte_pcrel -; CHECK64-SKX-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm1 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm2 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm3 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm4 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm5 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm6 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm7 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm8 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x44,0x24,0x08] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm9 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x4c,0x24,0x09] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm10 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x54,0x24,0x0a] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm11 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x5c,0x24,0x0b] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm12 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x64,0x24,0x0c] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm13 ## 64-byte Reload -; 
CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x6c,0x24,0x0d] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm14 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x74,0x24,0x0e] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm15 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x7c,0x24,0x0f] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm16 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x44,0x24,0x10] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x4c,0x24,0x11] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x54,0x24,0x12] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x5c,0x24,0x13] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x64,0x24,0x14] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x6c,0x24,0x15] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x74,0x24,0x16] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x7c,0x24,0x17] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x44,0x24,0x18] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x4c,0x24,0x19] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x54,0x24,0x1a] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x5c,0x24,0x1b] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x64,0x24,0x1c] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x6c,0x24,0x1d] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x74,0x24,0x1e] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x7c,0x24,0x1f] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload +; CHECK64-SKX-NEXT: vmovdqu64 (%rsp), %zmm0 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: 
[0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x44,0x24,0x08] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x4c,0x24,0x09] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x54,0x24,0x0a] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x5c,0x24,0x0b] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x64,0x24,0x0c] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x6c,0x24,0x0d] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x74,0x24,0x0e] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x7c,0x24,0x0f] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x44,0x24,0x10] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x4c,0x24,0x11] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x54,0x24,0x12] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x5c,0x24,0x13] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x64,0x24,0x14] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x6c,0x24,0x15] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x74,0x24,0x16] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x7c,0x24,0x17] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x44,0x24,0x18] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x4c,0x24,0x19] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x54,0x24,0x1a] +; CHECK64-SKX-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x5c,0x24,0x1b] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x64,0x24,0x1c] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x6c,0x24,0x1d] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x74,0x24,0x1e] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x7c,0x24,0x1f] +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x8c,0x24,0x38,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k2 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x94,0x24,0x40,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k3 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x9c,0x24,0x48,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xa4,0x24,0x50,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xac,0x24,0x58,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xb4,0x24,0x60,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xbc,0x24,0x68,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: addq $2160, %rsp ## encoding: [0x48,0x81,0xc4,0x70,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: ## imm = 0x870 @@ -514,38 +514,38 @@ ; CHECK32-KNL-NEXT: .cfi_def_cfa_offset 16 ; CHECK32-KNL-NEXT: subl $560, %esp ## encoding: [0x81,0xec,0x30,0x02,0x00,0x00] ; CHECK32-KNL-NEXT: ## imm = 0x230 -; CHECK32-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xbc,0x24,0x2e,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xb4,0x24,0x2c,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xac,0x24,0x2a,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k4, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xa4,0x24,0x28,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k3, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x9c,0x24,0x26,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k2, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x94,0x24,0x24,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k1, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x8c,0x24,0x22,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k0, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x84,0x24,0x20,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK32-KNL-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK32-KNL-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK32-KNL-NEXT: vmovups %zmm4, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK32-KNL-NEXT: vmovups %zmm3, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK32-KNL-NEXT: vmovups %zmm2, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK32-KNL-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK32-KNL-NEXT: vmovups %zmm0, (%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm0, (%esp) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK32-KNL-NEXT: .cfi_def_cfa_offset 576 ; CHECK32-KNL-NEXT: .cfi_offset %eax, -16 ; CHECK32-KNL-NEXT: .cfi_offset %ecx, -12 @@ -569,37 +569,37 @@ ; CHECK32-KNL-NEXT: cld ## 
encoding: [0xfc] ; CHECK32-KNL-NEXT: calll _bar ## encoding: [0xe8,A,A,A,A] ; CHECK32-KNL-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4 -; CHECK32-KNL-NEXT: vmovups (%esp), %zmm0 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm1 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm2 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm3 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm4 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm5 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm6 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm7 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## 2-byte Reload +; CHECK32-KNL-NEXT: vmovdqu64 (%esp), %zmm0 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x84,0x24,0x20,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x8c,0x24,0x22,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k2 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x94,0x24,0x24,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k3 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x9c,0x24,0x26,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k4 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k4 ## 
2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xa4,0x24,0x28,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k5 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k5 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xac,0x24,0x2a,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k6 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xb4,0x24,0x2c,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k7 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xbc,0x24,0x2e,0x02,0x00,0x00] ; CHECK32-KNL-NEXT: addl $560, %esp ## encoding: [0x81,0xc4,0x30,0x02,0x00,0x00] ; CHECK32-KNL-NEXT: ## imm = 0x230 @@ -618,38 +618,38 @@ ; CHECK32-SKX-NEXT: .cfi_def_cfa_offset 16 ; CHECK32-SKX-NEXT: subl $624, %esp ## encoding: [0x81,0xec,0x70,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: ## imm = 0x270 -; CHECK32-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xb4,0x24,0x60,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xac,0x24,0x58,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xa4,0x24,0x50,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k3, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k3, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x9c,0x24,0x48,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k2, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x94,0x24,0x40,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k1, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x8c,0x24,0x38,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK32-SKX-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK32-SKX-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK32-SKX-NEXT: vmovups %zmm4, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK32-SKX-NEXT: vmovups %zmm3, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: 
[0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK32-SKX-NEXT: vmovups %zmm2, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK32-SKX-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK32-SKX-NEXT: vmovups %zmm0, (%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm0, (%esp) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK32-SKX-NEXT: .cfi_def_cfa_offset 640 ; CHECK32-SKX-NEXT: .cfi_offset %eax, -16 ; CHECK32-SKX-NEXT: .cfi_offset %ecx, -12 @@ -674,37 +674,37 @@ ; CHECK32-SKX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK32-SKX-NEXT: calll _bar ## encoding: [0xe8,A,A,A,A] ; CHECK32-SKX-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4 -; CHECK32-SKX-NEXT: vmovups (%esp), %zmm0 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm1 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm2 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm3 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm4 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm5 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm6 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm7 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ## 8-byte Reload +; CHECK32-SKX-NEXT: vmovdqu64 (%esp), %zmm0 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] 
+; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k0 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x8c,0x24,0x38,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k2 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x94,0x24,0x40,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k3 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k3 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x9c,0x24,0x48,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k4 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k4 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xa4,0x24,0x50,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k5 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k5 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xac,0x24,0x58,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k6 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xb4,0x24,0x60,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k7 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xbc,0x24,0x68,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: addl $624, %esp ## encoding: [0x81,0xc4,0x70,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: ## imm = 0x270 diff --git a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll --- a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll @@ -9,8 +9,8 @@ ; CHECK-LABEL: broadcast128: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %1 = alloca <2 x i64>, align 16 %2 = bitcast <2 x i64>* %1 to i8*