diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1000,6 +1000,8 @@
     /// legal as the hook is used before type legalization.
     bool isSafeMemOpType(MVT VT) const override;
 
+    bool isMemoryAccessesFast(EVT VT, Align Alignment) const;
+
     /// Returns true if the target allows unaligned memory accesses of the
     /// specified type. Returns whether it is "fast" in the last argument.
     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2730,24 +2730,27 @@
   return true;
 }
 
+bool X86TargetLowering::isMemoryAccessesFast(EVT VT, Align Alignment) const {
+  bool IsMisaligned = (8 * Alignment.value()) % VT.getSizeInBits() != 0;
+  if (!IsMisaligned)
+    return true;
+  switch (VT.getSizeInBits()) {
+  default:
+    // 8-byte and under are always assumed to be fast.
+    return true;
+  case 128:
+    return !Subtarget.isUnalignedMem16Slow();
+  case 256:
+    return !Subtarget.isUnalignedMem32Slow();
+    // TODO: What about AVX-512 (512-bit) accesses?
+  }
+}
+
 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
     unsigned *Fast) const {
-  if (Fast) {
-    switch (VT.getSizeInBits()) {
-    default:
-      // 8-byte and under are always assumed to be fast.
-      *Fast = 1;
-      break;
-    case 128:
-      *Fast = !Subtarget.isUnalignedMem16Slow();
-      break;
-    case 256:
-      *Fast = !Subtarget.isUnalignedMem32Slow();
-      break;
-      // TODO: What about AVX-512 (512-bit) accesses?
-    }
-  }
+  if (Fast)
+    *Fast = isMemoryAccessesFast(VT, Alignment);
   // NonTemporal vector memory ops must be aligned.
   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
     // NT loads can only be vector aligned, so if its less aligned than the
@@ -2767,7 +2770,38 @@
                                            unsigned AddrSpace, Align Alignment,
                                            MachineMemOperand::Flags Flags,
                                            unsigned *Fast) const {
-  return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, Fast);
+  if (Fast)
+    *Fast = isMemoryAccessesFast(VT, Alignment);
+  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+    if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
+                                       /*Fast=*/nullptr))
+      return true;
+    // NonTemporal vector memory ops are special, and must be aligned.
+    bool IsMisaligned = (8 * Alignment.value()) % VT.getSizeInBits() != 0;
+    if (IsMisaligned)
+      return false;
+    switch (VT.getSizeInBits()) {
+    case 128:
+      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
+        return true;
+      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
+        return true;
+      return false;
+    case 256:
+      if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
+        return true;
+      if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
+        return true;
+      return false;
+    case 512:
+      if (Subtarget.hasAVX512())
+        return true;
+      return false;
+    default:
+      return false; // Don't have NonTemporal vector memory ops of this size.
+    }
+  }
+  return true;
 }
 
 /// Return the entry encoding for a jump table in the
diff --git a/llvm/test/CodeGen/X86/avx-arith.ll b/llvm/test/CodeGen/X86/avx-arith.ll
--- a/llvm/test/CodeGen/X86/avx-arith.ll
+++ b/llvm/test/CodeGen/X86/avx-arith.ll
@@ -14,9 +14,7 @@
 define <4 x double> @addpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: addpd256fold:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.5E+0,3.3999999999999999E+0,2.2999999999999998E+0,1.2E+0]
-; CHECK-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 entry:
   %add.i = fadd <4 x double> %y,
@@ -36,9 +34,7 @@
 define <8 x float> @addps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: addps256fold:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.5E+0,3.4000001E+0,2.29999995E+0,1.20000005E+0,4.5E+0,3.4000001E+0,2.29999995E+0,1.20000005E+0]
-; CHECK-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 entry:
   %add.i = fadd <8 x float> %y,
@@ -58,9 +54,7 @@
 define <4 x double> @subpd256fold(<4 x double> %y, ptr nocapture %x) nounwind uwtable readonly ssp {
 ; CHECK-LABEL: subpd256fold:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
-; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1
-; CHECK-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vsubpd (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 entry:
   %tmp2 = load <4 x double>, ptr %x, align 32
@@ -81,9 +75,7 @@
 define <8 x float> @subps256fold(<8 x float> %y, ptr nocapture %x) nounwind uwtable readonly ssp {
 ; CHECK-LABEL: subps256fold:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
-; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1
-; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vsubps (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 entry:
   %tmp2 = load <8 x float>, ptr %x, align 32
@@ -104,9 +96,7 @@
 define <4 x double> @mulpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: mulpd256fold:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.5E+0,3.3999999999999999E+0,2.2999999999999998E+0,1.2E+0]
-; CHECK-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 entry:
   %mul.i = fmul <4 x double> %y,
@@ -126,9 +116,7 @@
 define <8 x float> @mulps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: mulps256fold:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.5E+0,3.4000001E+0,2.29999995E+0,1.20000005E+0,4.5E+0,3.4000001E+0,2.29999995E+0,1.20000005E+0]
-; CHECK-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT: retq
 entry:
   %mul.i = fmul <8 x float> %y,
@@ -148,9 +136,7 @@
 define <4 x double> @divpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: divpd256fold:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.5E+0,3.3999999999999999E+0,2.2999999999999998E+0,1.2E+0]
-; CHECK-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1
-; CHECK-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq entry: %div.i = fdiv <4 x double> %y, @@ -170,9 +156,7 @@ define <8 x float> @divps256fold(<8 x float> %y) nounwind uwtable readnone ssp { ; CHECK-LABEL: divps256fold: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4.5E+0,3.4000001E+0,2.29999995E+0,1.20000005E+0,4.5E+0,3.4000001E+0,2.29999995E+0,1.20000005E+0] -; CHECK-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1 -; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vdivps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq entry: %div.i = fdiv <8 x float> %y, diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll --- a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -8,39 +8,24 @@ ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $144, %rsp +; CHECK-NEXT: subq $96, %rsp ; CHECK-NEXT: movq %rdx, %rbx ; CHECK-NEXT: movq %rsi, %r14 ; CHECK-NEXT: movq %rdi, %r15 -; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovaps (%rsi), %ymm1 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps 16(%rdi), %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %xmm2 -; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps 16(%rsi), %xmm1 -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; CHECK-NEXT: vmovaps (%rdx), %xmm3 -; CHECK-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps 16(%rdx), %xmm2 -; CHECK-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; CHECK-NEXT: vmovaps (%rdx), %ymm2 +; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; CHECK-NEXT: callq dummy@PLT ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %xmm0, (%r15) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 16(%r15) +; CHECK-NEXT: vmovaps %ymm0, (%r15) ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %xmm0, (%r14) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 16(%r14) -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vmovaps %xmm0, (%rbx) -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 16(%rbx) -; CHECK-NEXT: addq $144, %rsp +; CHECK-NEXT: vmovaps %ymm0, (%r14) +; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %ymm0, (%rbx) +; CHECK-NEXT: addq $96, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 @@ -124,8 +109,7 @@ define void @storev16i16(<16 x i16> %a) nounwind { ; CHECK-LABEL: storev16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax) -; CHECK-NEXT: vmovaps %xmm0, (%rax) +; CHECK-NEXT: vmovaps %ymm0, (%rax) ; ; CHECK_O0-LABEL: storev16i16: ; CHECK_O0: # %bb.0: @@ -152,8 +136,7 @@ define void @storev32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: storev32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax) -; CHECK-NEXT: 
vmovaps %xmm0, (%rax) +; CHECK-NEXT: vmovaps %ymm0, (%rax) ; ; CHECK_O0-LABEL: storev32i8: ; CHECK_O0: # %bb.0: @@ -306,19 +289,14 @@ define void @add4i64a64(ptr %ret, ptr %bp) nounwind { ; CHECK-LABEL: add4i64a64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rsi), %xmm0 -; CHECK-NEXT: vmovaps 16(%rsi), %xmm1 -; CHECK-NEXT: vmovaps %xmm0, (%rdi) -; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) +; CHECK-NEXT: vmovaps (%rsi), %ymm0 +; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add4i64a64: ; CHECK_O0: # %bb.0: -; CHECK_O0-NEXT: vmovdqa (%rsi), %xmm2 -; CHECK_O0-NEXT: vmovdqa 16(%rsi), %xmm1 -; CHECK_O0-NEXT: # implicit-def: $ymm0 -; CHECK_O0-NEXT: vmovaps %xmm2, %xmm0 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK_O0-NEXT: vmovaps (%rsi), %ymm0 ; CHECK_O0-NEXT: vmovdqa %ymm0, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll b/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll --- a/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll +++ b/llvm/test/CodeGen/X86/avx-varargs-x86_64.ll @@ -19,12 +19,11 @@ ; CHECK-NEXT: andq $-32, %rsp ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: movq _x@GOTPCREL(%rip), %rax -; CHECK-NEXT: vmovaps (%rax), %xmm0 -; CHECK-NEXT: vmovaps 16(%rax), %xmm1 -; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: vmovaps %xmm0, (%rsp) +; CHECK-NEXT: vmovaps (%rax), %ymm0 +; CHECK-NEXT: vmovaps %ymm0, (%rsp) ; CHECK-NEXT: movl $1, %edi ; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll b/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll --- a/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll +++ b/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll @@ -29,14 +29,13 @@ ; X64-LABEL: test_sext1: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq $-1, 56(%rdi) ; X64-NEXT: movq $-1, 48(%rdi) ; X64-NEXT: movq $-1, 40(%rdi) ; X64-NEXT: movq $-99, 32(%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) ; X64-NEXT: retq %Se = sext <2 x i8> to <2 x i256> %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> @@ -68,14 +67,13 @@ ; X64-LABEL: test_sext2: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq $-1, 56(%rdi) ; X64-NEXT: movq $-1, 48(%rdi) ; X64-NEXT: movq $-1, 40(%rdi) ; X64-NEXT: movq $-1999, 32(%rdi) # imm = 0xF831 -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) ; X64-NEXT: retq %Se = sext <2 x i128> to <2 x i256> %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> @@ -107,14 +105,12 @@ ; X64-LABEL: test_zext1: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq $0, 56(%rdi) -; X64-NEXT: movq $0, 48(%rdi) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 48(%rdi) +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq $0, 40(%rdi) ; X64-NEXT: movq $254, 32(%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) ; X64-NEXT: retq %Se = zext <2 x i8> to <2 x i256> %Shuff = shufflevector <2 x i256> 
zeroinitializer, <2 x i256> %Se, <2 x i32> @@ -146,14 +142,12 @@ ; X64-LABEL: test_zext2: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq $0, 56(%rdi) -; X64-NEXT: movq $0, 48(%rdi) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 48(%rdi) +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq $-1, 40(%rdi) ; X64-NEXT: movq $-2, 32(%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) ; X64-NEXT: retq %Se = zext <2 x i128> to <2 x i256> %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> diff --git a/llvm/test/CodeGen/X86/fp-load-trunc.ll b/llvm/test/CodeGen/X86/fp-load-trunc.ll --- a/llvm/test/CodeGen/X86/fp-load-trunc.ll +++ b/llvm/test/CodeGen/X86/fp-load-trunc.ll @@ -58,10 +58,7 @@ ; AVX-LABEL: test3: ; AVX: # %bb.0: ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-NEXT: vmovaps (%eax), %xmm0 -; AVX-NEXT: vinsertf128 $1, 16(%eax), %ymm0, %ymm0 -; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX-NEXT: vzeroupper +; AVX-NEXT: vcvtpd2psy (%eax), %xmm0 ; AVX-NEXT: retl %x = load <4 x double>, ptr %p %y = fptrunc <4 x double> %x to <4 x float> @@ -83,12 +80,8 @@ ; AVX-LABEL: test4: ; AVX: # %bb.0: ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-NEXT: vmovaps (%eax), %xmm0 -; AVX-NEXT: vmovaps 32(%eax), %xmm1 -; AVX-NEXT: vinsertf128 $1, 48(%eax), %ymm1, %ymm1 -; AVX-NEXT: vinsertf128 $1, 16(%eax), %ymm0, %ymm0 -; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX-NEXT: vcvtpd2ps %ymm1, %xmm1 +; AVX-NEXT: vcvtpd2psy (%eax), %xmm0 +; AVX-NEXT: vcvtpd2psy 32(%eax), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retl %x = load <8 x double>, ptr %p diff --git a/llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll b/llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll --- a/llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll +++ b/llvm/test/CodeGen/X86/i128-fpconv-win64-strict.ll @@ -92,10 +92,8 @@ ; WIN64-LABEL: i128_to_double: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floattidf ; WIN64-NEXT: addq $56, %rsp @@ -109,10 +107,8 @@ ; WIN64-LABEL: ui128_to_double: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floatuntidf ; WIN64-NEXT: addq $56, %rsp @@ -126,10 +122,8 @@ ; WIN64-LABEL: i128_to_float: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floattisf ; WIN64-NEXT: addq $56, %rsp @@ -143,10 +137,8 @@ ; WIN64-LABEL: ui128_to_float: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; 
WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floatuntisf ; WIN64-NEXT: addq $56, %rsp @@ -162,10 +154,8 @@ ; WIN64-NEXT: pushq %rsi ; WIN64-NEXT: subq $64, %rsp ; WIN64-NEXT: movq %rcx, %rsi -; WIN64-NEXT: movq (%rdx), %rax -; WIN64-NEXT: movq 8(%rdx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rdx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __floattixf @@ -187,10 +177,8 @@ ; WIN64-NEXT: pushq %rsi ; WIN64-NEXT: subq $64, %rsp ; WIN64-NEXT: movq %rcx, %rsi -; WIN64-NEXT: movq (%rdx), %rax -; WIN64-NEXT: movq 8(%rdx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rdx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __floatuntixf diff --git a/llvm/test/CodeGen/X86/i128-fpconv-win64.ll b/llvm/test/CodeGen/X86/i128-fpconv-win64.ll --- a/llvm/test/CodeGen/X86/i128-fpconv-win64.ll +++ b/llvm/test/CodeGen/X86/i128-fpconv-win64.ll @@ -92,10 +92,8 @@ ; WIN64-LABEL: i128_to_double: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floattidf ; WIN64-NEXT: addq $56, %rsp @@ -109,10 +107,8 @@ ; WIN64-LABEL: ui128_to_double: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floatuntidf ; WIN64-NEXT: addq $56, %rsp @@ -126,10 +122,8 @@ ; WIN64-LABEL: i128_to_float: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floattisf ; WIN64-NEXT: addq $56, %rsp @@ -143,10 +137,8 @@ ; WIN64-LABEL: ui128_to_float: ; WIN64: # %bb.0: ; WIN64-NEXT: subq $56, %rsp -; WIN64-NEXT: movq (%rcx), %rax -; WIN64-NEXT: movq 8(%rcx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rcx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: callq __floatuntisf ; WIN64-NEXT: addq $56, %rsp @@ -162,10 +154,8 @@ ; WIN64-NEXT: pushq %rsi ; WIN64-NEXT: subq $64, %rsp ; WIN64-NEXT: movq %rcx, %rsi -; WIN64-NEXT: movq (%rdx), %rax -; WIN64-NEXT: movq 8(%rdx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rdx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __floattixf @@ -187,10 +177,8 @@ ; WIN64-NEXT: pushq %rsi ; WIN64-NEXT: subq $64, %rsp ; 
WIN64-NEXT: movq %rcx, %rsi -; WIN64-NEXT: movq (%rdx), %rax -; WIN64-NEXT: movq 8(%rdx), %rcx -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps (%rdx), %xmm0 +; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __floatuntixf diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll --- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -58,9 +58,9 @@ ; X64-NEXT: movq %rdi, 40(%rax) ; X64-NEXT: movq %r9, 32(%rax) ; X64-NEXT: movq %rsi, 24(%rax) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, (%rax) ; X64-NEXT: movq $0, 16(%rax) -; X64-NEXT: movq $0, 8(%rax) -; X64-NEXT: movq $0, (%rax) ; X64-NEXT: retq %Amt = insertelement <2 x i256> , i256 255, i32 0 %Out = shl <2 x i256> %In, %Amt @@ -153,8 +153,8 @@ ; X64-NEXT: movq %rsi, 40(%rdi) ; X64-NEXT: movq %r9, 32(%rdi) ; X64-NEXT: movq %r8, (%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 16(%rdi) ; X64-NEXT: movq $0, 8(%rdi) ; X64-NEXT: retq %Amt = insertelement <2 x i256> , i256 255, i32 0 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4967,23 +4967,13 @@ } define void @top_bits_unset_stack() { -; SSE2-LABEL: top_bits_unset_stack: -; SSE2: ## %bb.0: ## %entry -; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: retq -; -; SSE4-LABEL: top_bits_unset_stack: -; SSE4: ## %bb.0: ## %entry -; SSE4-NEXT: xorps %xmm0, %xmm0 -; SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE4-NEXT: retq +; SSE-LABEL: top_bits_unset_stack: +; SSE: ## %bb.0: ## %entry +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: top_bits_unset_stack: ; AVX1OR2: ## %bb.0: ## %entry diff --git a/llvm/test/CodeGen/X86/memset-zero.ll b/llvm/test/CodeGen/X86/memset-zero.ll --- a/llvm/test/CodeGen/X86/memset-zero.ll +++ b/llvm/test/CodeGen/X86/memset-zero.ll @@ -608,8 +608,8 @@ ; SANDYBRIDGE-LABEL: memset_32_align32: ; SANDYBRIDGE: # %bb.0: # %entry ; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SANDYBRIDGE-NEXT: vmovaps %xmm0, 16(%rdi) -; SANDYBRIDGE-NEXT: vmovaps %xmm0, (%rdi) +; SANDYBRIDGE-NEXT: vmovaps %ymm0, (%rdi) +; SANDYBRIDGE-NEXT: vzeroupper ; SANDYBRIDGE-NEXT: retq ; ; SKYLAKE-LABEL: memset_32_align32: @@ -802,10 +802,9 @@ ; SANDYBRIDGE-LABEL: memset_64_align64: ; SANDYBRIDGE: # %bb.0: # %entry ; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SANDYBRIDGE-NEXT: vmovaps %xmm0, 16(%rdi) -; SANDYBRIDGE-NEXT: vmovaps %xmm0, (%rdi) -; SANDYBRIDGE-NEXT: vmovaps %xmm0, 48(%rdi) -; SANDYBRIDGE-NEXT: vmovaps %xmm0, 32(%rdi) +; SANDYBRIDGE-NEXT: vmovaps %ymm0, 32(%rdi) +; SANDYBRIDGE-NEXT: vmovaps %ymm0, (%rdi) +; SANDYBRIDGE-NEXT: vzeroupper ; SANDYBRIDGE-NEXT: retq ; ; SKYLAKE-LABEL: memset_64_align64: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll 
b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1071,36 +1071,12 @@ ; ; X86-SSE1-LABEL: merge_4i32_i32_combine: ; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl %ebp -; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE1-NEXT: .cfi_offset %ebp, -8 -; X86-SSE1-NEXT: movl %esp, %ebp -; X86-SSE1-NEXT: .cfi_def_cfa_register %ebp -; X86-SSE1-NEXT: pushl %edi -; X86-SSE1-NEXT: pushl %esi -; X86-SSE1-NEXT: andl $-16, %esp -; X86-SSE1-NEXT: subl $16, %esp -; X86-SSE1-NEXT: .cfi_offset %esi, -16 -; X86-SSE1-NEXT: .cfi_offset %edi, -12 -; X86-SSE1-NEXT: movl 8(%ebp), %eax -; X86-SSE1-NEXT: movl 12(%ebp), %ecx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE1-NEXT: andps %xmm0, %xmm1 -; X86-SSE1-NEXT: movaps %xmm1, (%esp) -; X86-SSE1-NEXT: movl (%esp), %ecx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE1-NEXT: movl %edi, 12(%eax) -; X86-SSE1-NEXT: movl %esi, 8(%eax) -; X86-SSE1-NEXT: movl %edx, 4(%eax) -; X86-SSE1-NEXT: movl %ecx, (%eax) -; X86-SSE1-NEXT: leal -8(%ebp), %esp -; X86-SSE1-NEXT: popl %esi -; X86-SSE1-NEXT: popl %edi -; X86-SSE1-NEXT: popl %ebp -; X86-SSE1-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE1-NEXT: movaps %xmm1, (%eax) ; X86-SSE1-NEXT: retl ; ; X86-SSE41-LABEL: merge_4i32_i32_combine: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -46,13 +46,20 @@ ; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi) ; X64-SSE41-NEXT: retq ; -; X64-AVX-LABEL: merge_2_v4f32_align32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0 -; X64-AVX-NEXT: vmovntdqa 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovntdq %xmm0, (%rsi) -; X64-AVX-NEXT: vmovntdq %xmm1, 16(%rsi) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: merge_2_v4f32_align32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; X64-AVX1-NEXT: vmovntdq %xmm0, (%rsi) +; X64-AVX1-NEXT: vmovntdq %xmm1, 16(%rsi) +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: merge_2_v4f32_align32: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovntdq %ymm0, (%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0 %2 = load <4 x float>, ptr %a0, align 32, !nontemporal !0 %3 = load <4 x float>, ptr %1, align 16, !nontemporal !0 @@ -499,6 +506,3 @@ } !0 = !{i32 1} -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; X64-AVX1: {{.*}} -; X64-AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads-2.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads-2.ll @@ -205,14 +205,10 @@ ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: movq 24(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq 16(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq %rax, (%rsp) +; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) ; AVX-NEXT: vmovaps (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -240,14 +236,10 @@ ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: movq 24(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq 16(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq %rax, (%rsp) +; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) ; AVX-NEXT: vmovaps (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -275,14 +267,10 @@ ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: movq 24(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq 16(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq %rax, (%rsp) +; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) ; AVX-NEXT: vmovaps (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -310,14 +298,10 @@ ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: movq 24(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq 16(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq %rax, (%rsp) +; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) ; AVX-NEXT: vmovaps (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -345,14 +329,10 @@ ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: movq 24(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq 16(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq %rax, (%rsp) +; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) ; AVX-NEXT: vmovaps (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -380,14 +360,10 @@ ; AVX-NEXT: movq %rsp, %rbp ; AVX-NEXT: andq $-32, %rsp ; AVX-NEXT: subq $64, %rsp -; AVX-NEXT: movq 24(%rdi), 
%rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq 16(%rdi), %rax -; AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX-NEXT: movq %rax, (%rsp) +; AVX-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) ; AVX-NEXT: vmovaps (%rsp), %ymm0 ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp @@ -595,22 +571,14 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $96, %rsp -; AVX1-NEXT: movq 24(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 16(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq (%rdi), %rax -; AVX1-NEXT: movq 8(%rdi), %rcx -; AVX1-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %rax, (%rsp) -; AVX1-NEXT: movq 56(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 48(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 40(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovaps (%rsp), %ymm0 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp @@ -623,22 +591,14 @@ ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $96, %rsp -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, (%rsp) -; AVX2-NEXT: movq 56(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 48(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp @@ -651,22 +611,14 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 
48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -698,22 +650,14 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $96, %rsp -; AVX1-NEXT: movq 24(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 16(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq (%rdi), %rax -; AVX1-NEXT: movq 8(%rdi), %rcx -; AVX1-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %rax, (%rsp) -; AVX1-NEXT: movq 56(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 48(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 40(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovaps (%rsp), %ymm0 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp @@ -726,22 +670,14 @@ ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $96, %rsp -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, (%rsp) -; AVX2-NEXT: movq 56(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 48(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp @@ -754,22 +690,14 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: 
vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -801,22 +729,14 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $96, %rsp -; AVX1-NEXT: movq 24(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 16(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq (%rdi), %rax -; AVX1-NEXT: movq 8(%rdi), %rcx -; AVX1-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %rax, (%rsp) -; AVX1-NEXT: movq 56(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 48(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 40(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovaps (%rsp), %ymm0 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp @@ -829,22 +749,14 @@ ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $96, %rsp -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, (%rsp) -; AVX2-NEXT: movq 56(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 48(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp @@ -857,22 +769,14 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; 
AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -904,22 +808,14 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $96, %rsp -; AVX1-NEXT: movq 24(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 16(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq (%rdi), %rax -; AVX1-NEXT: movq 8(%rdi), %rcx -; AVX1-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %rax, (%rsp) -; AVX1-NEXT: movq 56(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 48(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 40(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovaps (%rsp), %ymm0 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp @@ -932,22 +828,14 @@ ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $96, %rsp -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, (%rsp) -; AVX2-NEXT: movq 56(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 48(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp @@ -960,22 +848,14 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, 
{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -1007,22 +887,14 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $96, %rsp -; AVX1-NEXT: movq 24(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 16(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq (%rdi), %rax -; AVX1-NEXT: movq 8(%rdi), %rcx -; AVX1-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %rax, (%rsp) -; AVX1-NEXT: movq 56(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 48(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 40(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovaps (%rsp), %ymm0 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp @@ -1035,22 +907,14 @@ ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $96, %rsp -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, (%rsp) -; AVX2-NEXT: movq 56(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 48(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp @@ -1063,22 +927,14 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 
%xmm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -1110,22 +966,14 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $96, %rsp -; AVX1-NEXT: movq 24(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 16(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq (%rdi), %rax -; AVX1-NEXT: movq 8(%rdi), %rcx -; AVX1-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %rax, (%rsp) -; AVX1-NEXT: movq 56(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 48(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 40(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX1-NEXT: movq 32(%rdi), %rax -; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX1-NEXT: vmovaps (%rsp), %ymm0 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX1-NEXT: movq %rbp, %rsp @@ -1138,22 +986,14 @@ ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $96, %rsp -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, (%rsp) -; AVX2-NEXT: movq 56(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 48(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 32(%rdi), %rax -; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsp) +; AVX2-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX2-NEXT: vmovaps (%rsp), %ymm0 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 ; AVX2-NEXT: movq %rbp, %rsp @@ -1166,22 +1006,14 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, 
%rsp ; AVX512-NEXT: popq %rbp @@ -1229,22 +1061,10 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -1292,22 +1112,10 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -1355,22 +1163,10 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -1418,22 +1214,10 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, 
{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -1481,22 +1265,10 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp @@ -1544,22 +1316,10 @@ ; AVX512-NEXT: movq %rsp, %rbp ; AVX512-NEXT: andq $-64, %rsp ; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: movq 56(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 48(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 32(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %rax -; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rax, (%rsp) +; AVX512-NEXT: vmovntdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsp) ; AVX512-NEXT: vmovaps (%rsp), %zmm0 ; AVX512-NEXT: movq %rbp, %rsp ; AVX512-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -720,8 +720,7 @@ ; ; SANDY-LABEL: v8f32_no_estimate: ; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1 +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; @@ -803,8 +802,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: 
vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -930,8 +928,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -1027,8 +1024,7 @@ ; ; SANDY-LABEL: v16f32_no_estimate: ; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: retq @@ -1145,8 +1141,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm4 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 @@ -1332,8 +1327,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm4, %ymm4 +; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -771,8 +771,7 @@ ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -881,14 +880,11 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm1 +; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; SANDY-NEXT: vmulps 
%ymm0, %ymm1, %ymm0 ; SANDY-NEXT: retq ; @@ -1031,13 +1027,11 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -1112,55 +1106,11 @@ ; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: retq ; -; AVX-RECIP-LABEL: v8f32_no_step2: -; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: retq -; -; FMA-RECIP-LABEL: v8f32_no_step2: -; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 -; FMA-RECIP-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: retq -; -; BDVER2-LABEL: v8f32_no_step2: -; BDVER2: # %bb.0: -; BDVER2-NEXT: vrcpps %ymm0, %ymm0 -; BDVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; BDVER2-NEXT: retq -; -; BTVER2-LABEL: v8f32_no_step2: -; BTVER2: # %bb.0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 -; BTVER2-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: retq -; -; SANDY-LABEL: v8f32_no_step2: -; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: retq -; -; HASWELL-LABEL: v8f32_no_step2: -; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: retq -; -; HASWELL-NO-FMA-LABEL: v8f32_no_step2: -; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: retq -; -; AVX512-LABEL: v8f32_no_step2: -; AVX512: # %bb.0: -; AVX512-NEXT: vrcpps %ymm0, %ymm0 -; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: v8f32_no_step2: +; AVX: # %bb.0: +; AVX-NEXT: vrcpps %ymm0, %ymm0 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -1273,20 +1223,18 @@ ; SANDY-LABEL: v16f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = 
[9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 ; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0 -; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; SANDY-NEXT: vrcpps %ymm1, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 -; SANDY-NEXT: vsubps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq ; @@ -1447,8 +1395,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1457,12 +1404,8 @@ ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 -; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm3 +; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: retq @@ -1676,30 +1619,27 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm4, %ymm4 +; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5 ; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0 -; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm2 -; SANDY-NEXT: vsubps %ymm2, %ymm4, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm4, %ymm4 -; SANDY-NEXT: vaddps %ymm2, %ymm3, %ymm2 -; SANDY-NEXT: vmulps %ymm4, %ymm2, %ymm3 -; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; SANDY-NEXT: vrcpps %ymm1, %ymm2 +; SANDY-NEXT: vmulps %ymm2, 
%ymm1, %ymm3 +; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 +; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vaddps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_two_step2: @@ -1871,13 +1811,9 @@ ; SANDY-LABEL: v16f32_no_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 +; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; SANDY-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; SANDY-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_no_step2: diff --git a/llvm/test/CodeGen/X86/sandybridge-loads.ll b/llvm/test/CodeGen/X86/sandybridge-loads.ll --- a/llvm/test/CodeGen/X86/sandybridge-loads.ll +++ b/llvm/test/CodeGen/X86/sandybridge-loads.ll @@ -6,18 +6,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %xmm1 -; CHECK-NEXT: vinsertf128 $1, 16(%rsi), %ymm1, %ymm1 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 ; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovaps (%rdx), %xmm2 -; CHECK-NEXT: vinsertf128 $1, 16(%rdx), %ymm2, %ymm2 +; CHECK-NEXT: vmovaps (%rdx), %ymm2 ; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] -; CHECK-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 ; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax) -; CHECK-NEXT: vmovaps %xmm0, (%rax) +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vmovaps %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %v0 = load <8 x float>, ptr %a, align 16 ; <---- unaligned! 
@@ -34,14 +29,12 @@ define void @widestores(ptr %a, ptr %b, ptr %c) nounwind { ; CHECK-LABEL: widestores: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vmovaps (%rsi), %xmm2 -; CHECK-NEXT: vmovaps 16(%rsi), %xmm3 -; CHECK-NEXT: vmovaps %xmm1, 16(%rsi) -; CHECK-NEXT: vmovaps %xmm0, (%rsi) -; CHECK-NEXT: vmovaps %xmm2, (%rdi) -; CHECK-NEXT: vmovaps %xmm3, 16(%rdi) +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm1, (%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %v0 = load <8 x float>, ptr %a, align 32 %v1 = load <8 x float>, ptr %b, align 32 @@ -53,14 +46,13 @@ define void @widestores_unaligned_load(ptr %a, ptr %b, ptr %c) nounwind { ; CHECK-LABEL: widestores_unaligned_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 -; CHECK-NEXT: vmovaps (%rsi), %xmm2 -; CHECK-NEXT: vmovaps 16(%rsi), %xmm3 -; CHECK-NEXT: vmovaps %xmm1, 16(%rsi) -; CHECK-NEXT: vmovaps %xmm0, (%rsi) -; CHECK-NEXT: vmovaps %xmm2, (%rdi) -; CHECK-NEXT: vmovaps %xmm3, 16(%rdi) +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rsi), %xmm2 +; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: vmovaps %xmm2, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm1, (%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %v0 = load <8 x float>, ptr %a, align 32 ; <--- aligned %v1 = load <8 x float>, ptr %b, align 16 ; <--- unaligned diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -739,10 +739,9 @@ ; ; x86_64-LABEL: test_lshr_v2i128_outofrange_sum: ; x86_64: # %bb.0: # %entry -; x86_64-NEXT: movq $0, 24(%r8) -; x86_64-NEXT: movq $0, 16(%r8) -; x86_64-NEXT: movq $0, 8(%r8) -; x86_64-NEXT: movq $0, (%r8) +; x86_64-NEXT: xorps %xmm0, %xmm0 +; x86_64-NEXT: movaps %xmm0, 16(%r8) +; x86_64-NEXT: movaps %xmm0, (%r8) ; x86_64-NEXT: retq entry: %0 = lshr <2 x i128> %x, @@ -767,10 +766,9 @@ ; ; x86_64-LABEL: test_ashr_v2i128_outofrange_sum: ; x86_64: # %bb.0: # %entry -; x86_64-NEXT: movq $0, 24(%r8) -; x86_64-NEXT: movq $0, 16(%r8) -; x86_64-NEXT: movq $0, 8(%r8) -; x86_64-NEXT: movq $0, (%r8) +; x86_64-NEXT: xorps %xmm0, %xmm0 +; x86_64-NEXT: movaps %xmm0, 16(%r8) +; x86_64-NEXT: movaps %xmm0, (%r8) ; x86_64-NEXT: retq entry: %0 = ashr <2 x i128> %x, @@ -795,10 +793,9 @@ ; ; x86_64-LABEL: test_shl_v2i128_outofrange_sum: ; x86_64: # %bb.0: # %entry -; x86_64-NEXT: movq $0, 24(%r8) -; x86_64-NEXT: movq $0, 16(%r8) -; x86_64-NEXT: movq $0, 8(%r8) -; x86_64-NEXT: movq $0, (%r8) +; x86_64-NEXT: xorps %xmm0, %xmm0 +; x86_64-NEXT: movaps %xmm0, 16(%r8) +; x86_64-NEXT: movaps %xmm0, (%r8) ; x86_64-NEXT: retq entry: %0 = shl <2 x i128> %x, @@ -836,14 +833,11 @@ ; x86_64-LABEL: shl_sext_shl_outofrange: ; x86_64: # %bb.0: ; x86_64-NEXT: movq %rdi, %rax -; x86_64-NEXT: movq $0, 56(%rdi) -; x86_64-NEXT: movq $0, 48(%rdi) -; x86_64-NEXT: movq $0, 40(%rdi) -; x86_64-NEXT: movq $0, 32(%rdi) -; x86_64-NEXT: movq $0, 24(%rdi) -; x86_64-NEXT: movq $0, 16(%rdi) -; x86_64-NEXT: movq $0, 8(%rdi) -; x86_64-NEXT: movq $0, (%rdi) +; x86_64-NEXT: xorps %xmm0, %xmm0 +; x86_64-NEXT: movaps %xmm0, 48(%rdi) +; x86_64-NEXT: movaps %xmm0, 32(%rdi) +; x86_64-NEXT: movaps %xmm0, 16(%rdi) +; x86_64-NEXT: movaps %xmm0, (%rdi) ; x86_64-NEXT: retq %1 = shl <2 x i128> %a0, %2 = 
sext <2 x i128> %1 to <2 x i256> @@ -876,14 +870,11 @@ ; x86_64-LABEL: shl_zext_shl_outofrange: ; x86_64: # %bb.0: ; x86_64-NEXT: movq %rdi, %rax -; x86_64-NEXT: movq $0, 56(%rdi) -; x86_64-NEXT: movq $0, 48(%rdi) -; x86_64-NEXT: movq $0, 40(%rdi) -; x86_64-NEXT: movq $0, 32(%rdi) -; x86_64-NEXT: movq $0, 24(%rdi) -; x86_64-NEXT: movq $0, 16(%rdi) -; x86_64-NEXT: movq $0, 8(%rdi) -; x86_64-NEXT: movq $0, (%rdi) +; x86_64-NEXT: xorps %xmm0, %xmm0 +; x86_64-NEXT: movaps %xmm0, 48(%rdi) +; x86_64-NEXT: movaps %xmm0, 32(%rdi) +; x86_64-NEXT: movaps %xmm0, 16(%rdi) +; x86_64-NEXT: movaps %xmm0, (%rdi) ; x86_64-NEXT: retq %1 = shl <2 x i128> %a0, %2 = zext <2 x i128> %1 to <2 x i256> @@ -916,14 +907,11 @@ ; x86_64-LABEL: shl_zext_lshr_outofrange: ; x86_64: # %bb.0: ; x86_64-NEXT: movq %rdi, %rax -; x86_64-NEXT: movq $0, 56(%rdi) -; x86_64-NEXT: movq $0, 48(%rdi) -; x86_64-NEXT: movq $0, 40(%rdi) -; x86_64-NEXT: movq $0, 32(%rdi) -; x86_64-NEXT: movq $0, 24(%rdi) -; x86_64-NEXT: movq $0, 16(%rdi) -; x86_64-NEXT: movq $0, 8(%rdi) -; x86_64-NEXT: movq $0, (%rdi) +; x86_64-NEXT: xorps %xmm0, %xmm0 +; x86_64-NEXT: movaps %xmm0, 48(%rdi) +; x86_64-NEXT: movaps %xmm0, 32(%rdi) +; x86_64-NEXT: movaps %xmm0, 16(%rdi) +; x86_64-NEXT: movaps %xmm0, (%rdi) ; x86_64-NEXT: retq %1 = lshr <2 x i128> %a0, %2 = zext <2 x i128> %1 to <2 x i256> diff --git a/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll b/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll --- a/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll +++ b/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll @@ -14,9 +14,9 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, (%rdx) +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-NEXT: movq %xmm1, 8(%rdx) -; SSE2-NEXT: movq $0, 24(%rdx) -; SSE2-NEXT: movq $0, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_single_128bit_elt_vector: @@ -103,14 +103,13 @@ ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: movaps %xmm4, 48(%rdx) +; SSE2-NEXT: movaps %xmm4, 32(%rdx) ; SSE2-NEXT: movq %xmm1, 16(%rdx) ; SSE2-NEXT: movq %xmm0, (%rdx) ; SSE2-NEXT: movq %xmm3, 24(%rdx) ; SSE2-NEXT: movq %xmm2, 8(%rdx) -; SSE2-NEXT: movq $0, 56(%rdx) -; SSE2-NEXT: movq $0, 48(%rdx) -; SSE2-NEXT: movq $0, 40(%rdx) -; SSE2-NEXT: movq $0, 32(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_single_256bit_elt_vector: @@ -219,6 +218,11 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE2-NEXT: xorps %xmm8, %xmm8 +; SSE2-NEXT: movaps %xmm8, 112(%rdx) +; SSE2-NEXT: movaps %xmm8, 96(%rdx) +; SSE2-NEXT: movaps %xmm8, 80(%rdx) +; SSE2-NEXT: movaps %xmm8, 64(%rdx) ; SSE2-NEXT: movq %xmm3, 48(%rdx) ; SSE2-NEXT: movq %xmm2, 32(%rdx) ; SSE2-NEXT: movq %xmm1, 16(%rdx) @@ -227,14 +231,6 @@ ; SSE2-NEXT: movq %xmm6, 40(%rdx) ; SSE2-NEXT: movq %xmm5, 24(%rdx) ; SSE2-NEXT: movq %xmm4, 8(%rdx) -; SSE2-NEXT: movq $0, 112(%rdx) -; SSE2-NEXT: movq $0, 120(%rdx) -; SSE2-NEXT: movq $0, 96(%rdx) -; SSE2-NEXT: movq $0, 104(%rdx) -; SSE2-NEXT: movq $0, 80(%rdx) -; SSE2-NEXT: movq $0, 88(%rdx) -; SSE2-NEXT: movq $0, 64(%rdx) -; SSE2-NEXT: movq $0, 72(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: load_single_512bit_elt_vector: diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll 
b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -147,20 +147,13 @@ ; SNB: # %bb.0: ; SNB-NEXT: vrsqrtps %ymm0, %ymm1 ; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SNB-NEXT: vmovaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SNB-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 -; SNB-NEXT: vmulps %ymm3, %ymm2, %ymm3 +; SNB-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; SNB-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 -; SNB-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; SNB-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 +; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 -; SNB-NEXT: vmovaps {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] -; SNB-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 -; SNB-NEXT: vandps %ymm2, %ymm0, %ymm0 -; SNB-NEXT: vcmpleps %ymm0, %ymm3, %ymm0 +; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 ; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 ; SNB-NEXT: retq ; @@ -347,15 +340,11 @@ ; SNB-LABEL: v8f32_daz: ; SNB: # %bb.0: ; SNB-NEXT: vrsqrtps %ymm0, %ymm1 -; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; SNB-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm2, %ymm2 -; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm3 -; SNB-NEXT: vmulps %ymm2, %ymm3, %ymm2 -; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 -; SNB-NEXT: vmovaps {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; SNB-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %ymm3, %ymm3 -; SNB-NEXT: vaddps %ymm3, %ymm1, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm0, %ymm2 +; SNB-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; SNB-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SNB-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; SNB-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -144,35 +144,32 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-LABEL: PR30512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl 16(%ebp), %edx -; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl 40(%ebp), 
%esi -; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sete %bl ; X86-NEXT: negl %ebx ; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl 36(%ebp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sete %bl ; X86-NEXT: negl %ebx ; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl 32(%ebp), %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; X86-NEXT: sete %bl ; X86-NEXT: negl %ebx ; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl 28(%ebp), %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sete %dl ; X86-NEXT: negl %edx ; X86-NEXT: movl %edx, (%esp) @@ -184,20 +181,11 @@ ; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X86-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; X86-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movaps %xmm2, (%eax) +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: PR30512: @@ -231,15 +219,7 @@ ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X64-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; X64-NEXT: movl %edx, 8(%rax) -; X64-NEXT: movl %ecx, (%rax) -; X64-NEXT: shrq $32, %rcx -; X64-NEXT: shrq $32, %rdx -; X64-NEXT: movl %edx, 12(%rax) -; X64-NEXT: movl %ecx, 4(%rax) +; X64-NEXT: movaps %xmm2, (%rax) ; X64-NEXT: retq %cmp = icmp eq <4 x i32> %x, %y %zext = zext <4 x i1> %cmp to <4 x i32> diff --git a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll --- a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll +++ b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll @@ -81,8 +81,7 @@ define <8 x float> @combine_16_byte_loads_aligned(ptr %ptr) { ; AVXSLOW-LABEL: combine_16_byte_loads_aligned: ; AVXSLOW: # %bb.0: -; AVXSLOW-NEXT: vmovaps 48(%rdi), %xmm0 -; AVXSLOW-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVXSLOW-NEXT: vmovaps 48(%rdi), %ymm0 ; AVXSLOW-NEXT: retq ; ; AVXFAST-LABEL: combine_16_byte_loads_aligned: diff --git a/llvm/test/CodeGen/X86/unaligned-load.ll b/llvm/test/CodeGen/X86/unaligned-load.ll --- a/llvm/test/CodeGen/X86/unaligned-load.ll +++ b/llvm/test/CodeGen/X86/unaligned-load.ll @@ -66,17 +66,15 @@ ; I386-LABEL: func_aligned: ; I386: ## %bb.0: ## %entry ; I386-NEXT: subl $44, %esp +; I386-NEXT: movaps {{.*#+}} xmm0 = [1498564676,1313821779,1380982853,1095911247] ; I386-NEXT: .p2align 4, 0x90 ; I386-NEXT: LBB1_1: ## %bb ; I386-NEXT: ## =>This Inner Loop Header: Depth=1 +; I386-NEXT: movaps %xmm0, (%esp) ; I386-NEXT: movl $4673097, {{[0-9]+}}(%esp) ## imm = 0x474E49 ; I386-NEXT: movl $1230132307, {{[0-9]+}}(%esp) ## imm = 0x49525453 ; I386-NEXT: movl $541347367, {{[0-9]+}}(%esp) ## imm = 0x20444E27 ; I386-NEXT: movl $840969293, {{[0-9]+}}(%esp) ## imm = 0x32202C4D -; I386-NEXT: movl $1095911247, {{[0-9]+}}(%esp) ## imm = 0x4152474F -; I386-NEXT: movl 
$1380982853, {{[0-9]+}}(%esp) ## imm = 0x52502045 -; I386-NEXT: movl $1313821779, {{[0-9]+}}(%esp) ## imm = 0x4E4F5453 -; I386-NEXT: movl $1498564676, (%esp) ## imm = 0x59524844 ; I386-NEXT: jmp LBB1_1 ; ; CORE2-LABEL: func_aligned: diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -16,15 +16,7 @@ ; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_varx_mone: @@ -61,15 +53,7 @@ ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0 ; CHECK-SSE1-NEXT: andnps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_varx_mone: @@ -103,15 +87,7 @@ ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0 ; CHECK-SSE1-NEXT: orps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_varx_mone_invmask: @@ -146,15 +122,7 @@ ; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 ; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0 ; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask: @@ -194,15 +162,7 @@ ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl 
%edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_varx_42: @@ -239,15 +199,7 @@ ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_varx_42: @@ -284,15 +236,7 @@ ; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_varx_42_invmask: @@ -330,15 +274,7 @@ ; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1 ; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_varx_42_invmask: @@ -372,15 +308,7 @@ ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 ; CHECK-SSE1-NEXT: orps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_mone_vary: @@ -410,15 +338,7 @@ ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: orps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_mone_vary: @@ -451,15 +371,7 @@ ; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andps (%rdx), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; 
CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask: @@ -497,15 +409,7 @@ ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE1-NEXT: orps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask: @@ -540,15 +444,7 @@ ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_42_vary: @@ -585,15 +481,7 @@ ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 ; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_42_vary: @@ -630,15 +518,7 @@ ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE1-NEXT: andps (%rdx), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_42_vary_invmask: @@ -676,15 +556,7 @@ ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; 
CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_42_vary_invmask: diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -1047,15 +1047,7 @@ ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i32: @@ -1118,15 +1110,7 @@ ; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i32_undef: @@ -3165,15 +3149,7 @@ ; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 ; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 ; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; CHECK-SSE1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; CHECK-SSE1-NEXT: movl %edx, 8(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, (%rdi) -; CHECK-SSE1-NEXT: shrq $32, %rcx -; CHECK-SSE1-NEXT: shrq $32, %rdx -; CHECK-SSE1-NEXT: movl %edx, 12(%rdi) -; CHECK-SSE1-NEXT: movl %ecx, 4(%rdi) +; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_v4i32: diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -122,21 +122,21 @@ ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-32, %esp ; CHECK-NEXT: subl $160, %esp -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm3 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 ; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 ; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 ; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1 -; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1 -; CHECK-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, 
%xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill @@ -152,41 +152,41 @@ ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0 +; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill -; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2 -; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1 ; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2 +; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vmovdqa 168(%ebp), %xmm6 +; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6 ; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 ; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 -; CHECK-NEXT: vmovdqa 184(%ebp), %xmm7 +; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 ; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 ; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 -; CHECK-NEXT: vmovdqa 136(%ebp), %xmm4 +; CHECK-NEXT: vmovdqa 168(%ebp), %xmm4 ; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmovdqa 152(%ebp), %xmm3 +; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 ; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 ; CHECK-NEXT: movl 8(%ebp), %eax -; CHECK-NEXT: vmovdqa %xmm3, 208(%eax) -; CHECK-NEXT: vmovdqa %xmm4, 192(%eax) -; CHECK-NEXT: vmovdqa %xmm7, 240(%eax) -; CHECK-NEXT: vmovdqa %xmm6, 224(%eax) -; CHECK-NEXT: vmovdqa %xmm0, 144(%eax) -; CHECK-NEXT: vmovdqa %xmm1, 128(%eax) -; CHECK-NEXT: vmovdqa %xmm2, 176(%eax) +; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm4, 224(%eax) +; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) +; CHECK-NEXT: vmovdqa %xmm6, 192(%eax) +; CHECK-NEXT: vmovdqa %xmm0, 176(%eax) +; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) +; CHECK-NEXT: vmovdqa %xmm2, 144(%eax) ; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 160(%eax) +; CHECK-NEXT: vmovaps %xmm0, 128(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps %xmm0, 112(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload @@ -196,13 +196,13 @@ ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps %xmm0, 64(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 16(%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, (%eax) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps %xmm0, 48(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovaps %xmm0, 32(%eax) +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, 16(%eax) +; CHECK-NEXT: vmovaps 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm0, (%eax) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: vzeroupper