Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -2106,10 +2106,9 @@ return true; } -bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, - unsigned, - MachineMemOperand::Flags, - bool *Fast) const { +bool X86TargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: @@ -2125,6 +2124,16 @@ // TODO: What about AVX-512 (512-bit) accesses? } } + // NonTemporal vector memory ops must be aligned. + if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { + // NT loads can only be vector aligned, so if its less aligned than the + // minimum vector size (which we can split the vector down to), we might as + // well use a regular unaligned vector load. + // We don't have any NT loads pre-SSE41. + if (!!(Flags & MachineMemOperand::MOLoad)) + return (Align < 16 || !Subtarget.hasSSE41()); + return false; + } // Misaligned accesses of any size are always allowed. return true; } Index: llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -163,7 +163,7 @@ ret void } -; FIXME: AVX2 can't perform NT-load-ymm on 16-byte aligned memory. +; AVX2 can't perform NT-load-ymm on 16-byte aligned memory. ; Must be kept seperate as VMOVNTDQA xmm. define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X86-LABEL: merge_2_v4f32_align16_ntload: @@ -200,20 +200,13 @@ ; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi) ; X64-SSE41-NEXT: retq ; -; X64-AVX1-LABEL: merge_2_v4f32_align16_ntload: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0 -; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; X64-AVX1-NEXT: vmovdqa %xmm1, 16(%rsi) -; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: merge_2_v4f32_align16_ntload: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: merge_2_v4f32_align16_ntload: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0 +; X64-AVX-NEXT: vmovntdqa 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX-NEXT: vmovdqa %xmm1, 16(%rsi) +; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0 @@ -225,7 +218,7 @@ ret void } -; FIXME: AVX can't perform NT-store-ymm on 16-byte aligned memory. +; AVX can't perform NT-store-ymm on 16-byte aligned memory. ; Must be kept seperate as VMOVNTPS xmm. 
define void @merge_2_v4f32_align16_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X86-LABEL: merge_2_v4f32_align16_ntstore: @@ -248,9 +241,10 @@ ; ; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovntps %xmm0, (%rsi) +; X64-AVX-NEXT: vmovntps %xmm1, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -263,7 +257,7 @@ ret void } -; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory. +; Nothing can perform NT-load-vector on 1-byte aligned memory. ; Just perform regular loads. define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X86-LABEL: merge_2_v4f32_align1_ntload: @@ -301,32 +295,71 @@ ret void } -; FIXME: Nothing can perform NT-store-vector on 1-byte aligned memory. +; Nothing can perform NT-store-vector on 1-byte aligned memory. ; Must be scalarized to use MOVTNI/MOVNTSD. define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X86-LABEL: merge_2_v4f32_align1_ntstore: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $48, %esp +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: movups (%ecx), %xmm0 ; X86-NEXT: movups 16(%ecx), %xmm1 -; X86-NEXT: movups %xmm0, (%eax) -; X86-NEXT: movups %xmm1, 16(%eax) +; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 12(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 8(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movntil %edx, 4(%eax) +; X86-NEXT: movntil %ecx, (%eax) +; X86-NEXT: movaps %xmm1, (%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 28(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 24(%eax) +; X86-NEXT: movl (%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movntil %edx, 20(%eax) +; X86-NEXT: movntil %ecx, 16(%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-SSE-LABEL: merge_2_v4f32_align1_ntstore: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movups (%rdi), %xmm0 ; X64-SSE-NEXT: movups 16(%rdi), %xmm1 -; X64-SSE-NEXT: movups %xmm0, (%rsi) -; X64-SSE-NEXT: movups %xmm1, 16(%rsi) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-SSE-NEXT: movntiq %rcx, 8(%rsi) +; X64-SSE-NEXT: movntiq %rax, (%rsi) +; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-SSE-NEXT: movntiq %rcx, 24(%rsi) +; X64-SSE-NEXT: movntiq %rax, 16(%rsi) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-AVX-NEXT: 
movntiq %rcx, 8(%rsi) +; X64-AVX-NEXT: movntiq %rax, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) +; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* @@ -339,32 +372,71 @@ ret void } -; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory. +; Nothing can perform NT-load-vector on 1-byte aligned memory. ; Just perform regular loads and scalarize NT-stores. define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind { ; X86-LABEL: merge_2_v4f32_align1: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $48, %esp +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: movups (%ecx), %xmm0 ; X86-NEXT: movups 16(%ecx), %xmm1 -; X86-NEXT: movups %xmm0, (%eax) -; X86-NEXT: movups %xmm1, 16(%eax) +; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 12(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 8(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movntil %edx, 4(%eax) +; X86-NEXT: movntil %ecx, (%eax) +; X86-NEXT: movaps %xmm1, (%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 28(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movntil %ecx, 24(%eax) +; X86-NEXT: movl (%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movntil %edx, 20(%eax) +; X86-NEXT: movntil %ecx, 16(%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-SSE-LABEL: merge_2_v4f32_align1: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movups (%rdi), %xmm0 ; X64-SSE-NEXT: movups 16(%rdi), %xmm1 -; X64-SSE-NEXT: movups %xmm0, (%rsi) -; X64-SSE-NEXT: movups %xmm1, 16(%rsi) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-SSE-NEXT: movntiq %rcx, 8(%rsi) +; X64-SSE-NEXT: movntiq %rax, (%rsi) +; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-SSE-NEXT: movntiq %rcx, 24(%rsi) +; X64-SSE-NEXT: movntiq %rax, 16(%rsi) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: merge_2_v4f32_align1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups %ymm0, (%rsi) -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 +; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-AVX-NEXT: movntiq %rcx, 8(%rsi) +; X64-AVX-NEXT: movntiq %rax, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) +; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 %2 = bitcast float* %1 to <4 x float>* Index: llvm/trunk/test/CodeGen/X86/nontemporal-3.ll =================================================================== --- 
llvm/trunk/test/CodeGen/X86/nontemporal-3.ll +++ llvm/trunk/test/CodeGen/X86/nontemporal-3.ll @@ -15,19 +15,31 @@ ; SSE-LABEL: test_zero_v2f64_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v2f64_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v2f64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1 ret void @@ -37,19 +49,31 @@ ; SSE-LABEL: test_zero_v4f32_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v4f32_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1 ret void @@ -59,19 +83,31 @@ ; SSE-LABEL: test_zero_v2i64_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v2i64_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v2i64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, 
!nontemporal !1 ret void @@ -81,19 +117,31 @@ ; SSE-LABEL: test_zero_v4i32_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v4i32_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1 ret void @@ -103,19 +151,31 @@ ; SSE-LABEL: test_zero_v8i16_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8i16_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i16_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1 ret void @@ -125,19 +185,31 @@ ; SSE-LABEL: test_zero_v16i8_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16i8_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i8_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1 ret void @@ -149,8 +221,16 @@ ; SSE-LABEL: 
test_zero_v4f64_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v4f64_align1: @@ -174,8 +254,16 @@ ; SSE-LABEL: test_zero_v8f32_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f32_align1: @@ -199,8 +287,16 @@ ; SSE-LABEL: test_zero_v4i64_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v4i64_align1: @@ -224,8 +320,16 @@ ; SSE-LABEL: test_zero_v8i32_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8i32_align1: @@ -249,8 +353,16 @@ ; SSE-LABEL: test_zero_v16i16_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16i16_align1: @@ -274,8 +386,16 @@ ; SSE-LABEL: test_zero_v32i8_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: 
movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v32i8_align1: @@ -451,10 +571,26 @@ ; SSE-LABEL: test_zero_v8f64_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 48(%rdi) -; SSE-NEXT: movups %xmm0, 32(%rdi) -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f64_align1: @@ -467,8 +603,30 @@ ; ; AVX512-LABEL: test_zero_v8f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 40(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: movq (%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1 @@ -479,10 +637,26 @@ ; SSE-LABEL: test_zero_v16f32_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 48(%rdi) -; SSE-NEXT: movups %xmm0, 32(%rdi) -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) 
; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: @@ -495,8 +669,30 @@ ; ; AVX512-LABEL: test_zero_v16f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 40(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: movq (%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1 @@ -507,10 +703,26 @@ ; SSE-LABEL: test_zero_v8i64_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 48(%rdi) -; SSE-NEXT: movups %xmm0, 32(%rdi) -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: @@ -523,8 +735,30 @@ ; ; AVX512-LABEL: test_zero_v8i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 40(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: movq (%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, 
!nontemporal !1 @@ -535,10 +769,26 @@ ; SSE-LABEL: test_zero_v16i32_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 48(%rdi) -; SSE-NEXT: movups %xmm0, 32(%rdi) -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: @@ -551,8 +801,30 @@ ; ; AVX512-LABEL: test_zero_v16i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 40(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: movq (%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1 @@ -563,10 +835,26 @@ ; SSE-LABEL: test_zero_v32i16_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 48(%rdi) -; SSE-NEXT: movups %xmm0, 32(%rdi) -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v32i16_align1: @@ -587,8 +875,30 @@ ; ; AVX512BW-LABEL: test_zero_v32i16_align1: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; 
AVX512BW-NEXT: movq %rsp, %rbp +; AVX512BW-NEXT: andq $-64, %rsp +; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 56(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 48(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 40(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 32(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 24(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 16(%rdi) +; AVX512BW-NEXT: movq (%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512BW-NEXT: movntiq %rcx, 8(%rdi) +; AVX512BW-NEXT: movntiq %rax, (%rdi) +; AVX512BW-NEXT: movq %rbp, %rsp +; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1 @@ -599,10 +909,26 @@ ; SSE-LABEL: test_zero_v64i8_align1: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, 48(%rdi) -; SSE-NEXT: movups %xmm0, 32(%rdi) -; SSE-NEXT: movups %xmm0, 16(%rdi) -; SSE-NEXT: movups %xmm0, (%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v64i8_align1: @@ -623,8 +949,30 @@ ; ; AVX512BW-LABEL: test_zero_v64i8_align1: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: movq %rsp, %rbp +; AVX512BW-NEXT: andq $-64, %rsp +; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 56(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 48(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 40(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 32(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 24(%rdi) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movntiq %rax, 16(%rdi) +; AVX512BW-NEXT: movq (%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512BW-NEXT: movntiq %rcx, 8(%rdi) +; AVX512BW-NEXT: movntiq %rax, (%rdi) +; AVX512BW-NEXT: movq %rbp, %rsp +; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1 @@ -651,8 +999,22 @@ ; ; AVX512-LABEL: test_zero_v8f64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: 
vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 +; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) +; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1 @@ -679,8 +1041,22 @@ ; ; AVX512-LABEL: test_zero_v16f32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 +; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) +; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1 @@ -707,8 +1083,22 @@ ; ; AVX512-LABEL: test_zero_v8i64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 +; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) +; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1 @@ -735,8 +1125,22 @@ ; ; AVX512-LABEL: test_zero_v16i32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 +; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) +; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1 @@ -771,8 +1175,22 @@ ; ; AVX512BW-LABEL: test_zero_v32i16_align16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: 
pushq %rbp +; AVX512BW-NEXT: movq %rsp, %rbp +; AVX512BW-NEXT: andq $-64, %rsp +; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovaps (%rsp), %xmm0 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 +; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi) +; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi) +; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512BW-NEXT: vmovntps %xmm0, (%rdi) +; AVX512BW-NEXT: movq %rbp, %rsp +; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1 @@ -807,8 +1225,22 @@ ; ; AVX512BW-LABEL: test_zero_v64i8_align16: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: movq %rsp, %rbp +; AVX512BW-NEXT: andq $-64, %rsp +; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovaps (%rsp), %xmm0 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 +; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi) +; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi) +; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512BW-NEXT: vmovntps %xmm0, (%rdi) +; AVX512BW-NEXT: movq %rbp, %rsp +; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1 @@ -835,8 +1267,18 @@ ; ; AVX512-LABEL: test_zero_v8f64_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %ymm0 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1 @@ -863,8 +1305,18 @@ ; ; AVX512-LABEL: test_zero_v16f32_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %ymm0 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1 @@ -891,8 +1343,18 @@ ; ; AVX512-LABEL: test_zero_v8i64_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %ymm0 +; AVX512-NEXT: vmovaps 
{{[0-9]+}}(%rsp), %ymm1 +; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1 @@ -919,8 +1381,18 @@ ; ; AVX512-LABEL: test_zero_v16i32_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $128, %rsp +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rsp) +; AVX512-NEXT: vmovaps (%rsp), %ymm0 +; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1 @@ -955,8 +1427,18 @@ ; ; AVX512BW-LABEL: test_zero_v32i16_align32: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: movq %rsp, %rbp +; AVX512BW-NEXT: andq $-64, %rsp +; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovaps (%rsp), %ymm0 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512BW-NEXT: vmovntps %ymm0, (%rdi) +; AVX512BW-NEXT: movq %rbp, %rsp +; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1 @@ -991,8 +1473,18 @@ ; ; AVX512BW-LABEL: test_zero_v64i8_align32: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbp +; AVX512BW-NEXT: movq %rsp, %rbp +; AVX512BW-NEXT: andq $-64, %rsp +; AVX512BW-NEXT: subq $128, %rsp ; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) +; AVX512BW-NEXT: vmovaps (%rsp), %ymm0 +; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512BW-NEXT: vmovntps %ymm0, (%rdi) +; AVX512BW-NEXT: movq %rbp, %rsp +; AVX512BW-NEXT: popq %rbp ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1
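For reference, a minimal standalone reproducer sketch in the same style as the tests above (not part of the patch; the function name, triple/RUN invocation and CHECK expectations are illustrative assumptions): with the new allowsMisalignedMemoryAccesses behaviour, an under-aligned nontemporal vector store is no longer emitted as a plain unaligned vector store but is split and scalarized to MOVNTI stores, matching the SSE check lines shown above.

; RUN line assumed to mirror the existing nontemporal tests:
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

define void @nt_store_v4f32_align1(<4 x float>* %dst) nounwind {
; CHECK-LABEL: nt_store_v4f32_align1:
; CHECK:       movntiq
  ; A 1-byte-aligned nontemporal vector store: expected to be scalarized
  ; to movnti/movntiq rather than lowered to an unaligned movups.
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1
  ret void
}

!1 = !{i32 1}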