diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -486,7 +486,7 @@ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, CCIfSubtarget<"is64Bit()", CCDelegateTo>, - CCDelegateTo + CCIfNotSubtarget<"is64Bit()", CCDelegateTo>, ]>; //===----------------------------------------------------------------------===// @@ -524,6 +524,19 @@ CCIfSubtarget<"hasSSE2()", CCPromoteToType>>>, + // The first 3 __m64 vector arguments are passed in mmx registers if the + // call is not a vararg call. + // FIXME: This is what we used to inherit from CC_X86_32_Common, but might + // not make sense. + CCIfNotVarArg>>, + + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are + // passed in the parameter area. + // FIXME: This is what we used to inherit from CC_X86_32_Common, but might + // not make sense. + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + // Boolean vectors of AVX-512 are passed in SIMD registers. // The call from AVX to AVX-512 function should work, // since the boolean types in AVX/AVX2 are promoted by default. 
@@ -994,7 +1007,7 @@ CCIfSubtarget<"isTargetWin64()", CCDelegateTo>, CCIfSubtarget<"is64Bit()", CCDelegateTo>, - CCDelegateTo + CCIfNotSubtarget<"is64Bit()", CCDelegateTo>, ]>; //===----------------------------------------------------------------------===// @@ -1049,7 +1062,7 @@ def CC_X86 : CallingConv<[ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, CCIfSubtarget<"is64Bit()", CCDelegateTo>, - CCDelegateTo + CCIfNotSubtarget<"is64Bit()", CCDelegateTo>, ]>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll --- a/llvm/test/CodeGen/X86/pr11334.ll +++ b/llvm/test/CodeGen/X86/pr11334.ll @@ -20,14 +20,12 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind { ; SSE-LABEL: v3f2d_ext_vec: ; SSE: # %bb.0: # %entry -; SSE-NEXT: cvtps2pd %xmm0, %xmm2 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: cvtps2pd %xmm0, %xmm1 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvtps2pd %xmm0, %xmm0 -; SSE-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: fldl -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlps %xmm0, 16(%rdi) +; SSE-NEXT: movaps %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: v3f2d_ext_vec: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -84,14 +84,13 @@ define <3 x double> @constrained_vector_fdiv_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fdiv_v3f64: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] ; CHECK-NEXT: divpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: divsd {{.*}}(%rip), %xmm1 -; CHECK-NEXT: movsd %xmm1, 
-{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movsd %xmm1, 16(%rdi) +; CHECK-NEXT: movapd %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fdiv_v3f64: @@ -277,26 +276,32 @@ define <3 x double> @constrained_vector_frem_v3f64() #0 { ; CHECK-LABEL: constrained_vector_frem_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmod -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmod -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmod -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -476,14 +481,13 @@ define <3 x 
double> @constrained_vector_fmul_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fmul_v3f64: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] ; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: mulsd {{.*}}(%rip), %xmm1 -; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movsd %xmm1, 16(%rdi) +; CHECK-NEXT: movapd %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fmul_v3f64: @@ -613,14 +617,13 @@ define <3 x double> @constrained_vector_fadd_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fadd_v3f64: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] ; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: addsd {{.*}}(%rip), %xmm1 -; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movsd %xmm1, 16(%rdi) +; CHECK-NEXT: movapd %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fadd_v3f64: @@ -751,15 +754,14 @@ define <3 x double> @constrained_vector_fsub_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fsub_v3f64: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: xorpd %xmm0, %xmm0 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: subsd %xmm0, %xmm1 ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] ; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movsd 
%xmm1, 16(%rdi) +; CHECK-NEXT: movapd %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fsub_v3f64: @@ -885,13 +887,12 @@ define <3 x double> @constrained_vector_sqrt_v3f64() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v3f64: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: sqrtsd %xmm0, %xmm1 -; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: sqrtsd %xmm0, %xmm0 +; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: movsd %xmm0, 16(%rdi) +; CHECK-NEXT: movapd %xmm1, (%rdi) ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_sqrt_v3f64: @@ -1062,26 +1063,32 @@ define <3 x double> @constrained_vector_pow_v3f64() #0 { ; CHECK-LABEL: constrained_vector_pow_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq pow -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq pow -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq pow -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -1318,26 +1325,32 @@ define <3 x double> @constrained_vector_powi_v3f64() #0 { ; CHECK-LABEL: constrained_vector_powi_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2 -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2 -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2 -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; 
CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -1558,23 +1571,29 @@ define <3 x double> @constrained_vector_sin_v3f64() #0 { ; CHECK-LABEL: constrained_vector_sin_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq sin -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq sin -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq sin -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -1782,23 +1801,29 @@ define <3 x double> @constrained_vector_cos_v3f64() #0 { ; CHECK-LABEL: constrained_vector_cos_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; 
CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq cos -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq cos -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq cos -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -2006,23 +2031,29 @@ define <3 x double> @constrained_vector_exp_v3f64() #0 { ; CHECK-LABEL: constrained_vector_exp_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq exp -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq exp -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = 
xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq exp -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -2230,23 +2261,29 @@ define <3 x double> @constrained_vector_exp2_v3f64() #0 { ; CHECK-LABEL: constrained_vector_exp2_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq exp2 -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq exp2 -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq exp2 -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: 
movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -2454,23 +2491,29 @@ define <3 x double> @constrained_vector_log_v3f64() #0 { ; CHECK-LABEL: constrained_vector_log_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -2678,23 +2721,29 @@ define <3 x double> @constrained_vector_log10_v3f64() #0 { ; CHECK-LABEL: constrained_vector_log10_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: 
subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log10 -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log10 -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log10 -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -2902,23 +2951,29 @@ define <3 x double> @constrained_vector_log2_v3f64() #0 { ; CHECK-LABEL: constrained_vector_log2_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log2 -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; 
CHECK-NEXT: callq log2 -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq log2 -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -3104,23 +3159,29 @@ define <3 x double> @constrained_vector_rint_v3f64() #0 { ; CHECK-LABEL: constrained_vector_rint_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq rint -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq rint -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq rint -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; 
CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -3274,23 +3335,29 @@ define <3 x double> @constrained_vector_nearby_v3f64() #0 { ; CHECK-LABEL: constrained_vector_nearby_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq nearbyint -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq nearbyint -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq nearbyint -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; 
CHECK-NEXT: retq ; @@ -3477,26 +3544,32 @@ define <3 x double> @constrained_vector_max_v3f64() #0 { ; CHECK-LABEL: constrained_vector_max_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmax -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmax -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmax -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -3727,26 +3800,32 @@ define <3 x double> @constrained_vector_min_v3f64() #0 { ; CHECK-LABEL: constrained_vector_min_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: 
.cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmin -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmin -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: callq fmin -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -5541,14 +5620,16 @@ define <3 x double> @constrained_vector_fpext_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v3f32: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: cvtss2sd %xmm1, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: cvtss2sd %xmm2, %xmm2 -; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-NEXT: movsd %xmm0, 16(%rdi) +; CHECK-NEXT: movaps %xmm1, (%rdi) ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fpext_v3f32: @@ -5682,23 +5763,29 @@ define <3 x double> @constrained_vector_ceil_v3f64() #0 { ; CHECK-LABEL: constrained_vector_ceil_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq ceil -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq ceil -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq ceil -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -5810,23 +5897,29 @@ define <3 x double> @constrained_vector_floor_v3f64() #0 { ; CHECK-LABEL: constrained_vector_floor_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: 
.cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq floor -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq floor -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq floor -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -5960,23 +6053,29 @@ define <3 x double> @constrained_vector_round_v3f64() #0 { ; CHECK-LABEL: constrained_vector_round_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq round -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq round -; CHECK-NEXT: movsd %xmm0, 
(%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq round -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -6100,23 +6199,29 @@ define <3 x double> @constrained_vector_trunc_v3f64() #0 { ; CHECK-LABEL: constrained_vector_trunc_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: subq $16, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -16 +; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq trunc -; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq trunc -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq trunc -; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) -; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movsd 
(%rsp), %xmm1 # 8-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: movsd %xmm0, 16(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq ; @@ -6322,19 +6427,20 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: cvtsi2sd %eax, %xmm2 +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: movd %xmm1, %ecx ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2sd %eax, %xmm1 +; CHECK-NEXT: cvtsi2sd %ecx, %xmm1 +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: cvtsi2sd %ecx, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movd %xmm0, %ecx ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2sd %eax, %xmm0 -; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: cvtsi2sd %ecx, %xmm0 +; CHECK-NEXT: movsd %xmm0, 16(%rdi) +; CHECK-NEXT: movapd %xmm2, (%rdi) ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32: @@ -6396,11 +6502,14 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtsi2sd %rdi, %xmm0 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cvtsi2sd %rdx, %xmm0 ; CHECK-NEXT: cvtsi2sd %rsi, %xmm1 -; CHECK-NEXT: cvtsi2sd %rdx, %xmm2 -; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsi2sd %rcx, %xmm0 +; CHECK-NEXT: movsd %xmm0, 16(%rdi) +; CHECK-NEXT: movapd %xmm1, (%rdi) ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64: @@ -6976,19 +7085,20 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: cvtsi2sd %rax, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2sd %rax, %xmm1 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: cvtsi2sd %rcx, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-NEXT: movd %xmm2, %ecx +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: cvtsi2sd %rcx, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movd %xmm0, %ecx ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2sd %rax, %xmm0 -; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm2, %xmm0 +; CHECK-NEXT: cvtsi2sd %rcx, %xmm0 +; CHECK-NEXT: movsd %xmm0, 16(%rdi) +; CHECK-NEXT: movapd %xmm1, (%rdi) ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32: @@ -7074,28 +7184,30 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rdi, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25] -; CHECK-NEXT: subpd %xmm3, %xmm1 -; CHECK-NEXT: movapd %xmm1, %xmm0 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; 
CHECK-NEXT: addpd %xmm1, %xmm0 -; CHECK-NEXT: movq %rsi, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-NEXT: subpd %xmm3, %xmm4 -; CHECK-NEXT: movapd %xmm4, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; CHECK-NEXT: addpd %xmm4, %xmm1 -; CHECK-NEXT: movq %rdx, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-NEXT: subpd %xmm3, %xmm4 -; CHECK-NEXT: movapd %xmm4, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; CHECK-NEXT: addpd %xmm4, %xmm2 -; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movapd {{.*#+}} xmm2 = [4.503599627370496E+15,1.9342813113834067E+25] +; CHECK-NEXT: subpd %xmm2, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm3 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; CHECK-NEXT: addpd %xmm0, %xmm3 +; CHECK-NEXT: movq %rsi, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: subpd %xmm2, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm4 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; CHECK-NEXT: addpd %xmm0, %xmm4 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; CHECK-NEXT: movq %rcx, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: subpd %xmm2, %xmm0 +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: addpd %xmm0, %xmm1 +; CHECK-NEXT: movlpd %xmm1, 16(%rdi) +; CHECK-NEXT: movapd %xmm4, (%rdi) ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64: diff --git a/llvm/test/CodeGen/X86/vectorcall.ll b/llvm/test/CodeGen/X86/vectorcall.ll --- a/llvm/test/CodeGen/X86/vectorcall.ll +++ b/llvm/test/CodeGen/X86/vectorcall.ll @@ 
-62,18 +62,25 @@ ; CHECK: xorps %xmm2 ; CHECK: xorps %xmm3 -; FIXME: Returning via x87 isn't compatible, but its hard to structure the -; tablegen any other way. define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() { ret {double, double, double, double, double} { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 } } -; CHECK-LABEL: {{^}}test_fp_4@@0: -; CHECK: fldz -; CHECK: xorps %xmm0 -; CHECK: xorps %xmm1 -; CHECK: xorps %xmm2 -; CHECK: xorps %xmm3 +; X64-LABEL: {{^}}test_fp_4@@0: +; X64: movq $0, 32(%rcx) +; X64: movq $0, 24(%rcx) +; X64: movq $0, 16(%rcx) +; X64: movq $0, 8(%rcx) +; X64: movq $0, (%rcx) + +; FIXME: Returning via x87 isn't compatible, but it's hard to structure the +; tablegen any other way. +; X86-LABEL: {{^}}test_fp_4@@0: +; X86: fldz +; X86: xorps %xmm0 +; X86: xorps %xmm1 +; X86: xorps %xmm2 +; X86: xorps %xmm3 define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) { ret <16 x i8> %b