diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -23,6 +23,13 @@
                       "(State.getMachineFunction().getSubtarget()).", F),
            A>;
 
+/// CCIfIsVarArgOnWin - Match if isVarArg on 32-bit Windows.
+class CCIfIsVarArgOnWin<CCAction A>
+    : CCIf<"State.isVarArg() && "
+           "State.getMachineFunction().getSubtarget().getTargetTriple()."
+           "isOSWindows()",
+           A>;
+
 // Register classes for RegCall
 class RC_X86_RegCall {
   list<Register> GPR_8 = [];
@@ -771,6 +778,22 @@
            CCAssignToStack<64, 64>>
 ]>;
 
+/// CC_X86_Win32_Vector - In X86 Win32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_Win32_Vector : CallingConv<[
+  // Other SSE vectors get 16-byte stack slots that are 4-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+           CCAssignToStack<16, 4>>,
+
+  // 256-bit AVX vectors get 32-byte stack slots that are 4-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
+           CCAssignToStack<32, 4>>,
+
+  // 512-bit AVX vectors get 64-byte stack slots that are 4-byte aligned.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
+           CCAssignToStack<64, 4>>
+]>;
+
 // CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
 // vector registers
 def CC_X86_32_Vector_Standard : CallingConv<[
@@ -787,6 +810,7 @@
   CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
                 CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
 
+  CCIfIsVarArgOnWin<CCDelegateTo<CC_X86_Win32_Vector>>,
   CCDelegateTo<CC_X86_32_Vector_Common>
 ]>;
 
diff --git a/llvm/test/CodeGen/X86/vaargs-win32.ll b/llvm/test/CodeGen/X86/vaargs-win32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vaargs-win32.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mcpu=generic -mtriple=i686-pc-windows-msvc -mattr=+sse < %s | FileCheck %s
+
+@res = external dso_local global <4 x float>, align 16
+@a = external dso_local global <4 x float>, align 16
+
+define dso_local i32 @testm128(i32 returned %argCount, ...) {
+; CHECK-LABEL: testm128:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl 8(%esp), %eax
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, (%esp)
+; CHECK-NEXT:    movups 12(%esp), %xmm0
+; CHECK-NEXT:    movaps %xmm0, _res
+; CHECK-NEXT:    popl %ecx
+; CHECK-NEXT:    retl
+entry:
+  %args = alloca i8*, align 4
+  %0 = bitcast i8** %args to i8*
+  call void @llvm.va_start(i8* nonnull %0)
+  %argp.cur = load i8*, i8** %args, align 4
+  %argp.next = getelementptr inbounds i8, i8* %argp.cur, i32 16
+  store i8* %argp.next, i8** %args, align 4
+  %1 = bitcast i8* %argp.cur to <4 x float>*
+  %2 = load <4 x float>, <4 x float>* %1, align 4
+  store <4 x float> %2, <4 x float>* @res, align 16
+  call void @llvm.va_end(i8* nonnull %0)
+  ret i32 %argCount
+}
+
+define dso_local void @testPastArguments() local_unnamed_addr {
+; CHECK-LABEL: testPastArguments:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subl $20, %esp
+; CHECK-NEXT:    movaps _a, %xmm0
+; CHECK-NEXT:    movups %xmm0, 4(%esp)
+; CHECK-NEXT:    movl $1, (%esp)
+; CHECK-NEXT:    calll _testm128
+; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    retl
+entry:
+  %0 = load <4 x float>, <4 x float>* @a, align 16
+  %call = tail call i32 (i32, ...) @testm128(i32 1, <4 x float> inreg %0)
+  ret void
+}
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)
diff --git a/llvm/test/CodeGen/X86/win32-spill-xmm.ll b/llvm/test/CodeGen/X86/win32-spill-xmm.ll
--- a/llvm/test/CodeGen/X86/win32-spill-xmm.ll
+++ b/llvm/test/CodeGen/X86/win32-spill-xmm.ll
@@ -20,7 +20,7 @@
 
 ; Check that proper alignment of spilled vector does not affect vargs
 ; CHECK-LABEL: vargs_not_affected
-; CHECK: movl 28(%ebp), %eax
+; CHECK: movl 28(%esp), %eax
 define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
 entry:
   %ap = alloca i8*, align 4
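
Note (not part of the patch): as a C-level illustration of the behavior exercised by vaargs-win32.ll above, the sketch below shows roughly the source that produces that IR when built with clang for i686-pc-windows-msvc with SSE enabled. The function and global names simply mirror the test; treat the whole snippet as an assumption-laden example rather than the test's authoritative origin.

#include <stdarg.h>
#include <xmmintrin.h>

__m128 res;

/* With CC_X86_Win32_Vector in effect, the variadic __m128 argument is
   read back from a 4-byte aligned stack slot rather than being assumed
   to arrive in %xmm0. */
int testm128(int argCount, ...) {
  va_list args;
  va_start(args, argCount);
  res = va_arg(args, __m128);   /* matches the 16-byte va_arg step in the IR */
  va_end(args);
  return argCount;
}

void testPastArguments(void) {
  __m128 a = _mm_set1_ps(1.0f);
  testm128(1, a);               /* vector is spilled to the outgoing argument area */
}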