diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -23,6 +23,13 @@
                        "(State.getMachineFunction().getSubtarget()).", F),
             A>;
 
+/// CCIfIsVarArgOnWin - Match if isVarArg on Windows 32bits.
+class CCIfIsVarArgOnWin<CCAction A>
+    : CCIf<"State.isVarArg() && "
+           "State.getMachineFunction().getSubtarget().getTargetTriple()."
+           "isWindowsMSVCEnvironment()",
+           A>;
+
 // Register classes for RegCall
 class RC_X86_RegCall {
   list<Register> GPR_8 = [];
@@ -771,6 +778,22 @@
            CCAssignToStack<64, 64>>
 ]>;
 
+/// CC_X86_Win32_Vector - In X86 Win32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_Win32_Vector : CallingConv<[
+  // Other SSE vectors get 16-byte stack slots that are 4-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+           CCAssignToStack<16, 4>>,
+
+  // 256-bit AVX vectors get 32-byte stack slots that are 4-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
+           CCAssignToStack<32, 4>>,
+
+  // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 4-byte aligned.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
+           CCAssignToStack<64, 4>>
+]>;
+
 // CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
 // vector registers
 def CC_X86_32_Vector_Standard : CallingConv<[
@@ -787,6 +810,7 @@
   CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
                 CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
 
+  CCIfIsVarArgOnWin<CCDelegateTo<CC_X86_Win32_Vector>>,
   CCDelegateTo<CC_X86_32_Vector_Common>
 ]>;
 
diff --git a/llvm/test/CodeGen/X86/vaargs-win32.ll b/llvm/test/CodeGen/X86/vaargs-win32.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vaargs-win32.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=generic -mtriple=i686-pc-windows-msvc -mattr=+sse < %s | FileCheck %s --check-prefix=MSVC
+; RUN: llc -mcpu=generic -mtriple=i686-pc-mingw32 -mattr=+sse < %s | FileCheck %s --check-prefix=MINGW
+
+@a = external dso_local global <4 x float>, align 16
+
+define dso_local void @testPastArguments() nounwind {
+; MSVC-LABEL: testPastArguments:
+; MSVC:       # %bb.0: # %entry
+; MSVC-NEXT:    subl $20, %esp
+; MSVC-NEXT:    movaps _a, %xmm0
+; MSVC-NEXT:    movups %xmm0, 4(%esp)
+; MSVC-NEXT:    movl $1, (%esp)
+; MSVC-NEXT:    calll _testm128
+; MSVC-NEXT:    addl $20, %esp
+; MSVC-NEXT:    retl
+;
+; MINGW-LABEL: testPastArguments:
+; MINGW:       # %bb.0: # %entry
+; MINGW-NEXT:    pushl %ebp
+; MINGW-NEXT:    movl %esp, %ebp
+; MINGW-NEXT:    andl $-16, %esp
+; MINGW-NEXT:    subl $48, %esp
+; MINGW-NEXT:    movaps _a, %xmm0
+; MINGW-NEXT:    movaps %xmm0, 16(%esp)
+; MINGW-NEXT:    movl $1, (%esp)
+; MINGW-NEXT:    calll _testm128
+; MINGW-NEXT:    movl %ebp, %esp
+; MINGW-NEXT:    popl %ebp
+; MINGW-NEXT:    retl
+entry:
+  %0 = load <4 x float>, <4 x float>* @a, align 16
+  %call = tail call i32 (i32, ...) @testm128(i32 1, <4 x float> inreg %0)
+  ret void
+}
+
+declare i32 @testm128(i32, ...) nounwind
diff --git a/llvm/test/CodeGen/X86/win32-spill-xmm.ll b/llvm/test/CodeGen/X86/win32-spill-xmm.ll
--- a/llvm/test/CodeGen/X86/win32-spill-xmm.ll
+++ b/llvm/test/CodeGen/X86/win32-spill-xmm.ll
@@ -20,7 +20,7 @@
 
 ; Check that proper alignment of spilled vector does not affect vargs
 ; CHECK-LABEL: vargs_not_affected
-; CHECK: movl 28(%ebp), %eax
+; CHECK: movl 28(%esp), %eax
 define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
 entry:
   %ap = alloca i8*, align 4
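
For reference, the new vaargs-win32.ll test roughly corresponds to the C sketch below (not part of the patch; testm128 and a mirror the names in the IR, and __m128 stands in for the <4 x float> argument):

  /* Hypothetical C equivalent of the testPastArguments IR above. */
  #include <xmmintrin.h>

  extern __m128 a;              /* matches @a, a 16-byte-aligned global */
  int testm128(int count, ...); /* variadic callee, matches @testm128 */

  void testPastArguments(void) {
    /* With the i686-pc-windows-msvc triple, the vector vararg now takes the
       CC_X86_Win32_Vector path: a 16-byte stack slot with only 4-byte
       alignment, hence the unaligned movups to 4(%esp) in the MSVC CHECK
       lines. The MinGW run line keeps the 16-byte-aligned movaps spill. */
    testm128(1, a);
  }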