Index: include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- include/llvm/CodeGen/CallingConvLower.h
+++ include/llvm/CodeGen/CallingConvLower.h
@@ -201,6 +201,7 @@
   LLVMContext &Context;
 
   unsigned StackOffset;
+  unsigned MaxStackArgAlign;
   SmallVector<uint32_t, 16> UsedRegs;
   SmallVector<CCValAssign, 4> PendingLocs;
 
@@ -270,7 +271,18 @@
   CallingConv::ID getCallingConv() const { return CallingConv; }
   bool isVarArg() const { return IsVarArg; }
 
-  unsigned getNextStackOffset() const { return StackOffset; }
+  /// getNextStackOffset - Return the next stack offset such that all stack
+  /// slots satisfy their alignment requirements.
+  unsigned getNextStackOffset() const {
+    return StackOffset;
+  }
+
+  /// getAlignedCallFrameSize - Return the size of the call frame needed to
+  /// be able to store all arguments and such that the alignment requirement
+  /// of each of the arguments is satisfied.
+  unsigned getAlignedCallFrameSize() const {
+    return RoundUpToAlignment(StackOffset, MaxStackArgAlign);
+  }
 
   /// isAllocated - Return true if the specified register (or an alias) is
   /// allocated.
@@ -400,9 +412,10 @@
   /// and alignment.
   unsigned AllocateStack(unsigned Size, unsigned Align) {
     assert(Align && ((Align - 1) & Align) == 0); // Align is power of 2.
-    StackOffset = ((StackOffset + Align - 1) & ~(Align - 1));
+    StackOffset = RoundUpToAlignment(StackOffset, Align);
     unsigned Result = StackOffset;
     StackOffset += Size;
+    MaxStackArgAlign = std::max(Align, MaxStackArgAlign);
     MF.getFrameInfo()->ensureMaxAlignment(Align);
     return Result;
   }
Index: lib/CodeGen/CallingConvLower.cpp
===================================================================
--- lib/CodeGen/CallingConvLower.cpp
+++ lib/CodeGen/CallingConvLower.cpp
@@ -32,6 +32,7 @@
       CallOrPrologue(Unknown) {
   // No stack is used.
   StackOffset = 0;
+  MaxStackArgAlign = 1;
 
   clearByValRegsInfo();
   UsedRegs.resize((TRI.getNumRegs()+31)/32);
@@ -192,6 +193,7 @@
 void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
                                           MVT VT, CCAssignFn Fn) {
   unsigned SavedStackOffset = StackOffset;
+  unsigned SavedMaxStackArgAlign = MaxStackArgAlign;
   unsigned NumLocs = Locs.size();
 
   // Set the 'inreg' flag if it is used for this calling convention.
@@ -223,6 +225,7 @@
   // as allocated so that future queries don't return the same registers, i.e.
   // when i64 and f64 are both passed in GPRs.
   StackOffset = SavedStackOffset;
+  MaxStackArgAlign = SavedMaxStackArgAlign;
   Locs.resize(NumLocs);
 }
Index: lib/Target/X86/X86FastISel.cpp
===================================================================
--- lib/Target/X86/X86FastISel.cpp
+++ lib/Target/X86/X86FastISel.cpp
@@ -2906,7 +2906,7 @@
   CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
 
   // Get a count of how many bytes are to be pushed on the stack.
-  unsigned NumBytes = CCInfo.getNextStackOffset();
+  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
 
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -2981,7 +2981,7 @@
   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
 
   // Get a count of how many bytes are to be pushed on the stack.
-  unsigned NumBytes = CCInfo.getNextStackOffset();
+  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
   if (IsSibcall)
     // This is a sibcall. The memory operands are available in caller's
     // own caller's stack.
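For reviewers, here is how the two accessors can diverge. The following is a minimal standalone C++ sketch, not the LLVM implementation: FrameModel, allocateStack, and roundUpToAlignment are hypothetical stand-ins for CCState, AllocateStack, and RoundUpToAlignment. It reproduces the case the patch fixes: a 16-byte-aligned slot allocated before a 4-byte slot gives a raw offset of 20, but the call frame must be 32 bytes so that decrementing an aligned ESP by the frame size keeps the vector slot 16-byte aligned.

  // Standalone model of the allocation logic above (assumed names, not LLVM's).
  #include <algorithm>
  #include <cassert>
  #include <cstdio>

  static unsigned roundUpToAlignment(unsigned Value, unsigned Align) {
    return (Value + Align - 1) & ~(Align - 1); // Align must be a power of 2.
  }

  struct FrameModel {               // hypothetical stand-in for CCState
    unsigned StackOffset = 0;
    unsigned MaxStackArgAlign = 1;

    unsigned allocateStack(unsigned Size, unsigned Align) {
      assert(Align && ((Align - 1) & Align) == 0);
      StackOffset = roundUpToAlignment(StackOffset, Align);
      unsigned Result = StackOffset; // slot offset within the call frame
      StackOffset += Size;
      MaxStackArgAlign = std::max(Align, MaxStackArgAlign);
      return Result;
    }

    unsigned nextStackOffset() const { return StackOffset; }
    unsigned alignedCallFrameSize() const {
      return roundUpToAlignment(StackOffset, MaxStackArgAlign);
    }
  };

  int main() {
    FrameModel F;
    F.allocateStack(16, 16); // a 16-byte vector slot at offset 0
    F.allocateStack(4, 4);   // an i32 slot at offset 16
    // Raw offset is 20; adjusting ESP by only 20 bytes from an aligned ESP
    // would leave the vector slot misaligned, so the frame must be 32 bytes.
    std::printf("%u vs %u\n", F.nextStackOffset(), F.alignedCallFrameSize());
  }

Compiled with any C++11 compiler this prints "20 vs 32", matching the subl $32, %esp expected by the test below.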
Index: test/CodeGen/X86/win32-spill-xmm.ll
===================================================================
--- test/CodeGen/X86/win32-spill-xmm.ll
+++ test/CodeGen/X86/win32-spill-xmm.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mcpu=generic -mtriple=i686-pc-windows-msvc -mattr=+sse < %s | FileCheck %s
+
+; Check proper alignment of spilled vector
+
+; CHECK-LABEL: spill_ok
+; CHECK: subl $32, %esp
+; CHECK: movaps %xmm3, (%esp)
+; CHECK: movl $0, 16(%esp)
+define void @spill_ok(i32, <16 x float> *) {
+entry:
+  %2 = alloca i32, i32 %0
+  %3 = load <16 x float>, <16 x float> * %1, align 64
+  tail call void @bar(<16 x float> %3, i32 0) nounwind
+  ret void
+}
+
+declare void @bar(<16 x float> %a, i32 %b)
+
+; Check that proper alignment of spilled vector does not affect vargs
+
+; CHECK-LABEL: vargs_not_affected
+; CHECK: leal 28(%ebp), %eax
+define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
+entry:
+  %ap = alloca i8*, align 4
+  %0 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %0)
+  %argp.cur = load i8*, i8** %ap, align 4
+  %argp.next = getelementptr inbounds i8, i8* %argp.cur, i32 4
+  store i8* %argp.next, i8** %ap, align 4
+  %1 = bitcast i8* %argp.cur to i32*
+  %2 = load i32, i32* %1, align 4
+  call void @llvm.va_end(i8* %0)
+  ret i32 %2
+}
+
+declare void @llvm.va_start(i8*)
+
+declare void @llvm.va_end(i8*)
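Reading the CHECK lines against the model sketched above: in @spill_ok, the 16-byte portion of the vector argument held in %xmm3 is stored to (%esp) with movaps, which requires a 16-byte-aligned address, and the i32 lands at 16(%esp), so the raw offset is 20 and the aligned call frame is 32 bytes, hence subl $32, %esp. In @vargs_not_affected, the leal 28(%ebp), %eax check appears to verify that rounding up the outgoing call frame does not shift the ebp-relative offset at which va_start finds the first variadic argument.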