Index: include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- include/llvm/CodeGen/CallingConvLower.h
+++ include/llvm/CodeGen/CallingConvLower.h
@@ -201,6 +201,7 @@
   LLVMContext &Context;
 
   unsigned StackOffset;
+  unsigned MaxStackArgAlign;
   SmallVector<uint32_t, 16> UsedRegs;
   SmallVector<CCValAssign, 4> PendingLocs;
 
@@ -270,7 +271,11 @@
   CallingConv::ID getCallingConv() const { return CallingConv; }
   bool isVarArg() const { return IsVarArg; }
 
-  unsigned getNextStackOffset() const { return StackOffset; }
+  /// getNextStackOffset - Return the stack offset needed to be able to store
+  /// all stack slots according to their alignment requirements.
+  unsigned getNextStackOffset() const {
+    return RoundUpToAlignment(StackOffset, MaxStackArgAlign);
+  }
 
   /// isAllocated - Return true if the specified register (or an alias) is
   /// allocated.
@@ -400,9 +405,10 @@
   /// and alignment.
   unsigned AllocateStack(unsigned Size, unsigned Align) {
     assert(Align && ((Align - 1) & Align) == 0); // Align is power of 2.
-    StackOffset = ((StackOffset + Align - 1) & ~(Align - 1));
+    StackOffset = RoundUpToAlignment(StackOffset, Align);
     unsigned Result = StackOffset;
     StackOffset += Size;
+    MaxStackArgAlign = std::max(Align, MaxStackArgAlign);
     MF.getFrameInfo()->ensureMaxAlignment(Align);
     return Result;
   }
Index: lib/CodeGen/CallingConvLower.cpp
===================================================================
--- lib/CodeGen/CallingConvLower.cpp
+++ lib/CodeGen/CallingConvLower.cpp
@@ -32,6 +32,7 @@
       CallOrPrologue(Unknown) {
   // No stack is used.
   StackOffset = 0;
+  MaxStackArgAlign = 1;
 
   clearByValRegsInfo();
   UsedRegs.resize((TRI.getNumRegs()+31)/32);
@@ -192,6 +193,7 @@
 void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
                                           MVT VT, CCAssignFn Fn) {
   unsigned SavedStackOffset = StackOffset;
+  unsigned SavedMaxStackArgAlign = MaxStackArgAlign;
   unsigned NumLocs = Locs.size();
 
   // Set the 'inreg' flag if it is used for this calling convention.
@@ -223,6 +225,7 @@
   // as allocated so that future queries don't return the same registers, i.e.
   // when i64 and f64 are both passed in GPRs.
   StackOffset = SavedStackOffset;
+  MaxStackArgAlign = SavedMaxStackArgAlign;
   Locs.resize(NumLocs);
 }
 
Index: test/CodeGen/X86/aligned-variadic.ll
===================================================================
--- test/CodeGen/X86/aligned-variadic.ll
+++ test/CodeGen/X86/aligned-variadic.ll
@@ -15,7 +15,7 @@
   %overflow_arg_area = load i8*, i8** %overflow_arg_area_p, align 8
   %overflow_arg_area.next = getelementptr i8, i8* %overflow_arg_area, i64 24
   store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
-; X32: leal 68(%esp), [[REG:%.*]]
+; X32: leal 72(%esp), [[REG:%.*]]
 ; X32: movl [[REG]], 16(%esp)
 ; X64: leaq 232(%rsp), [[REG:%.*]]
 ; X64: movq [[REG]], 184(%rsp)
Index: test/CodeGen/X86/win32-spill-xmm.ll
===================================================================
--- test/CodeGen/X86/win32-spill-xmm.ll
+++ test/CodeGen/X86/win32-spill-xmm.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mcpu=generic -mtriple=i686-pc-windows-msvc -mattr=+sse < %s | FileCheck %s
+; CHECK: subl $32, %esp
+; CHECK: movaps %xmm3, (%esp)
+; CHECK: movl $0, 16(%esp)
+
+declare void @bar(<16 x float> %a, i32 %b) nounwind
+
+define void @foo(i32, <16 x float> * nocapture readonly) nounwind {
+entry:
+  %2 = alloca i32, i32 %0
+  %3 = load <16 x float>, <16 x float> * %1, align 64
+  tail call void @bar(<16 x float> %3, i32 0) nounwind
+  ret void
+}
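
Note on the mechanics, with a minimal standalone C++ sketch (the names
roundUpToAlignment and StackAllocator are hypothetical stand-ins, not LLVM's
actual classes): before this patch, getNextStackOffset() returned the raw
running StackOffset, which stops at the end of the last-allocated slot. When
an earlier slot has a larger alignment than the last one, the reserved
outgoing-argument area can end up smaller than the aligned layout requires.
The patch records the largest alignment ever requested (MaxStackArgAlign) and
pads the final offset up to it.

// Standalone illustration of the bookkeeping the patch adds to CCState.
// roundUpToAlignment() mirrors LLVM's RoundUpToAlignment(), which the patch
// substitutes for the open-coded "(Offset + Align - 1) & ~(Align - 1)".
#include <algorithm>
#include <cassert>
#include <cstdio>

static unsigned roundUpToAlignment(unsigned Value, unsigned Align) {
  return (Value + Align - 1) & ~(Align - 1); // Align must be a power of 2
}

struct StackAllocator {
  unsigned StackOffset = 0;
  unsigned MaxStackArgAlign = 1; // matches the CCState constructor change

  // Same logic as the patched CCState::AllocateStack().
  unsigned allocateStack(unsigned Size, unsigned Align) {
    assert(Align && ((Align - 1) & Align) == 0 && "Align is a power of 2");
    StackOffset = roundUpToAlignment(StackOffset, Align);
    unsigned Result = StackOffset;
    StackOffset += Size;
    MaxStackArgAlign = std::max(Align, MaxStackArgAlign); // new bookkeeping
    return Result;
  }

  // Same logic as the patched CCState::getNextStackOffset(): pad the raw
  // offset so the whole outgoing area honors the strictest alignment seen.
  unsigned getNextStackOffset() const {
    return roundUpToAlignment(StackOffset, MaxStackArgAlign);
  }
};

int main() {
  StackAllocator CC;
  CC.allocateStack(16, 16); // a 16-byte vector chunk at offset 0
  CC.allocateStack(4, 4);   // an i32 at offset 16; raw offset is now 20
  // Rounding 20 up to the 16-byte max alignment yields 32, which is why the
  // new win32-spill-xmm.ll test expects "subl $32, %esp"; the same padding
  // moves the X32 overflow-area offset in aligned-variadic.ll from 68 to 72.
  std::printf("next stack offset = %u\n", CC.getNextStackOffset());
}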