Index: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp +++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp @@ -155,12 +155,22 @@ // This is bad, and breaks SP adjustment. // So, check that all of the frames in the function are closed inside // the same block, and, for good measure, that there are no nested frames. + // + // If any call allocates at least as much argument stack memory as the + // stack probe size, don't do this optimization. Otherwise, this pass + // would need to synthesize additional stack probe calls to allocate + // memory for arguments. unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + bool UseStackProbe = + !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty(); + unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { if (MI.getOpcode() == FrameSetupOpcode) { + if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe) + return false; if (InsideFrameSequence) return false; InsideFrameSequence = true; Index: llvm/trunk/lib/Target/X86/X86FrameLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86FrameLowering.cpp +++ llvm/trunk/lib/Target/X86/X86FrameLowering.cpp @@ -1022,14 +1022,7 @@ X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); - - // The default stack probe size is 4096 if the function has no stackprobesize - // attribute. 
- unsigned StackProbeSize = 4096; - if (Fn.hasFnAttribute("stack-probe-size")) - Fn.getFnAttribute("stack-probe-size") - .getValueAsString() - .getAsInteger(0, StackProbeSize); + unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is // used and an error code was pushed, since the x86-64 ABI requires a 16-byte Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -1207,6 +1207,8 @@ StringRef getStackProbeSymbolName(MachineFunction &MF) const override; + unsigned getStackProbeSize(MachineFunction &MF) const; + bool hasVectorBlend() const override { return true; } unsigned getMaxSupportedInterleaveFactor() const override { return 4; } Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -44970,3 +44970,16 @@ return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } + +unsigned +X86TargetLowering::getStackProbeSize(MachineFunction &MF) const { + // The default stack probe size is 4096 if the function has no + // "stack-probe-size" attribute. 
+ unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + return StackProbeSize; +} Index: llvm/trunk/test/CodeGen/X86/nomovtopush.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/nomovtopush.ll +++ llvm/trunk/test/CodeGen/X86/nomovtopush.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-pc-windows-msvc | FileCheck %s + +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i386-pc-windows-msvc" + +%struct._param_str = type { i32, i32, [4096 x i32], i32 } + +@g_d = common dso_local local_unnamed_addr global i32 0, align 4 +@g_c = common dso_local local_unnamed_addr global i32 0, align 4 +@g_b = common dso_local local_unnamed_addr global i32 0, align 4 +@g_a = common dso_local local_unnamed_addr global i32 0, align 4 +@g_param = common dso_local global %struct._param_str zeroinitializer, align 4 + +; Function Attrs: nounwind +define dso_local i32 @test() local_unnamed_addr { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl $16396, %eax # imm = 0x400C +; CHECK-NEXT: calll __chkstk +; CHECK-NEXT: movl _g_d, %eax +; CHECK-NEXT: movl _g_c, %ecx +; CHECK-NEXT: movl _g_b, %edx +; CHECK-NEXT: movl _g_a, %esi +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, (%esp) +; CHECK-NEXT: calll _bar +; CHECK-NEXT: movl $4099, %ecx # imm = 0x1003 +; CHECK-NEXT: movl %esp, %edi +; CHECK-NEXT: movl $_g_param, %esi +; CHECK-NEXT: rep;movsl (%esi), %es:(%edi) +; CHECK-NEXT: calll _foo +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: addl $16396, %esp # imm = 0x400C +; CHECK-NEXT: popl %esi +; 
CHECK-NEXT: popl %edi +; CHECK-NEXT: retl +entry: + %0 = load i32, i32* @g_d, align 4, !tbaa !3 + %1 = load i32, i32* @g_c, align 4, !tbaa !3 + %2 = load i32, i32* @g_b, align 4, !tbaa !3 + %3 = load i32, i32* @g_a, align 4, !tbaa !3 + %call = tail call i32 @bar(i32 %3, i32 %2, i32 %1, i32 %0) #2 + tail call void @foo(%struct._param_str* byval nonnull align 4 @g_param) #2 + ret i32 0 +} + +declare dso_local i32 @bar(i32, i32, i32, i32) local_unnamed_addr + +declare dso_local void @foo(%struct._param_str* byval align 4) local_unnamed_addr + +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"}