Index: lib/Target/X86/X86CallFrameOptimization.cpp =================================================================== --- lib/Target/X86/X86CallFrameOptimization.cpp +++ lib/Target/X86/X86CallFrameOptimization.cpp @@ -50,13 +50,11 @@ bool runOnMachineFunction(MachineFunction &MF) override; private: - bool shouldPerformTransformation(MachineFunction &MF); - // Information we know about a particular call site struct CallContext { CallContext() : Call(nullptr), SPCopy(nullptr), ExpectedDist(0), - MovVector(4, nullptr), UsePush(false){}; + MovVector(4, nullptr), NoStackParams(false), UsePush(false){}; // Actuall call instruction MachineInstr *Call; @@ -70,10 +68,19 @@ // The sequence of movs used to pass the parameters SmallVector MovVector; - // Whether this site should use push instructions + // True if this call site has no stack parameters + bool NoStackParams; + + // True of this callsite can use push instructions bool UsePush; }; + typedef DenseMap ContextMap; + + bool isLegal(MachineFunction &MF); + + bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); + void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, CallContext &Context); @@ -98,9 +105,10 @@ return new X86CallFrameOptimization(); } -// This checks whether the transformation is legal and profitable -bool X86CallFrameOptimization::shouldPerformTransformation( - MachineFunction &MF) { +// This checks whether the transformation is legal. +// Also returns false in cases where it's potentially legal, but +// we don't even want to try. +bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { if (NoX86CFOpt.getValue()) return false; @@ -140,21 +148,21 @@ return false; } - // Now that we know the transformation is legal, check if it is - // profitable. - // TODO: Add a heuristic that actually looks at the function, - // and enable this for more cases. + return true; +} - // This transformation is always a win when we expected to have +// Check whether this trasnformation is profitable for a particular +// function - in terms of code size. +bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, + ContextMap &CallSeqMap) { + // This transformation is always a win when we do not expect to have // a reserved call frame. Under other circumstances, it may be either // a win or a loss, and requires a heuristic. - // For now, enable it only for the relatively clear win cases. bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); if (CannotReserveFrame) return true; - // For now, don't even try to evaluate the profitability when - // not optimizing for size. + // Don't do this when not optimizing for size. AttributeSet FnAttrs = MF.getFunction()->getAttributes(); bool OptForSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex, @@ -164,29 +172,53 @@ if (!OptForSize) return false; - // Stack re-alignment can make this unprofitable even in terms of size. - // As mentioned above, a better heuristic is needed. For now, don't do this - // when the required alignment is above 8. (4 would be the safe choice, but - // some experimentation showed 8 is generally good). - if (TFL->getStackAlignment() > 8) - return false; - return true; + unsigned StackAlign = TFL->getStackAlignment(); + + int64_t Advantage = 0; + for (auto CC : CallSeqMap) { + // Call sites where no parameters are passed on the stack + // do not affect the cost, since there needs to be no + // stack adjustment. + if (CC.second.NoStackParams) + continue; + + if (!CC.second.UsePush) { + // If we don't use pushes for a particular call site, + // we pay for not having a reserved call frame with an + // additional sub/add esp pair. The cost is ~3 bytes per instruction, + // depending on how the size of the constant. + Advantage -= 6; + } else { + // We can use pushes. First, account for the fixed costs. + // We'll need a add after the call. + Advantage -= 3; + // If we have to realign the stack, we'll also need and sub before + if (CC.second.ExpectedDist % StackAlign) + Advantage -= 3; + // Now, for each push, we save ~3 bytes. For small constants, we actually, + // save more (up to 5 bytes), but 3 should be a good approximation. + Advantage += (CC.second.ExpectedDist / 4) * 3; + } + } + + return (Advantage >= 0); } + bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TFL = MF.getSubtarget().getFrameLowering(); MRI = &MF.getRegInfo(); - if (!shouldPerformTransformation(MF)) + if (!isLegal(MF)) return false; int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); bool Changed = false; - DenseMap CallSeqMap; + ContextMap CallSeqMap; for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) @@ -195,6 +227,9 @@ collectCallInfo(MF, *BB, I, Context); } + if (!isProfitable(MF, CallSeqMap)) + return false; + for (auto CC : CallSeqMap) if (CC.second.UsePush) Changed |= adjustCallSequence(MF, CC.first, CC.second); @@ -217,6 +252,16 @@ assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); MachineBasicBlock::iterator FrameSetup = I++; + // How much do we adjust the stack? This puts an upper bound on + // the number of parameters actually passed on it. + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + + // A zero adjustment means no stack parameters + if (!MaxAdjust) { + Context.NoStackParams = true; + return; + } + // For globals in PIC mode, we can have some LEAs here. // Ignore them, they don't bother us. // TODO: Extend this to something that covers more cases. @@ -236,7 +281,6 @@ // We only handle a simple case - a sequence of MOV32mi or MOV32mr // instructions, that push a sequence of 32-bit values onto the stack, with // no gaps between them. - unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; if (MaxAdjust > 4) Context.MovVector.resize(MaxAdjust, nullptr); Index: test/CodeGen/X86/movtopush.ll =================================================================== --- test/CodeGen/X86/movtopush.ll +++ test/CodeGen/X86/movtopush.ll @@ -4,6 +4,9 @@ declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) +declare void @oneparam(i32 %a) +declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) + ; Here, we should have a reserved frame, so we don't expect pushes ; NORMAL-LABEL: test1: @@ -135,6 +138,34 @@ ret void } +; When the alignment adds up, do the transformation +; ALIGNED-LABEL: test5b: +; ALIGNED: pushl $8 +; ALIGNED-NEXT: pushl $7 +; ALIGNED-NEXT: pushl $6 +; ALIGNED-NEXT: pushl $5 +; ALIGNED-NEXT: pushl $4 +; ALIGNED-NEXT: pushl $3 +; ALIGNED-NEXT: pushl $2 +; ALIGNED-NEXT: pushl $1 +; ALIGNED-NEXT: call +define void @test5b() optsize { +entry: + call void @eightparams(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) + ret void +} + +; When having to compensate for the alignment isn't worth it, +; don't use pushes. +; ALIGNED-LABEL: test5c: +; ALIGNED: movl $1, (%esp) +; ALIGNED-NEXT: call +define void @test5c() optsize { +entry: + call void @oneparam(i32 1) + ret void +} + ; Check that pushing the addresses of globals (Or generally, things that ; aren't exactly immediates) isn't broken. ; Fixes PR21878. @@ -256,3 +287,60 @@ call void @good(i32 %myload, i32 2, i32 3, i32 4) ret void } + +; Converting one mov into a push isn't worth it when +; doing so forces too much overhead for other calls. +; NORMAL-LABEL: test12: +; NORMAL: subl $16, %esp +; NORMAL-NEXT: movl $4, 8(%esp) +; NORMAL-NEXT: movl $3, 4(%esp) +; NORMAL-NEXT: movl $1, (%esp) +; NORMAL-NEXT: movl $2, %eax +; NORMAL-NEXT: calll _inreg +; NORMAL-NEXT: movl $8, 12(%esp) +; NORMAL-NEXT: movl $7, 8(%esp) +; NORMAL-NEXT: movl $6, 4(%esp) +; NORMAL-NEXT: movl $5, (%esp) +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: movl $12, 8(%esp) +; NORMAL-NEXT: movl $11, 4(%esp) +; NORMAL-NEXT: movl $9, (%esp) +; NORMAL-NEXT: movl $10, %eax +; NORMAL-NEXT: calll _inreg +; NORMAL-NEXT: addl $16, %esp +define void @test12() optsize { +entry: + call void @inreg(i32 1, i32 2, i32 3, i32 4) + call void @good(i32 5, i32 6, i32 7, i32 8) + call void @inreg(i32 9, i32 10, i32 11, i32 12) + ret void +} + +; But if the gains outweigh the overhead, we should do it +; NORMAL-LABEL: test12b: +; NORMAL: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +; NORMAL-NEXT: subl $12, %esp +; NORMAL-NEXT: movl $8, 8(%esp) +; NORMAL-NEXT: movl $7, 4(%esp) +; NORMAL-NEXT: movl $5, (%esp) +; NORMAL-NEXT: movl $6, %eax +; NORMAL-NEXT: calll _inreg +; NORMAL-NEXT: addl $12, %esp +; NORMAL-NEXT: pushl $12 +; NORMAL-NEXT: pushl $11 +; NORMAL-NEXT: pushl $10 +; NORMAL-NEXT: pushl $9 +; NORMAL-NEXT: calll _good +; NORMAL-NEXT: addl $16, %esp +define void @test12b() optsize { +entry: + call void @good(i32 1, i32 2, i32 3, i32 4) + call void @inreg(i32 5, i32 6, i32 7, i32 8) + call void @good(i32 9, i32 10, i32 11, i32 12) + ret void +}