Index: lib/Target/X86/X86CallFrameOptimization.cpp =================================================================== --- lib/Target/X86/X86CallFrameOptimization.cpp +++ lib/Target/X86/X86CallFrameOptimization.cpp @@ -20,6 +20,7 @@ #include #include "X86.h" +#include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -51,31 +52,59 @@ bool runOnMachineFunction(MachineFunction &MF) override; private: - // Information we know about a particular call site + // Information about the setup for an inalloca call. + struct InAllocaInfo { + // Frame setup for the _chkstk call. + MachineBasicBlock::iterator FrameSetup; + + // Move of _chkstk amount into virtual register. + MachineBasicBlock::iterator AmountInstr; + + // Move of virtual register with _chkstk amount into %eax. + MachineBasicBlock::iterator AmountMov; + + // Call to _chkstk. + MachineBasicBlock::iterator ChkstkCall; + + // Copy of _chkstk result into virtual register. + MachineBasicBlock::iterator ChkstkResultCopy; + + // Frame destroy for the _chkstk call. + MachineBasicBlock::iterator FrameDestroy; + }; + + // Information we know about a particular call site. struct CallContext { CallContext() - : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0), - MovVector(4, nullptr), NoStackParams(false), UsePush(false) {} + : FrameSetup(nullptr), IsInAlloca(false), Call(nullptr), SPCopy(nullptr), + ExpectedDist(0), MovVector(4, nullptr), NoStackParams(false), + UsePush(false) {} - // Iterator referring to the frame setup instruction + // Iterator referring to the frame setup instruction. MachineBasicBlock::iterator FrameSetup; - // Actual call instruction + // Whether this is an inalloca call. + bool IsInAlloca; + + // If this is an inalloca call, information about the setup for that. + InAllocaInfo InAllocaSetup; + + // Actual call instruction. MachineInstr *Call; - // A copy of the stack pointer + // A copy of the stack pointer. 
MachineInstr *SPCopy; - // The total displacement of all passed parameters + // The total displacement of all passed parameters. int64_t ExpectedDist; - // The sequence of movs used to pass the parameters + // The sequence of movs used to pass the parameters. SmallVector MovVector; - // True if this call site has no stack parameters + // True if this call site has no stack parameters. bool NoStackParams; - // True if this call site can use push instructions + // True if this call site can use push instructions. bool UsePush; }; @@ -85,6 +114,9 @@ bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap); + bool matchInAlloca(MachineBasicBlock::iterator &I, InAllocaInfo &Info, + unsigned int &MaxAdjust); + void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, CallContext &Context); @@ -314,6 +346,60 @@ return Skip; } +bool X86CallFrameOptimization::matchInAlloca(MachineBasicBlock::iterator &I, + InAllocaInfo &Info, + unsigned int &MaxAdjust) { + // inalloca is only expected to occur in 32-bit code. + if (STI->is64Bit()) + return false; + + assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); + + // FrameSetup for the _chkstk call. + if (I->getOperand(0).getImm() != 0 || I->getOperand(1).getImm() != 0) + return false; + Info.FrameSetup = I++; + + // Often, there's an instruction here that moves the _chkstk amount into a + // virtual register. Ignore it for now; we'll look into that below. + if (I->getOpcode() == X86::MOV32ri) + I++; + + // Match move of virtual register to %eax, the _chkstk argument. + if (!I->isCopy() || !I->getOperand(0).isReg() || !I->getOperand(1).isReg() || + I->getOperand(0).getReg() != X86::EAX) + return false; + Info.AmountMov = I++; + + // Get the definition of that virtual register. 
+ unsigned ChkstkAmountVreg = Info.AmountMov->getOperand(1).getReg(); + MachineInstr *Def = MRI->getUniqueVRegDef(ChkstkAmountVreg); + if (!Def || Def->getOpcode() != X86::MOV32ri || !Def->getOperand(1).isImm()) + return false; + Info.AmountInstr = Def; + MaxAdjust = Def->getOperand(1).getImm() >> Log2SlotSize; + + // Match call to chkstk. + if (!I->isCall() || !I->getOperand(0).isSymbol() || + StringRef(I->getOperand(0).getSymbolName()) != "_chkstk") + return false; + Info.ChkstkCall = I++; + + // Match copy of %esp (the result of _chkstk) to a register. + if (!I->isCopy() || !I->getOperand(0).isReg() || !I->getOperand(1).isReg() || + I->getOperand(1).getReg() != X86::ESP) + return false; + Info.ChkstkResultCopy = I++; + + // Match FrameDestroy for _chkstk call. + if (I->getOpcode() != TII->getCallFrameDestroyOpcode() || + I->getOperand(0).getImm() != 0 || I->getOperand(1).getImm() != 0) + return false; + Info.FrameDestroy = I++; + + return true; +} + void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -322,10 +408,11 @@ // transformation. const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo()); + unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); // We expect to enter this at the beginning of a call sequence - assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); + assert(I->getOpcode() == FrameSetupOpcode); MachineBasicBlock::iterator FrameSetup = I++; Context.FrameSetup = FrameSetup; @@ -334,10 +421,16 @@ unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() >> Log2SlotSize; - // A zero adjustment means no stack parameters if (!MaxAdjust) { - Context.NoStackParams = true; - return; + // It might be an inalloca call. Back up and check for that.
+ --I; + if (matchInAlloca(I, Context.InAllocaSetup, MaxAdjust)) { + Context.IsInAlloca = true; + } else { + // Otherwise, a zero adjustment means no stack parameters. + Context.NoStackParams = true; + return; + } } // For globals in PIC mode, we can have some LEAs here. @@ -346,15 +439,20 @@ while (I->getOpcode() == X86::LEA32r) ++I; - // We expect a copy instruction here. - // TODO: The copy instruction is a lowering artifact. - // We should also support a copy-less version, where the stack - // pointer is used directly. - if (!I->isCopy() || !I->getOperand(0).isReg()) - return; - Context.SPCopy = I++; + unsigned StackPtr; - unsigned StackPtr = Context.SPCopy->getOperand(0).getReg(); + if (Context.IsInAlloca) { + StackPtr = Context.InAllocaSetup.ChkstkResultCopy->getOperand(0).getReg(); + } else { + // We expect a copy instruction here. + // TODO: The copy instruction is a lowering artifact. + // We should also support a copy-less version, where the stack + // pointer is used directly. + if (!I->isCopy() || !I->getOperand(0).isReg()) + return; + Context.SPCopy = I++; + StackPtr = Context.SPCopy->getOperand(0).getReg(); + } // Scan the call setup sequence for the pattern we're looking for. // We only handle a simple case - a sequence of store instructions that @@ -419,6 +517,13 @@ ++I; } + // For an inalloca call, the FrameSetup instruction for the call is here. + if (Context.IsInAlloca) { + if (I == MBB.end() || I->getOpcode() != FrameSetupOpcode) + return; + Context.FrameSetup = I++; + } + // We now expect the end of the sequence. If we stopped early, // or reached the end of the block without finding a call, bail. if (I == MBB.end() || !I->isCall()) @@ -450,6 +555,13 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, const CallContext &Context) { + if (Context.IsInAlloca) { + // Move the FrameSetup instruction for the call to before the moves. 
+ assert(Context.MovVector.size() > 0 && "No moves?"); + auto *MBB = Context.MovVector[0]->getParent(); + MBB->insert(Context.MovVector[0], Context.FrameSetup->removeFromParent()); + } + // Ok, we can in fact do the transformation for this call. // Do not remove the FrameSetup instruction, but adjust the parameters. // PEI will end up finalizing the handling of this. @@ -537,10 +649,35 @@ MBB.erase(MOV); } - // The stack-pointer copy is no longer used in the call sequences. - // There should not be any other users, but we can't commit to that, so: - if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg())) - Context.SPCopy->eraseFromParent(); + if (!Context.IsInAlloca) { + // The stack-pointer copy is no longer used in the call sequences. + // There should not be any other users, but we can't commit to that, so: + if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg())) + Context.SPCopy->eraseFromParent(); + } else { + // Remove the inalloca call setup (in reverse to delete uses before defs). + Context.InAllocaSetup.FrameDestroy->eraseFromParent(); + + unsigned ChkstkRes = + Context.InAllocaSetup.ChkstkResultCopy->getOperand(0).getReg(); + if (!MRI->use_empty(ChkstkRes)) { + // Something is using the result of _chkstk. Provide a replacement. + unsigned NewReg = MRI->createVirtualRegister(&X86::GR32RegClass); + addRegOffset(BuildMI(MBB, Context.InAllocaSetup.ChkstkResultCopy, DL, + TII->get(X86::LEA32r), NewReg), + X86::ESP, false, -Context.ExpectedDist); + MRI->replaceRegWith(ChkstkRes, NewReg); + } + Context.InAllocaSetup.ChkstkResultCopy->eraseFromParent(); + + Context.InAllocaSetup.ChkstkCall->eraseFromParent(); + Context.InAllocaSetup.AmountMov->eraseFromParent(); + if (MRI->use_empty( + Context.InAllocaSetup.AmountInstr->getOperand(0).getReg())) { + Context.InAllocaSetup.AmountInstr->eraseFromParent(); + } + Context.InAllocaSetup.FrameSetup->eraseFromParent(); + } // Once we've done this, we need to make sure PEI doesn't assume a reserved // frame.
Index: test/CodeGen/X86/inalloca-callframeopt.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/inalloca-callframeopt.ll @@ -0,0 +1,96 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s + +%struct.S = type { i32 } +declare void @f(<{ %struct.S }>* inalloca) + +define void @basic() { +entry: + %argmem = alloca inalloca <{ %struct.S }>, align 4 + %x = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0 + store i32 42, i32* %x, align 4 + call void @f(<{ %struct.S }>* inalloca %argmem) + ret void + +; CHECK-LABEL: basic: +; TODO: We've removed the dynamic alloca; make frame pointer omission possible. +; CHECK: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NOT: calll __chkstk +; CHECK-NOT: movl $42 +; CHECK-NEXT: pushl $42 +; CHECK-NEXT: calll _f +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +} + + +%struct.T = type { i32*, [1 x i32] } +declare void @g(<{ %struct.T }>* inalloca) + +define void @stack_reference_arg() { +entry: + %argmem = alloca inalloca <{ %struct.T }>, align 4 + %arrayinit.begin.i = getelementptr inbounds <{ %struct.T }>, <{ %struct.T }>* %argmem, i32 0, i32 0, i32 1, i32 0 + store i32 1, i32* %arrayinit.begin.i, align 4 + %p.i = getelementptr inbounds <{ %struct.T }>, <{ %struct.T }>* %argmem, i32 0, i32 0, i32 0 + store i32* %arrayinit.begin.i, i32** %p.i, align 4 + call void @g(<{ %struct.T }>* inalloca %argmem) + ret void + +; One of the arguments is a pointer into the call frame. +; FIXME: It would be cool if we could fold away the add instruction. 
+; CHECK-LABEL: stack_reference_arg: +; CHECK: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: leal -8(%esp), %eax +; CHECK-NEXT: addl $4, %eax +; CHECK-NEXT: pushl $1 +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: calll _g +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +} + + +define void @two_calls() { +entry: + %inalloca.save = tail call i8* @llvm.stacksave() + %argmem = alloca inalloca <{ %struct.S }>, align 4 + %x = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0 + store i32 42, i32* %x, align 4 + call void @f(<{ %struct.S }>* inalloca nonnull %argmem) + call void @llvm.stackrestore(i8* %inalloca.save) + %argmem3 = alloca inalloca <{ %struct.S }>, align 4 + %x2 = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem3, i32 0, i32 0, i32 0 + store i32 42, i32* %x2, align 4 + call void @f(<{ %struct.S }>* inalloca nonnull %argmem3) + ret void + +; Two inalloca calls after each other. The vreg used for the _chkstk argument +; is shared between them. +; FIXME: Clang puts stacksave/restore around the first call, which are +; redundant since the stack is adjusted back after the call anyway. If +; we wanted to be aggressive, we could even skip adjusting the stack +; back between the calls.
+; CHECK-LABEL: two_calls +; CHECK: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl %esp, %esi +; CHECK-NEXT: pushl $42 +; CHECK-NEXT: calll _f +; CHECK-NEXT: addl $4, %esp +; CHECK-NEXT: movl %esi, %esp +; CHECK-NEXT: pushl $42 +; CHECK-NEXT: calll _f +; CHECK-NEXT: leal -4(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +} + + +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*) Index: test/CodeGen/X86/inalloca-stdcall.ll =================================================================== --- test/CodeGen/X86/inalloca-stdcall.ll +++ test/CodeGen/X86/inalloca-stdcall.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s +; RUN: llc < %s -mtriple=i686-pc-win32 -no-x86-call-frame-opt | FileCheck %s %Foo = type { i32, i32 } @@ -19,8 +19,8 @@ ; CHECK: movl $42, 4(%eax) call x86_stdcallcc void @f(%Foo* inalloca %b) ; CHECK: calll _f@8 -; CHECK-NOT: %esp -; CHECK: pushl +; CHECK: subl $4, %esp +; CHECK: movl $0, (%esp) ; CHECK: calll _i@4 call x86_stdcallcc void @i(i32 0) ret void Index: test/CodeGen/X86/inalloca.ll =================================================================== --- test/CodeGen/X86/inalloca.ll +++ test/CodeGen/X86/inalloca.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s +; RUN: llc < %s -mtriple=i686-pc-win32 -no-x86-call-frame-opt | FileCheck %s %Foo = type { i32, i32 }