Index: llvm/trunk/lib/Target/X86/X86FrameLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.h
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.h
@@ -76,6 +76,16 @@
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI) const override;
+
+private:
+  /// convertArgMovsToPushes - This method tries to convert a call sequence
+  /// that uses sub and mov instructions to put the arguments onto the stack
+  /// into a series of pushes.
+  /// Returns true if the transformation succeeded, false if not.
+  bool convertArgMovsToPushes(MachineFunction &MF,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I,
+                              uint64_t Amount) const;
 };
 
 } // End llvm namespace
Index: llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
@@ -93,6 +93,15 @@
   return X86::AND32ri;
 }
 
+static unsigned getPUSHiOpcode(bool IsLP64, int64_t Imm) {
+  // We don't support LP64 for now.
+  assert(!IsLP64);
+
+  if (isInt<8>(Imm))
+    return X86::PUSH32i8;
+  return X86::PUSHi32;
+}
+
 static unsigned getLEArOpcode(unsigned IsLP64) {
   return IsLP64 ? X86::LEA64r : X86::LEA32r;
 }
@@ -1802,6 +1811,103 @@
 #endif
 }
 
+bool X86FrameLowering::
+convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, uint64_t Amount) const {
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+  unsigned StackPtr = RegInfo.getStackRegister();
+
+  // Scan the call setup sequence for the pattern we're looking for.
+  // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
+  // instructions that store a sequence of 32-bit values onto the stack, with
+  // no gaps.
+  std::map<int64_t, MachineInstr *> MovMap;
+  do {
+    int Opcode = I->getOpcode();
+    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+      break;
+
+    // We only want movs of the form:
+    // movl imm/r32, k(%esp)
+    // If we run into something else, bail.
+    // Note that AddrBaseReg may, counterintuitively, not be a register...
+    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+        !I->getOperand(X86::AddrDisp).isImm())
+      return false;
+
+    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+
+    // We don't want to consider the unaligned case.
+    if (StackDisp % 4)
+      return false;
+
+    // If the same stack slot is being filled twice, something's fishy.
+    if (!MovMap.insert(std::pair<int64_t, MachineInstr *>(StackDisp, I)).second)
+      return false;
+
+    ++I;
+  } while (I != MBB.end());
+
+  // We now expect the end of the sequence - a call and a stack adjust.
+  if (I == MBB.end())
+    return false;
+  if (!I->isCall())
+    return false;
+  MachineBasicBlock::iterator Call = I;
+  if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
+    return false;
+
+  // Now, go through the map, and see that we don't have any gaps,
+  // but only a series of 32-bit MOVs.
+  // Since std::map provides ordered iteration, the original order
+  // of the MOVs doesn't matter.
+  int64_t ExpectedDist = 0;
+  for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME;
+       ++MMI, ExpectedDist += 4)
+    if (MMI->first != ExpectedDist)
+      return false;
+
+  // Ok, everything looks fine. Do the transformation.
+  DebugLoc DL = I->getDebugLoc();
+
+  // It's possible the original stack adjustment amount was larger than
+  // that done by the pushes. If so, we still need a SUB.
+  Amount -= ExpectedDist;
+  if (Amount) {
+    MachineInstr *Sub = BuildMI(MBB, Call, DL,
+                                TII.get(getSUBriOpcode(false, Amount)), StackPtr)
+                            .addReg(StackPtr).addImm(Amount);
+    Sub->getOperand(3).setIsDead();
+  }
+
+  // Now, iterate through the map in reverse order, and replace the movs
+  // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to replace uses.
+  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
+    MachineBasicBlock::iterator MOV = MMI->second;
+    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    if (MOV->getOpcode() == X86::MOV32mi) {
+      int64_t Val = PushOp.getImm();
+      BuildMI(MBB, Call, DL, TII.get(getPUSHiOpcode(false, Val)))
+          .addImm(Val);
+    } else {
+      BuildMI(MBB, Call, DL, TII.get(X86::PUSH32r))
+          .addReg(PushOp.getReg());
+    }
+    MBB.erase(MOV);
+  }
+
+  return true;
+}
+
 void X86FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -1809,21 +1915,20 @@
   const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
   unsigned StackPtr = RegInfo.getStackRegister();
-  bool reseveCallFrame = hasReservedCallFrame(MF);
+  bool reserveCallFrame = hasReservedCallFrame(MF);
   int Opcode = I->getOpcode();
   bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL = I->getDebugLoc();
-  uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
   uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
 
   I = MBB.erase(I);
 
-  if (!reseveCallFrame) {
+  if (!reserveCallFrame) {
     // If the stack pointer can be changed after prologue, turn the
     // adjcallstackup instruction into a 'sub ESP, <amt>' and the
     // adjcallstackdown instruction into 'add ESP, <amt>'
-    // TODO: consider using push / pop instead of sub + store / add
     if (Amount == 0)
       return;
@@ -1838,6 +1943,12 @@
 
     MachineInstr *New = nullptr;
     if (Opcode == TII.getCallFrameSetupOpcode()) {
+      // Try to convert movs to the stack into pushes.
+      // We currently only look for a pattern that appears in 32-bit
+      // calling conventions.
+      if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
+        return;
+
       New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
                     StackPtr)
                 .addReg(StackPtr)
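A note on the two core steps above: keying MovMap by stack displacement means
std::map's ordered iteration makes the gap check a single linear walk (the
displacements must be exactly 0, 4, 8, ...), and walking the same map in
reverse emits the pushes so that the first argument (displacement 0) is pushed
last and lands closest to the final %esp. The following standalone C++ sketch
demonstrates that logic detached from the MachineInstr machinery; hasNoGaps
and emitPushesInReverse are illustrative names, not functions from this patch.

#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Dense-run check: the sorted displacements must be exactly 0, 4, 8, ...
static bool hasNoGaps(const std::map<int64_t, std::string> &MovMap) {
  int64_t ExpectedDist = 0;
  for (const auto &KV : MovMap) {
    if (KV.first != ExpectedDist)
      return false;
    ExpectedDist += 4;
  }
  return true;
}

// Emit pushes highest-displacement first, mirroring the rbegin/rend walk.
static std::vector<std::string>
emitPushesInReverse(const std::map<int64_t, std::string> &MovMap) {
  std::vector<std::string> Pushes;
  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI)
    Pushes.push_back("pushl " + MMI->second);
  return Pushes;
}

int main() {
  // movl $1, (%esp); movl $2, 4(%esp); movl $3, 8(%esp); movl $4, 12(%esp)
  std::map<int64_t, std::string> Movs{{0, "$1"}, {4, "$2"}, {8, "$3"}, {12, "$4"}};
  assert(hasNoGaps(Movs));
  std::vector<std::string> Pushes = emitPushesInReverse(Movs);
  assert(Pushes.front() == "pushl $4" && Pushes.back() == "pushl $1");

  // A gap (nothing stored to 4(%esp)) makes the transformation bail out.
  std::map<int64_t, std::string> Gapped{{0, "$1"}, {8, "$3"}};
  assert(!hasNoGaps(Gapped));
  return 0;
}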
Index: llvm/trunk/test/CodeGen/X86/force-align-stack-alloca.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/force-align-stack-alloca.ll
+++ llvm/trunk/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -33,14 +33,14 @@
 ; CHECK-NOT: {{[^ ,]*}}, %esp
 ;
 ; Next we set up the memset call, and then undo it.
-; CHECK: subl $32, %esp
+; CHECK: subl $20, %esp
 ; CHECK-NOT: {{[^ ,]*}}, %esp
 ; CHECK: calll memset
 ; CHECK-NEXT: addl $32, %esp
 ; CHECK-NOT: {{[^ ,]*}}, %esp
 ;
 ; Next we set up the call to 'f'.
-; CHECK: subl $32, %esp
+; CHECK: subl $28, %esp
 ; CHECK-NOT: {{[^ ,]*}}, %esp
 ; CHECK: calll f
 ; CHECK-NEXT: addl $32, %esp
Index: llvm/trunk/test/CodeGen/X86/inalloca-ctor.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/inalloca-ctor.ll
+++ llvm/trunk/test/CodeGen/X86/inalloca-ctor.ll
@@ -17,16 +17,16 @@
 ; CHECK: movl %esp,
   call void @Foo_ctor(%Foo* %c)
 ; CHECK: leal 12(%{{.*}}),
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
 ; CHECK: addl $4, %esp
   %b = getelementptr %frame* %args, i32 0, i32 1
   store i32 42, i32* %b
 ; CHECK: movl $42,
   %a = getelementptr %frame* %args, i32 0, i32 0
   call void @Foo_ctor(%Foo* %a)
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
 ; CHECK: addl $4, %esp
   call void @f(%frame* inalloca %args)
 ; CHECK: calll _f
Index: llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll
+++ llvm/trunk/test/CodeGen/X86/inalloca-invoke.ll
@@ -37,7 +37,7 @@
 
 invoke.cont:
   call void @begin(%Iter* sret %beg)
-; CHECK: movl %[[beg]],
+; CHECK: pushl %[[beg]]
 ; CHECK: calll _begin
 
   invoke void @reverse(%frame.reverse* inalloca align 4 %rev_args)
Index: llvm/trunk/test/CodeGen/X86/inalloca-stdcall.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/inalloca-stdcall.ll
+++ llvm/trunk/test/CodeGen/X86/inalloca-stdcall.ll
@@ -19,7 +19,7 @@
   call x86_stdcallcc void @f(%Foo* inalloca %b)
 ; CHECK: calll _f@8
 ; CHECK-NOT: %esp
-; CHECK: subl $4, %esp
+; CHECK: pushl
 ; CHECK: calll _i@4
   call x86_stdcallcc void @i(i32 0)
   ret void
Index: llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
+++ llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -63,7 +63,7 @@
 ; CHECK-LABEL: _memcpy_vla_vector:
 ; CHECK: andl $-16, %esp
 ; CHECK: movl %esp, %esi
-; CHECK: movl $128, {{.*}}(%esp)
+; CHECK: pushl $128
 ; CHECK: calll _memcpy
 ; CHECK: calll __chkstk
Index: llvm/trunk/test/CodeGen/X86/movtopush.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/movtopush.ll
+++ llvm/trunk/test/CodeGen/X86/movtopush.ll
@@ -0,0 +1,97 @@
+; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
+declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
+declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
+
+; Here, we should have a reserved frame, so we don't expect pushes
+; NORMAL-LABEL: test1
+; NORMAL: subl $16, %esp
+; NORMAL-NEXT: movl $4, 12(%esp)
+; NORMAL-NEXT: movl $3, 8(%esp)
+; NORMAL-NEXT: movl $2, 4(%esp)
+; NORMAL-NEXT: movl $1, (%esp)
+; NORMAL-NEXT: call
+define void @test1() {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Here, we expect a sequence of 4 immediate pushes
+; NORMAL-LABEL: test2
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl $1
+; NORMAL-NEXT: call
+define void @test2(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Again, we expect a sequence of 4 immediate pushes
+; Checks that we generate the right pushes for >8bit immediates
+; NORMAL-LABEL: test2b
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl $4096
+; NORMAL-NEXT: pushl $3072
+; NORMAL-NEXT: pushl $2048
+; NORMAL-NEXT: pushl $1024
+; NORMAL-NEXT: call
+define void @test2b(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
+  ret void
+}
+
+; The push of the first argument should be a register push
+; NORMAL-LABEL: test3
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl $4
+; NORMAL-NEXT: pushl $3
+; NORMAL-NEXT: pushl $2
+; NORMAL-NEXT: pushl %e{{..}}
+; NORMAL-NEXT: call
+define void @test3(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 %k, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; We don't support weird calling conventions
+; NORMAL-LABEL: test4
+; NORMAL: subl $12, %esp
+; NORMAL-NEXT: movl $4, 8(%esp)
+; NORMAL-NEXT: movl $3, 4(%esp)
+; NORMAL-NEXT: movl $1, (%esp)
+; NORMAL-NEXT: movl $2, %eax
+; NORMAL-NEXT: call
+define void @test4(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @inreg(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Check that additional alignment is added when the pushes
+; don't add up to the required alignment.
+; ALIGNED-LABEL: test5
+; ALIGNED: subl $16, %esp
+; ALIGNED-NEXT: pushl $4
+; ALIGNED-NEXT: pushl $3
+; ALIGNED-NEXT: pushl $2
+; ALIGNED-NEXT: pushl $1
+; ALIGNED-NEXT: call
+define void @test5(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
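A closing note on test2b above: it pins down the immediate-width split in
getPUSHiOpcode, which selects PUSH32i8 when the value fits a signed 8-bit
immediate and PUSHi32 otherwise. Below is a minimal standalone sketch of that
decision, assuming only that llvm::isInt<8> tests the signed 8-bit range;
isInt8 and pushOpcodeFor are illustrative stand-ins, not names from the patch.

#include <cstdint>
#include <cstdio>

// Equivalent of llvm::isInt<8>: does Imm fit in a signed 8-bit immediate?
static bool isInt8(int64_t Imm) { return Imm >= -128 && Imm <= 127; }

static const char *pushOpcodeFor(int64_t Imm) {
  return isInt8(Imm) ? "PUSH32i8" : "PUSHi32";
}

int main() {
  // 1..4 (test2) take the short encoding; 1024..4096 (test2b) need the full
  // 32-bit immediate form, which is what the CHECK lines above pin down.
  for (int64_t Imm : {1, 2, 3, 4, 1024, 2048, 3072, 4096})
    std::printf("push $%lld -> %s\n", (long long)Imm, pushOpcodeFor(Imm));
  return 0;
}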