Index: lib/Target/X86/X86CallFrameOptimization.cpp
===================================================================
--- lib/Target/X86/X86CallFrameOptimization.cpp
+++ lib/Target/X86/X86CallFrameOptimization.cpp
@@ -105,7 +105,7 @@
   const TargetInstrInfo *TII;
   const X86FrameLowering *TFL;
   const X86Subtarget *STI;
-  const MachineRegisterInfo *MRI;
+  MachineRegisterInfo *MRI;
   unsigned SlotSize;
   unsigned Log2SlotSize;
   static char ID;
@@ -125,14 +125,6 @@
   if (NoX86CFOpt.getValue())
     return false;
 
-  // We currently only support call sequences where *all* parameters.
-  // are passed on the stack.
-  // No point in running this in 64-bit mode, since some arguments are
-  // passed in-register in all common calling conventions, so the pattern
-  // we're looking for will never match.
-  if (STI->is64Bit())
-    return false;
-
   // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
   // in the compact unwind encoding that Darwin uses. So, bail if there
   // is a danger of that being generated.
@@ -141,6 +133,11 @@
       (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
     return false;
 
+  // It is not valid to change the stack pointer outside the prolog/epilog
+  // on 64-bit Windows.
+  if (STI->isTargetWin64())
+    return false;
+
   // You would expect straight-line code between call-frame setup and
   // call-frame destroy. You would be wrong. There are circumstances (e.g.
   // CMOV_GR8 expansion of a select that feeds a function call!) where we can
@@ -204,7 +201,7 @@
   // We can use pushes. First, account for the fixed costs.
   // We'll need a add after the call.
   Advantage -= 3;
-  // If we have to realign the stack, we'll also need and sub before
+  // If we have to realign the stack, we'll also need a sub before the pushes.
   if (CC.ExpectedDist % StackAlign)
     Advantage -= 3;
   // Now, for each push, we save ~3 bytes. For small constants, we actually,
@@ -264,7 +261,8 @@
   // The instructions we actually care about are movs onto the stack
   int Opcode = MI->getOpcode();
-  if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
+  if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr ||
+      Opcode == X86::MOV64mi32 || Opcode == X86::MOV64mr)
     return Convert;
 
   // Not all calling conventions have only stack MOVs between the stack
@@ -457,6 +455,7 @@
   FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
 
   DebugLoc DL = FrameSetup->getDebugLoc();
+  bool Is64Bit = STI->is64Bit();
   // Now, iterate through the vector in reverse order, and replace the movs
   // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
   // replace uses.
@@ -469,7 +468,8 @@
     default:
      llvm_unreachable("Unexpected Opcode!");
    case X86::MOV32mi:
-      PushOpcode = X86::PUSHi32;
+    case X86::MOV64mi32:
+      PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32;
      // If the operand is a small (8-bit) immediate, we can use a
      // PUSH instruction with a shorter encoding.
      // Note that isImm() may fail even though this is a MOVmi, because
@@ -477,14 +477,27 @@
      if (PushOp.isImm()) {
        int64_t Val = PushOp.getImm();
        if (isInt<8>(Val))
-          PushOpcode = X86::PUSH32i8;
+          PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
      }
      Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
                 .addOperand(PushOp);
      break;
    case X86::MOV32mr:
+    case X86::MOV64mr:
      unsigned int Reg = PushOp.getReg();
 
+      // If storing a 32-bit vreg on 64-bit targets, extend it to a 64-bit
+      // vreg in preparation for the PUSH64. The upper 32 bits can be undef.
+      if (Is64Bit && MOV->getOpcode() == X86::MOV32mr) {
+        unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+        Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
+        BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
+        BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
+            .addReg(UndefReg)
+            .addOperand(PushOp)
+            .addImm(X86::sub_32bit);
+      }
+
      // If PUSHrmm is not slow on this target, try to fold the source of the
      // push into the instruction.
      bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
@@ -493,7 +506,7 @@
      // conservative about that.
      MachineInstr *DefMov = nullptr;
      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
-        PushOpcode = X86::PUSH32rmm;
+        PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
        Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));
 
        unsigned NumOps = DefMov->getDesc().getNumOperands();
@@ -502,7 +515,7 @@
 
        DefMov->eraseFromParent();
      } else {
-        PushOpcode = X86::PUSH32r;
+        PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
        Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
                   .addReg(Reg)
                   .getInstr();
@@ -557,7 +570,8 @@
 
  // Make sure the def is a MOV from memory.
  // If the def is in another block, give up.
-  if (DefMI->getOpcode() != X86::MOV32rm ||
+  if ((DefMI->getOpcode() != X86::MOV32rm &&
+       DefMI->getOpcode() != X86::MOV64rm) ||
      DefMI->getParent() != FrameSetup->getParent())
    return nullptr;
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -2137,6 +2137,12 @@
  case X86::PUSH32rmr:
  case X86::PUSHi32:
    return 4;
+  case X86::PUSH64i8:
+  case X86::PUSH64r:
+  case X86::PUSH64rmm:
+  case X86::PUSH64rmr:
+  case X86::PUSH64i32:
+    return 8;
  }
 }
Index: test/CodeGen/X86/movtopush64.ll
===================================================================
--- test/CodeGen/X86/movtopush64.ll
+++ test/CodeGen/X86/movtopush64.ll
@@ -0,0 +1,192 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=NOPUSH
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
+
+declare void @seven_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g)
+declare void @ten_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i64 %h, i32 %i, i64 %j)
+declare void @ten_params_ptr(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i8* %h, i32 %i, i64 %j)
+declare void @cannot_push(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i)
+
+; We should get pushes for the last 4 parameters. Test that the
+; in-register parameters are all in the right places, and check
+; that the stack manipulations are correct and correctly
+; described by the DWARF directives. Test that the switch
+; to disable the optimization works, and that the optimization
+; doesn't kick in on Windows64, where it is not allowed.
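+;
+; As a rough sketch (not literal llc output; the sub placement and exact
+; offsets are illustrative), the rewrite this file exercises replaces the
+; mov-to-stack sequence that would otherwise pass the four stack arguments:
+;   subq  $32, %rsp
+;   movl  $7, (%rsp)
+;   movq  $8, 8(%rsp)
+;   movl  $9, 16(%rsp)
+;   movq  $10, 24(%rsp)
+;   callq ten_params
+; with the push-based sequence checked below:
+;   pushq $10
+;   pushq $9
+;   pushq $8
+;   pushq $7
+;   callq ten_params
+;   addq  $32, %rsp
+;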
+; NORMAL-LABEL: test1
+; NORMAL: pushq
+; NORMAL-DAG: movl $1, %edi
+; NORMAL-DAG: movl $2, %esi
+; NORMAL-DAG: movl $3, %edx
+; NORMAL-DAG: movl $4, %ecx
+; NORMAL-DAG: movl $5, %r8d
+; NORMAL-DAG: movl $6, %r9d
+; NORMAL: pushq $10
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $9
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $8
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $7
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: callq ten_params
+; NORMAL: addq $32, %rsp
+; NORMAL: .cfi_adjust_cfa_offset -32
+; NORMAL: popq
+; NORMAL: retq
+; NOPUSH-LABEL: test1
+; NOPUSH-NOT: pushq
+; NOPUSH: retq
+define void @test1() {
+entry:
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+  ret void
+}
+
+; The presence of a frame pointer should not prevent pushes, but then we
+; don't need the CFI directives.
+; Also check that we generate the right pushes for >8-bit immediates.
+; NORMAL-LABEL: test2
+; NORMAL: pushq $10000
+; NORMAL-NEXT: pushq $9000
+; NORMAL-NEXT: pushq $8000
+; NORMAL-NEXT: pushq $7000
+; NORMAL-NEXT: callq ten_params
+define void @test2(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7000, i64 8000, i32 9000, i64 10000)
+  ret void
+}
+
+; Parameters 7 and 8 should be pushed as 64-bit registers.
+; TODO: Note that the regular expressions disallow r8 and r9. That's fine for
+; now, because the pushes will always follow the moves into r8 and r9.
+; Eventually, though, we want to be able to schedule the pushes better.
+; In this example, that would save two copies, because we have to move the
+; incoming parameters out of %rdi and %rsi to make room for the outgoing
+; parameters.
+; NORMAL-LABEL: test3
+; NORMAL: pushq $10000
+; NORMAL: pushq $9000
+; NORMAL: pushq %r{{..}}
+; NORMAL: pushq %r{{..}}
+; NORMAL: callq ten_params
+define void @test3(i32 %a, i64 %b) {
+entry:
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %a, i64 %b, i32 9000, i64 10000)
+  ret void
+}
+
+; Check that we avoid the optimization when it would result in just one push.
+; NORMAL-LABEL: test4
+; NORMAL: movl $7, (%rsp)
+; NORMAL: callq seven_params
+define void @test4() {
+entry:
+  call void @seven_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7)
+  ret void
+}
+
+; Check that pushing link-time constant addresses works correctly.
+; NORMAL-LABEL: test5
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $ext
+; NORMAL: pushq $7
+; NORMAL: callq ten_params_ptr
+@ext = external constant i8
+define void @test5() {
+entry:
+  call void @ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i8* @ext, i32 9, i64 10)
+  ret void
+}
+
+; Check that we fold 64-bit loads, but not 32-bit loads, into PUSH mem:
+; a 32-bit load cannot be folded into the 8-byte read of a 64-bit push.
+; NORMAL-LABEL: test6
+; NORMAL: movq %rsi, [[REG64:%.+]]
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq ([[REG64]])
+; NORMAL: pushq {{%r..}}
+; NORMAL: callq ten_params
+define void @test6(i32* %p32, i64* %p64) {
+entry:
+  %v32 = load i32, i32* %p32
+  %v64 = load i64, i64* %p64
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %v32, i64 %v64, i32 9, i64 10)
+  ret void
+}
+
+; Fold stack-relative loads into the pushes with the correct offsets.
+; Do the same for an indirect call whose address is loaded from the stack.
+; On entry, %p7 is at 8(%rsp) and %p8 is at 16(%rsp).
+; Prior to the call sequence, 72 bytes are allocated on the stack: 48 for
+; register saves and 24 for local storage and alignment. So %p7 is at
+; 80(%rsp) and %p8 is at 88(%rsp). The call address can be stored anywhere
+; in the local space, but happens to be stored at 8(%rsp). Each push bumps
+; these offsets up by 8 bytes.
+; NORMAL-LABEL: test7
+; NORMAL: movq %r{{.*}}, 8(%rsp) {{.*Spill$}}
+; NORMAL: pushq 88(%rsp)
+; NORMAL: pushq $9
+; NORMAL: pushq 96(%rsp)
+; NORMAL: pushq $7
+; NORMAL: callq *40(%rsp)
+define void @test7(i64 %p1, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6, i64 %p7, i64 %p8) {
+entry:
+  %stack_fptr = alloca void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*
+  store void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)* @ten_params, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+  %ten_params_ptr = load volatile void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+  call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  call void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64) %ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %p7, i32 9, i64 %p8)
+  ret void
+}
+
+; We can't fold the load from the global into the push because of
+; interference from the store.
+; NORMAL-LABEL: test8
+; NORMAL: movq the_global(%rip), [[REG:%r.+]]
+; NORMAL: movq $42, the_global
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq [[REG]]
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+@the_global = external global i64
+define void @test8() {
+  %myload = load i64, i64* @the_global
+  store i64 42, i64* @the_global
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %myload, i32 9, i64 10)
+  ret void
+}
+
+; Converting one function call to use pushes means the function can no longer
+; reserve its call frame, which negatively affects other calls that pass
+; arguments on the stack without pushes. If the cost outweighs the benefit,
+; avoid using pushes.
+; NORMAL-LABEL: test9
+; NORMAL: callq cannot_push
+; NORMAL-NOT: push
+; NORMAL: callq ten_params
+define void @test9(float %p1) {
+  call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+  call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+  ret void
+}
+
+; But if the benefit outweighs the cost, use pushes.
+; NORMAL-LABEL: test10
+; NORMAL: callq cannot_push
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $8
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+define void @test10(float %p1) {
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+  call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+  call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+  ret void
+}
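
A note on the 32-bit-store widening (commentary, not part of the patch): a
64-bit push always writes a full 8-byte slot, so a value that was stored with
MOV32mr must first be placed in the low half of a 64-bit vreg. In rough
MIR-style pseudocode (register names are illustrative), the mov-to-push
rewrite above emits:

  %undef64 = IMPLICIT_DEF                              ; GR64, entirely undef
  %wide64 = INSERT_SUBREG %undef64, %val32, sub_32bit  ; low 32 bits = %val32
  PUSH64r %wide64                                      ; writes the 8-byte slot

This matches the comment in the patch: the upper 32 bits of the slot can be
undef, because an i32 argument only occupies the low 32 bits of its slot and
the callee does not read the rest.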