Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -2498,10 +2498,10 @@
 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
 /// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
-                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
-                         unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, SDLoc dl) {
   // Store the return address to the appropriate stack slot.
   if (!FPDiff) return Chain;
   // Calculate the new stack slot for the return address.
@@ -2538,17 +2538,19 @@
   if (MF.getTarget().Options.DisableTailCalls)
     isTailCall = false;
 
-  if (isTailCall) {
+  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+  if (IsMustTail) {
+    // Force this to be a tail call. The verifier rules are enough to ensure
+    // that we can lower this successfully without moving the return address
+    // around.
+    isTailCall = true;
+  } else if (isTailCall) {
     // Check if it's really possible to do a tail call.
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                     isVarArg, SR != NotStructReturn,
                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
                     Outs, OutVals, Ins, DAG);
 
-    if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
-      report_fatal_error("failed to perform tail call elimination on a call "
-                         "site marked musttail");
-
     // Sibcalls are automatically detected tailcalls which do not require
     // ABI changes.
     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
@@ -2583,7 +2585,7 @@
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
   int FPDiff = 0;
-  if (isTailCall && !IsSibcall) {
+  if (isTailCall && !IsSibcall && !IsMustTail) {
     // Lower arguments at fp - stackoffset + fpdiff.
     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -2746,8 +2748,10 @@
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
-  // For tail calls lower the arguments to the 'real' stack slot.
-  if (isTailCall) {
+  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
+  // don't need this because the eligibility check rejects calls that require
+  // shuffling arguments passed in memory.
+  if (!IsSibcall && isTailCall) {
     // Force all the incoming stack arguments to be loaded from the stack
     // before any new outgoing arguments are stored to the stack, because the
     // outgoing stack slots may alias the incoming argument stack slots, and
@@ -2759,39 +2763,40 @@
     SmallVector<SDValue, 8> MemOpChains2;
     SDValue FIN;
     int FI = 0;
-    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
-      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-        CCValAssign &VA = ArgLocs[i];
-        if (VA.isRegLoc())
-          continue;
-        assert(VA.isMemLoc());
-        SDValue Arg = OutVals[i];
-        ISD::ArgFlagsTy Flags = Outs[i].Flags;
-        // Create frame index.
-        int32_t Offset = VA.getLocMemOffset()+FPDiff;
-        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
-        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
-        FIN = DAG.getFrameIndex(FI, getPointerTy());
-
-        if (Flags.isByVal()) {
-          // Copy relative to framepointer.
-          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
-          if (!StackPtr.getNode())
-            StackPtr = DAG.getCopyFromReg(Chain, dl,
-                                          RegInfo->getStackRegister(),
-                                          getPointerTy());
-          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
-          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
-                                                           ArgChain,
-                                                           Flags, DAG, dl));
-        } else {
-          // Store relative to framepointer.
-          MemOpChains2.push_back(
-            DAG.getStore(ArgChain, dl, Arg, FIN,
-                         MachinePointerInfo::getFixedStack(FI),
-                         false, false, 0));
-        }
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isRegLoc())
+        continue;
+      assert(VA.isMemLoc());
+      SDValue Arg = OutVals[i];
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+      // Skip inalloca arguments. They don't require any work.
+      if (Flags.isInAlloca())
+        continue;
+      // Create frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+      FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+      if (Flags.isByVal()) {
+        // Copy relative to framepointer.
+        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+        if (!StackPtr.getNode())
+          StackPtr = DAG.getCopyFromReg(Chain, dl,
+                                        RegInfo->getStackRegister(),
+                                        getPointerTy());
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                         ArgChain,
+                                                         Flags, DAG, dl));
+      } else {
+        // Store relative to framepointer.
+        MemOpChains2.push_back(
+          DAG.getStore(ArgChain, dl, Arg, FIN,
+                       MachinePointerInfo::getFixedStack(FI),
+                       false, false, 0));
      }
     }
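
For reference, this is the IR shape the new path guarantees to lower (a minimal sketch, not part of the patch; names are hypothetical). The verifier has already required that caller and callee prototypes and calling conventions match, so FPDiff stays zero and the incoming stack arguments can be reused in place:

declare void @callee(i32, i8*)

define void @caller(i32 %x, i8* %p) {
entry:
  ; Must lower to 'jmp callee' even at -O0; no return-address shuffling needed.
  musttail call void @callee(i32 %x, i8* %p)
  ret void
}
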
Index: llvm/trunk/test/CodeGen/X86/musttail-indirect.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/musttail-indirect.ll
+++ llvm/trunk/test/CodeGen/X86/musttail-indirect.ll
@@ -0,0 +1,124 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
+
+; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
+
+; struct A { A(); ~A(); int a; };
+;
+; struct B {
+;   virtual int f(int);
+;   virtual int g(A, int, A);
+;   virtual void h(A, int, A);
+;   virtual A i(A, int, A);
+;   virtual A j(int);
+; };
+;
+; int (B::*mp_f)(int) = &B::f;
+; int (B::*mp_g)(A, int, A) = &B::g;
+; void (B::*mp_h)(A, int, A) = &B::h;
+; A (B::*mp_i)(A, int, A) = &B::i;
+; A (B::*mp_j)(int) = &B::j;
+
+; Each member pointer creates a thunk. The ones with inalloca are required to
+; be tail calls by the ABI, even at O0.
+
+%struct.B = type { i32 (...)** }
+%struct.A = type { i32 }
+
+; CHECK-LABEL: f_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @f_thunk(%struct.B* %this, i32) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, i32)***
+  %vtable = load i32 (%struct.B*, i32)*** %1
+  %2 = load i32 (%struct.B*, i32)** %vtable
+  %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, i32 %0)
+  ret i32 %3
+}
+
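As a usage sketch (editorial, with hypothetical values; the member pointers are the ones declared in the C++ snippet above), the thunks in this file are reached by indirect calls such as:

  B b;
  int r1 = (b.*mp_f)(13);           // dispatches through f_thunk to B::f
  A lhs, rhs;
  int r2 = (b.*mp_g)(lhs, 42, rhs); // dispatches through g_thunk; the A
                                    // arguments travel in the inalloca block
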
+; Inalloca thunks shouldn't require any stores to the stack.
+; CHECK-LABEL: g_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @g_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+  %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+  %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+  ret i32 %3
+}
+
+; CHECK-LABEL: h_thunk:
+; CHECK: jmpl
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK-NOT: ret
+define x86_thiscallcc void @h_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+  %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2
+  %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+  musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+  ret void
+}
+
+; CHECK-LABEL: i_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc %struct.A* @i_thunk(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)***
+  %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3
+  %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn
+  %3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca %0)
+  ret %struct.A* %3
+}
+
+; CHECK-LABEL: j_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc void @j_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
+entry:
+  %1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*, i32)***
+  %vtable = load void (%struct.A*, %struct.B*, i32)*** %1
+  %vfn = getelementptr inbounds void (%struct.A*, %struct.B*, i32)** %vtable, i32 4
+  %2 = load void (%struct.A*, %struct.B*, i32)** %vfn
+  musttail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this, i32 %0)
+  ret void
+}
+
+; CHECK-LABEL: _stdcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_stdcallcc i32 @stdcall_thunk(<{ %struct.B*, %struct.A }>* inalloca) {
+entry:
+  %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0
+  %this = load %struct.B** %this_ptr
+  %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)***
+  %vtable = load i32 (<{ %struct.B*, %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (<{ %struct.B*, %struct.A }>*)** %vfn
+  %3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* inalloca %0)
+  ret i32 %3
+}
+
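An aside on these last two cases (editorial): stdcall packs everything, including 'this', into the inalloca block, while x86_fastcallcc passes the inreg 'this' in %ecx. In both thunks the inalloca block is therefore the only stack-resident state, and it can be forwarded untouched.
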
+; CHECK-LABEL: @fastcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_fastcallcc i32 @fastcall_thunk(%struct.B* inreg %this, <{ %struct.A }>* inalloca) {
+entry:
+  %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)***
+  %vtable = load i32 (%struct.B*, <{ %struct.A }>*)*** %1
+  %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1
+  %2 = load i32 (%struct.B*, <{ %struct.A }>*)** %vfn
+  %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0)
+  ret i32 %3
+}
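
For context on the next file (an illustrative sketch, not derived from the commit): this-adjusting thunks like the ones below typically arise from multiple inheritance, where the compiler must rebase 'this' before forwarding to the real method, as in:

  struct Left  { virtual void f(); int l; };
  struct Right { virtual void g(); int r; };
  struct Both : Left, Right { void g() override; };
  // Calling g() through a Right* enters a thunk that adjusts 'this' by the
  // offset of the Right subobject inside Both, then jumps to Both::g.

The tests model that adjustment as 'getelementptr i8* %this, i32 4' and check that the call still lowers to a plain jmp.
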
Index: llvm/trunk/test/CodeGen/X86/musttail-thiscall.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/musttail-thiscall.ll
+++ llvm/trunk/test/CodeGen/X86/musttail-thiscall.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+
+; CHECK-LABEL: t1:
+; CHECK: jmp {{_?}}t1_callee
+define x86_thiscallcc void @t1(i8* %this) {
+  %adj = getelementptr i8* %this, i32 4
+  musttail call x86_thiscallcc void @t1_callee(i8* %adj)
+  ret void
+}
+declare x86_thiscallcc void @t1_callee(i8* %this)
+
+; CHECK-LABEL: t2:
+; CHECK: jmp {{_?}}t2_callee
+define x86_thiscallcc i32 @t2(i8* %this, i32 %a) {
+  %adj = getelementptr i8* %this, i32 4
+  %rv = musttail call x86_thiscallcc i32 @t2_callee(i8* %adj, i32 %a)
+  ret i32 %rv
+}
+declare x86_thiscallcc i32 @t2_callee(i8* %this, i32 %a)
+
+; CHECK-LABEL: t3:
+; CHECK: jmp {{_?}}t3_callee
+define x86_thiscallcc i8* @t3(i8* %this, <{ i8*, i32 }>* inalloca %args) {
+  %adj = getelementptr i8* %this, i32 4
+  %a_ptr = getelementptr <{ i8*, i32 }>* %args, i32 0, i32 1
+  store i32 0, i32* %a_ptr
+  %rv = musttail call x86_thiscallcc i8* @t3_callee(i8* %adj, <{ i8*, i32 }>* inalloca %args)
+  ret i8* %rv
+}
+declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args);
Index: llvm/trunk/test/CodeGen/X86/musttail.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/musttail.ll
+++ llvm/trunk/test/CodeGen/X86/musttail.ll
@@ -1,8 +1,6 @@
 ; RUN: llc -march=x86 < %s | FileCheck %s
-
-; FIXME: Eliminate this tail call at -O0, since musttail is a correctness
-; requirement.
-; RUN: not llc -march=x86 -O0 < %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+; RUN: llc -march=x86 -disable-tail-calls < %s | FileCheck %s
 
 declare void @t1_callee(i8*)
 define void @t1(i32* %a) {
@@ -21,3 +19,72 @@
   %w = bitcast i8* %v to i32*
   ret i32* %w
 }
+
+; Complex frame layout: stack realignment with dynamic alloca.
+define void @t3(i32 %n) alignstack(32) nounwind {
+entry:
+; CHECK: t3:
+; CHECK: pushl %ebp
+; CHECK: pushl %esi
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK-NEXT: jmp {{_?}}t3_callee
+  %a = alloca i8, i32 %n
+  call void @capture(i8* %a)
+  musttail call void @t3_callee(i32 %n) nounwind
+  ret void
+}
+
+declare void @capture(i8*)
+declare void @t3_callee(i32)
+
+; Test that we actually copy stack arguments in and out when they can't be
+; forwarded unmodified.
+define i32 @t4({}* %fn, i32 %n, i32 %r) {
+; CHECK-LABEL: t4:
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; CHECK: movl %[[r]], {{[0-9]+}}(%esp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%esp)
+; CHECK: jmpl *%{{.*}}
+
+entry:
+  %r1 = add i32 %r, 1
+  %n1 = sub i32 %n, 1
+  %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+  %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+  ret i32 %r2
+}
+
+; Combine t3's complex stack frame with t4's parameter modification.
+define i32 @t5({}* %fn, i32 %n, i32 %r) alignstack(32) {
+; CHECK-LABEL: t5:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %esi
+; Align the stack.
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; Modify the args.
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; Store them through ebp, since that's the only stable arg pointer.
+; CHECK: movl %[[r]], {{[0-9]+}}(%ebp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%ebp)
+; Epilogue.
+; CHECK: leal {{[-0-9]+}}(%ebp), %esp
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK: jmpl *%{{.*}}
+
+entry:
+  %a = alloca i8, i32 %n
+  call void @capture(i8* %a)
+  %r1 = add i32 %r, 1
+  %n1 = sub i32 %n, 1
+  %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+  %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+  ret i32 %r2
+}
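
One interaction the new RUN lines pin down is worth restating (a minimal sketch with hypothetical names, mirroring the t1 pattern above): -disable-tail-calls only clears the optimizer's tail-call flag, and the musttail marker re-forces it, so a call like the following still becomes a jmp:

declare void @target(i32)

define void @forwarder(i32 %n) {
entry:
  ; Lowers to 'jmp target' at -O0 and under -disable-tail-calls alike.
  musttail call void @target(i32 %n)
  ret void
}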