Index: llvm/trunk/lib/CodeGen/GlobalISel/IRTranslator.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ llvm/trunk/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2193,6 +2193,20 @@
   FuncInfo.clear();
 }
 
+/// Returns true if a BasicBlock \p BB within a variadic function contains a
+/// variadic musttail call.
+static bool checkForMustTailInVarArgFn(bool IsVarArg, const BasicBlock &BB) {
+  if (!IsVarArg)
+    return false;
+
+  // Walk the block backwards, because tail calls usually only appear at the
+  // end of a block.
+  return std::any_of(BB.rbegin(), BB.rend(), [](const Instruction &I) {
+    const auto *CI = dyn_cast<CallInst>(&I);
+    return CI && CI->isMustTailCall();
+  });
+}
+
 bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
   MF = &CurMF;
   const Function &F = MF->getFunction();
@@ -2254,6 +2268,9 @@
   SwiftError.setFunction(CurMF);
   SwiftError.createEntriesInEntryBlock(DbgLoc);
 
+  bool IsVarArg = F.isVarArg();
+  bool HasMustTailInVarArgFn = false;
+
   // Create all blocks, in IR order, to preserve the layout.
   for (const BasicBlock &BB: F) {
     auto *&MBB = BBToMBB[&BB];
@@ -2263,8 +2280,13 @@
 
     if (BB.hasAddressTaken())
       MBB->setHasAddressTaken();
+
+    if (!HasMustTailInVarArgFn)
+      HasMustTailInVarArgFn = checkForMustTailInVarArgFn(IsVarArg, BB);
   }
 
+  MF->getFrameInfo().setHasMustTailInVarArgFunc(HasMustTailInVarArgFn);
+
   // Make our arguments/constants entry block fallthrough to the IR entry block.
   EntryBB->addSuccessor(&getMBB(F.front()));
 
Index: llvm/trunk/lib/Target/AArch64/AArch64CallLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64CallLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -368,6 +368,49 @@
   return Success;
 }
 
+/// Helper function to compute forwarded registers for musttail calls. Computes
+/// the forwarded registers, sets MBB liveness, and emits COPY instructions that
+/// can be used to save + restore registers later.
+static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
+                                             CCAssignFn *AssignFn) {
+  MachineBasicBlock &MBB = MIRBuilder.getMBB();
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (!MFI.hasMustTailInVarArgFunc())
+    return;
+
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  const Function &F = MF.getFunction();
+  assert(F.isVarArg() && "Expected F to be vararg?");
+
+  // Compute the set of forwarded registers. The rest are scratch.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs,
+                 F.getContext());
+  SmallVector<MVT, 2> RegParmTypes;
+  RegParmTypes.push_back(MVT::i64);
+  RegParmTypes.push_back(MVT::f128);
+
+  // Later on, we can use this vector to restore the registers if necessary.
+  SmallVectorImpl<ForwardedRegister> &Forwards =
+      FuncInfo->getForwardedMustTailRegParms();
+  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn);
+
+  // Conservatively forward X8, since it might be used for an aggregate
+  // return.
+  if (!CCInfo.isAllocated(AArch64::X8)) {
+    unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+    Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
+  }
+
+  // Add the forwards to the MachineBasicBlock and MachineFunction.
+  for (const auto &F : Forwards) {
+    MBB.addLiveIn(F.PReg);
+    MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg));
+  }
+}
+
 bool AArch64CallLowering::lowerFormalArguments(
     MachineIRBuilder &MIRBuilder, const Function &F,
     ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -441,6 +484,8 @@
   if (Subtarget.hasCustomCallingConv())
     Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
+  handleMustTailForwardedRegisters(MIRBuilder, AssignFn);
+
   // Move back to the end of the basic block.
   MIRBuilder.setMBB(MBB);
 
@@ -695,16 +740,6 @@
   assert((!Info.IsVarArg || CalleeCC == CallingConv::C) &&
          "Unexpected variadic calling convention");
 
-  // Before we can musttail varargs, we need to forward parameters like in
-  // r345641. Make sure that we don't enable musttail with varargs without
-  // addressing that!
-  if (Info.IsVarArg && Info.IsMustTailCall) {
-    LLVM_DEBUG(
-        dbgs()
-        << "... Cannot handle vararg musttail functions yet.\n");
-    return false;
-  }
-
   // Verify that the incoming and outgoing arguments from the callee are
   // safe to tail call.
   if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
@@ -745,6 +780,7 @@
   const Function &F = MF.getFunction();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
   // True when we're tail calling, but without -tailcallopt.
   bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
@@ -800,7 +836,6 @@
   // We aren't sibcalling, so we need to compute FPDiff. We need to do this
   // before handling assignments, because FPDiff must be known for memory
   // arguments.
-  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
   SmallVector<CCValAssign, 16> OutLocs;
   CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
@@ -823,6 +858,8 @@
     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
   }
 
+  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+
   // Do the actual argument marshalling.
   SmallVector<unsigned, 8> PhysRegs;
   OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
@@ -830,6 +867,27 @@
   if (!handleAssignments(MIRBuilder, OutArgs, Handler))
     return false;
 
+  if (Info.IsVarArg && Info.IsMustTailCall) {
+    // Now we know what's being passed to the function. Add uses to the call
+    // for the forwarded registers that we *aren't* passing as parameters. This
+    // will preserve the copies we build earlier.
+    for (const auto &F : Forwards) {
+      Register ForwardedReg = F.PReg;
+      // If the register is already passed, or aliases a register which is
+      // already being passed, then skip it.
+      if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) {
+            if (!Use.isReg())
+              return false;
+            return TRI->regsOverlap(Use.getReg(), ForwardedReg);
+          }))
+        continue;
+
+      // We aren't passing it already, so we should add it to the call.
+      MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg));
+      MIB.addReg(ForwardedReg, RegState::Implicit);
+    }
+  }
+
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   // sequence start and end here.
   if (!IsSibCall) {
Index: llvm/trunk/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
+++ llvm/trunk/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
@@ -0,0 +1,223 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -verify-machineinstrs -mtriple aarch64-apple-darwin -global-isel -o - 2>&1 | FileCheck %s
+
+; There are two things we want to test here:
+;  (1) We can tail call musttail calls.
+;  (2) We spill and reload all of the arguments around a normal call.
+
+declare i32 @musttail_variadic_callee(i32, ...)
+define i32 @test_musttail_variadic(i32 %arg0, ...) {
+; CHECK-LABEL: test_musttail_variadic:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    b _musttail_variadic_callee
+  %r = musttail call i32 (i32, ...) @musttail_variadic_callee(i32 %arg0, ...)
+  ret i32 %r
+}
+
+declare [2 x i64] @musttail_variadic_aggret_callee(i32 %arg0, ...)
+define [2 x i64] @test_musttail_variadic_aggret(i32 %arg0, ...) {
+; CHECK-LABEL: test_musttail_variadic_aggret:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    b _musttail_variadic_aggret_callee
+  %r = musttail call [2 x i64] (i32, ...) @musttail_variadic_aggret_callee(i32 %arg0, ...)
+  ret [2 x i64] %r
+}
+
+; Test musttailing with a normal call in the block. Test that we spill and
+; restore, as a normal call will clobber all argument registers.
+@asdf = internal constant [4 x i8] c"asdf"
+declare void @puts(i8*)
+define i32 @test_musttail_variadic_spill(i32 %arg0, ...) {
+; CHECK-LABEL: test_musttail_variadic_spill:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #224 ; =224
+; CHECK-NEXT:    stp x28, x27, [sp, #128] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #144] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #160] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #176] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #192] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #208] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 224
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset w19, -24
+; CHECK-NEXT:    .cfi_offset w20, -32
+; CHECK-NEXT:    .cfi_offset w21, -40
+; CHECK-NEXT:    .cfi_offset w22, -48
+; CHECK-NEXT:    .cfi_offset w23, -56
+; CHECK-NEXT:    .cfi_offset w24, -64
+; CHECK-NEXT:    .cfi_offset w25, -72
+; CHECK-NEXT:    .cfi_offset w26, -80
+; CHECK-NEXT:    .cfi_offset w27, -88
+; CHECK-NEXT:    .cfi_offset w28, -96
+; CHECK-NEXT:    mov w19, w0
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x0, _asdf@PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    add x0, x0, _asdf@PAGEOFF
+; CHECK-NEXT:    mov x20, x1
+; CHECK-NEXT:    mov x21, x2
+; CHECK-NEXT:    mov x22, x3
+; CHECK-NEXT:    mov x23, x4
+; CHECK-NEXT:    mov x24, x5
+; CHECK-NEXT:    mov x25, x6
+; CHECK-NEXT:    mov x26, x7
+; CHECK-NEXT:    stp q1, q0, [sp, #96] ; 32-byte Folded Spill
+; CHECK-NEXT:    stp q3, q2, [sp, #64] ; 32-byte Folded Spill
+; CHECK-NEXT:    stp q5, q4, [sp, #32] ; 32-byte Folded Spill
+; CHECK-NEXT:    stp q7, q6, [sp] ; 32-byte Folded Spill
+; CHECK-NEXT:    mov x27, x8
+; CHECK-NEXT:    bl _puts
+; CHECK-NEXT:    ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
+; CHECK-NEXT:    ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
+; CHECK-NEXT:    ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
+; CHECK-NEXT:    ldp q7, q6, [sp] ; 32-byte Folded Reload
+; CHECK-NEXT:    mov w0, w19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
+; CHECK-NEXT:    mov x6, x25
+; CHECK-NEXT:    mov x7, x26
+; CHECK-NEXT:    mov x8, x27
+; CHECK-NEXT:    ldp x29, x30, [sp, #208] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #192] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #176] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #160] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #144] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp, #128] ; 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #224 ; =224
+; CHECK-NEXT:    b _musttail_variadic_callee
+; CHECK-NEXT:    .loh AdrpAdd Lloh0, Lloh1
+  call void @puts(i8* getelementptr ([4 x i8], [4 x i8]* @asdf, i32 0, i32 0))
+  %r = musttail call i32 (i32, ...) @musttail_variadic_callee(i32 %arg0, ...)
+  ret i32 %r
+}
+
+; Test musttailing with a varargs call in the block. Test that we spill and
+; reload all arguments in the variadic argument pack.
+declare void @llvm.va_start(i8*) nounwind
+declare void(i8*, ...)* @get_f(i8* %this)
+define void @f_thunk(i8* %this, ...) {
+; CHECK-LABEL: f_thunk:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #256 ; =256
+; CHECK-NEXT:    stp x28, x27, [sp, #160] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #176] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #192] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #208] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #224] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #240] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 256
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset w19, -24
+; CHECK-NEXT:    .cfi_offset w20, -32
+; CHECK-NEXT:    .cfi_offset w21, -40
+; CHECK-NEXT:    .cfi_offset w22, -48
+; CHECK-NEXT:    .cfi_offset w23, -56
+; CHECK-NEXT:    .cfi_offset w24, -64
+; CHECK-NEXT:    .cfi_offset w25, -72
+; CHECK-NEXT:    .cfi_offset w26, -80
+; CHECK-NEXT:    .cfi_offset w27, -88
+; CHECK-NEXT:    .cfi_offset w28, -96
+; CHECK-NEXT:    mov x27, x8
+; CHECK-NEXT:    add x8, sp, #128 ; =128
+; CHECK-NEXT:    add x9, sp, #256 ; =256
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    mov x20, x1
+; CHECK-NEXT:    mov x21, x2
+; CHECK-NEXT:    mov x22, x3
+; CHECK-NEXT:    mov x23, x4
+; CHECK-NEXT:    mov x24, x5
+; CHECK-NEXT:    mov x25, x6
+; CHECK-NEXT:    mov x26, x7
+; CHECK-NEXT:    stp q1, q0, [sp, #96] ; 32-byte Folded Spill
+; CHECK-NEXT:    stp q3, q2, [sp, #64] ; 32-byte Folded Spill
+; CHECK-NEXT:    stp q5, q4, [sp, #32] ; 32-byte Folded Spill
+; CHECK-NEXT:    stp q7, q6, [sp] ; 32-byte Folded Spill
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    bl _get_f
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
+; CHECK-NEXT:    ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
+; CHECK-NEXT:    ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
+; CHECK-NEXT:    ldp q7, q6, [sp] ; 32-byte Folded Reload
+; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    mov x3, x22
+; CHECK-NEXT:    mov x4, x23
+; CHECK-NEXT:    mov x5, x24
+; CHECK-NEXT:    mov x6, x25
+; CHECK-NEXT:    mov x7, x26
+; CHECK-NEXT:    mov x8, x27
+; CHECK-NEXT:    ldp x29, x30, [sp, #240] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #224] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #208] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #192] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #176] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp, #160] ; 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #256 ; =256
+; CHECK-NEXT:    br x9
+  %ap = alloca [4 x i8*], align 16
+  %ap_i8 = bitcast [4 x i8*]* %ap to i8*
+  call void @llvm.va_start(i8* %ap_i8)
+  %fptr = call void(i8*, ...)*(i8*) @get_f(i8* %this)
+  musttail call void (i8*, ...) %fptr(i8* %this, ...)
+  ret void
+}
+
+; We don't need any spills and reloads here, but we should still emit the
+; copies in call lowering.
+define void @g_thunk(i8* %fptr_i8, ...) {
+; CHECK-LABEL: g_thunk:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    br x0
+  %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
+  musttail call void (i8*, ...) %fptr(i8* %fptr_i8, ...)
+  ret void
+}
+
+; Test that this works with multiple exits and basic blocks.
+%struct.Foo = type { i1, i8*, i8* }
+@g = external global i32
+define void @h_thunk(%struct.Foo* %this, ...) {
+; CHECK-LABEL: h_thunk:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldrb w9, [x0]
+; CHECK-NEXT:    tbz w9, #0, LBB5_2
+; CHECK-NEXT:  ; %bb.1: ; %then
+; CHECK-NEXT:    ldr x9, [x0, #8]
+; CHECK-NEXT:    br x9
+; CHECK-NEXT:  LBB5_2: ; %else
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x10, _g@GOTPAGE
+; CHECK-NEXT:    ldr x9, [x0, #16]
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    ldr x10, [x10, _g@GOTPAGEOFF]
+; CHECK-NEXT:    mov w11, #42
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    str w11, [x10]
+; CHECK-NEXT:    br x9
+; CHECK-NEXT:    .loh AdrpLdrGotStr Lloh2, Lloh3, Lloh4
+  %cond_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 0
+  %cond = load i1, i1* %cond_p
+  br i1 %cond, label %then, label %else
+
+then:
+  %a_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 1
+  %a_i8 = load i8*, i8** %a_p
+  %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
+  musttail call void (%struct.Foo*, ...) %a(%struct.Foo* %this, ...)
+  ret void
+
+else:
+  %b_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 2
+  %b_i8 = load i8*, i8** %b_p
+  %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
+  store i32 42, i32* @g
+  musttail call void (%struct.Foo*, ...) %b(%struct.Foo* %this, ...)
+  ret void
+}