This is an archive of the discontinued LLVM Phabricator instance.

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
967–970	Checking the register mask seems pretty cheap, while checking argument assignment looks more expensive. Does it make sense to do this check first? That should bail out faster if the calling conventions don’t match.
1224	Can this use `getStackAlignment()` instead of hardcoding 16?
1236	Same about `getStackAlignment()`
llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
371–373	The parent does not use byval. Is this comment correct?

Call common function, rebase on top of patch to fix split arguments

arsenm added a parent revision: D101947: GlobalISel: Split ValueHandler into assignment and emission classes. May 5 2021, 6:26 PM

Harbormaster completed remote builds in B102895: Diff 343261. May 5 2021, 6:26 PM

ping

I left a nit inline. Apart from that, LGTM.

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
1172–1174	Can this and the assert below use getStackAlignment() instead of hardcoding 16? Or does this have a different meaning?

Don't hardcode stack alignment

Harbormaster completed remote builds in B104131: Diff 344943. May 12 2021, 1:32 PM

Flakebi accepted this revision. May 13 2021, 2:27 AM

Flakebi added inline comments.

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
1184	The assert should also use `getStackAlignment()`.

This revision is now accepted and ready to land. May 13 2021, 2:27 AM

6a70874d27c73cf8b55a568449fd92f97b5bb7b3

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUCallLowering.h

25 lines

AMDGPUCallLowering.cpp

395 lines

test/

CodeGen/

AMDGPU/

GlobalISel/

irtranslator-sibling-call.ll

1509 lines

irtranslator-tail-call.ll

45 lines

call-constant.ll

4 lines

tail-call-amdgpu-gfx.ll

1 line

Diff 332163

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h

Show All 13 Lines
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H		#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H		#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H

#include "llvm/CodeGen/GlobalISel/CallLowering.h"		#include "llvm/CodeGen/GlobalISel/CallLowering.h"

namespace llvm {		namespace llvm {

class AMDGPUTargetLowering;		class AMDGPUTargetLowering;
		class GCNSubtarget;
class MachineInstrBuilder;		class MachineInstrBuilder;
		class SIMachineFunctionInfo;

class AMDGPUCallLowering final : public CallLowering {		class AMDGPUCallLowering final : public CallLowering {
void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,		void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,
uint64_t Offset) const;		uint64_t Offset) const;

void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,		void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
Align Alignment, Register DstReg) const;		Align Alignment, Register DstReg) const;

Show All 18 Lines	bool lowerFormalArguments(MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs,		ArrayRef<ArrayRef<Register>> VRegs,
FunctionLoweringInfo &FLI) const override;		FunctionLoweringInfo &FLI) const override;

bool passSpecialInputs(MachineIRBuilder &MIRBuilder,		bool passSpecialInputs(MachineIRBuilder &MIRBuilder,
CCState &CCInfo,		CCState &CCInfo,
SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,		SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
CallLoweringInfo &Info) const;		CallLoweringInfo &Info) const;

		bool
		doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info,
		MachineFunction &MF,
		SmallVectorImpl<ArgInfo> &InArgs) const;

		bool
		areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF,
		SmallVectorImpl<ArgInfo> &OutArgs) const;

		/// Returns true if the call can be lowered as a tail call.
		bool
		isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder,
		CallLoweringInfo &Info,
		SmallVectorImpl<ArgInfo> &InArgs,
		SmallVectorImpl<ArgInfo> &OutArgs) const;

		void handleImplicitCallArguments(
		MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
		const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
		ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;

		bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
		SmallVectorImpl<ArgInfo> &OutArgs) const;
bool lowerCall(MachineIRBuilder &MIRBuilder,		bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;		CallLoweringInfo &Info) const override;

static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);		static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);		static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};		};
} // End of namespace llvm;		} // End of namespace llvm;
#endif		#endif

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Show First 20 Lines • Show All 196 Lines • ▼ Show 20 Lines	struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
Register getStackAddress(uint64_t Size, int64_t Offset,		Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO,		MachinePointerInfo &MPO,
ISD::ArgFlagsTy Flags) override {		ISD::ArgFlagsTy Flags) override {
MachineFunction &MF = MIRBuilder.getMF();		MachineFunction &MF = MIRBuilder.getMF();
const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);		const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
const LLT S32 = LLT::scalar(32);		const LLT S32 = LLT::scalar(32);

if (IsTailCall) {		if (IsTailCall) {
llvm_unreachable("implement me");		Offset += FPDiff;
		int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
		auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
		MPO = MachinePointerInfo::getFixedStack(MF, FI);
		return FIReg.getReg(0);
}		}

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();		const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

if (!SPReg)		if (!SPReg)
SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);		SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);		auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
▲ Show 20 Lines • Show All 502 Lines • ▼ Show 20 Lines	if (!IsEntryFunc) {
if (AMDGPUTargetMachine::EnableFixedFunctionABI)		if (AMDGPUTargetMachine::EnableFixedFunctionABI)
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI, Info);		TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI, Info);
}		}

FormalArgHandler Handler(B, MRI, AssignFn);		FormalArgHandler Handler(B, MRI, AssignFn);
if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))		if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
return false;		return false;

		uint64_t StackOffset = Handler.StackUsed;

if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {		if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.		// Special inputs come after user arguments.
TLI.allocateSpecialInputVGPRs(CCInfo, MF, TRI, Info);		TLI.allocateSpecialInputVGPRs(CCInfo, MF, TRI, Info);
}		}

// Start adding system SGPRs.		// Start adding system SGPRs.
if (IsEntryFunc) {		if (IsEntryFunc) {
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);		TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
} else {		} else {
if (!Subtarget.enableFlatScratch())		if (!Subtarget.enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());		CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, TRI, Info);		TLI.allocateSpecialInputSGPRs(CCInfo, MF, TRI, Info);
}		}

		// When we tail call, we need to check if the callee's arguments will fit on
		// the caller's stack. So, whenever we lower formal arguments, we should keep
		// track of this information, since we might lower a tail call in this
		// function later.
		Info->setBytesInStackArgArea(StackOffset);

// Move back to the end of the basic block.		// Move back to the end of the basic block.
B.setMBB(MBB);		B.setMBB(MBB);

return true;		return true;
}		}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,		bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
CCState &CCInfo,		CCState &CCInfo,
▲ Show 20 Lines • Show All 146 Lines • ▼ Show 20 Lines
/// CC.		/// CC.
static std::pair<CCAssignFn *, CCAssignFn *>		static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {		getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};		return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}		}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,		static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {		bool IsTailCall) {
return AMDGPU::SI_CALL;		return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL;
}		}

// Add operands to call instruction to track the callee.		// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,		static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
MachineIRBuilder &MIRBuilder,		MachineIRBuilder &MIRBuilder,
AMDGPUCallLowering::CallLoweringInfo &Info) {		AMDGPUCallLowering::CallLoweringInfo &Info) {
if (Info.Callee.isReg()) {		if (Info.Callee.isReg()) {
CallInst.addReg(Info.Callee.getReg());		CallInst.addReg(Info.Callee.getReg());
CallInst.addImm(0);		CallInst.addImm(0);
} else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {		} else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
// The call lowering lightly assumed we can directly encode a call target in		// The call lowering lightly assumed we can directly encode a call target in
// the instruction, which is not the case. Materialize the address here.		// the instruction, which is not the case. Materialize the address here.
const GlobalValue *GV = Info.Callee.getGlobal();		const GlobalValue *GV = Info.Callee.getGlobal();
auto Ptr = MIRBuilder.buildGlobalValue(		auto Ptr = MIRBuilder.buildGlobalValue(
LLT::pointer(GV->getAddressSpace(), 64), GV);		LLT::pointer(GV->getAddressSpace(), 64), GV);
CallInst.addReg(Ptr.getReg(0));		CallInst.addReg(Ptr.getReg(0));
CallInst.add(Info.Callee);		CallInst.add(Info.Callee);
} else		} else
return false;		return false;

return true;		return true;
}		}

		bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
		CallLoweringInfo &Info, MachineFunction &MF,
		SmallVectorImpl<ArgInfo> &InArgs) const {
		const Function &CallerF = MF.getFunction();
		CallingConv::ID CalleeCC = Info.CallConv;
		CallingConv::ID CallerCC = CallerF.getCallingConv();

		// If the calling conventions match, then everything must be the same.
		if (CalleeCC == CallerCC)
		return true;

		// Check if the caller and callee will handle arguments in the same way.
		const SITargetLowering &TLI = *getTLI<SITargetLowering>();
		CCAssignFn *CalleeAssignFnFixed;
		CCAssignFn *CalleeAssignFnVarArg;
		std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
		getAssignFnsForCC(CalleeCC, TLI);

		CCAssignFn *CallerAssignFnFixed;
		CCAssignFn *CallerAssignFnVarArg;
		std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
		getAssignFnsForCC(CallerCC, TLI);

		if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed,
		CalleeAssignFnVarArg, CallerAssignFnFixed,
		*CallerAssignFnVarArg))
		return false;

		const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

		// Make sure that the caller and callee preserve all of the same registers.
		auto TRI = ST.getRegisterInfo();
		const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
		const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
		return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved);
		[Inline comment — Flakebi (Unsubmitted, Not Done)] Checking the register mask seems pretty cheap, while checking argument assignment looks more expensive. Does it make sense to do this check first? That should bail out faster if the calling conventions don’t match.
		}

		bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
		CallLoweringInfo &Info, MachineFunction &MF,
		SmallVectorImpl<ArgInfo> &OutArgs) const {
		// If there are no outgoing arguments, then we are done.
		if (OutArgs.empty())
		return true;

		const Function &CallerF = MF.getFunction();
		CallingConv::ID CalleeCC = Info.CallConv;
		CallingConv::ID CallerCC = CallerF.getCallingConv();
		const SITargetLowering &TLI = *getTLI<SITargetLowering>();

		CCAssignFn *AssignFnFixed;
		CCAssignFn *AssignFnVarArg;
		std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);

		// We have outgoing arguments. Make sure that we can tail call with them.
		SmallVector<CCValAssign, 16> OutLocs;
		CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());

		if (!analyzeArgInfo(OutInfo, OutArgs, AssignFnFixed, AssignFnVarArg)) {
		LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
		return false;
		}

		// Make sure that they can fit on the caller's stack.
		const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
		if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
		LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
		return false;
		}

		// Verify that the parameters in callee-saved registers match.
		// TODO: Port this over to CallLowering as general code once swiftself is
		// supported.
		const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
		const SIRegisterInfo *TRI = ST.getRegisterInfo();
		const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
		MachineRegisterInfo &MRI = MF.getRegInfo();

		for (unsigned i = 0; i < OutLocs.size(); ++i) {
		auto &ArgLoc = OutLocs[i];
		// If it's not a register, it's fine.
		if (!ArgLoc.isRegLoc()) {
		if (Info.IsVarArg) {
		// Be conservative and disallow variadic memory operands to match SDAG's
		// behaviour.
		// FIXME: If the caller's calling convention is C, then we can
		// potentially use its argument area. However, for cases like fastcc,
		// we can't do anything.
		LLVM_DEBUG(
		dbgs()
		<< "... Cannot tail call vararg function with stack arguments\n");
		return false;
		}
		continue;
		}

		Register Reg = ArgLoc.getLocReg();

		// Only look at callee-saved registers.
		if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
		continue;

		LLVM_DEBUG(
		dbgs()
		<< "... Call has an argument passed in a callee-saved register.\n");

		// Check if it was copied from.
		ArgInfo &OutInfo = OutArgs[i];

		if (OutInfo.Regs.size() > 1) {
		LLVM_DEBUG(
		dbgs() << "... Cannot handle arguments in multiple registers.\n");
		return false;
		}

		// Check if we copy the register, walking through copies from virtual
		// registers. Note that getDefIgnoringCopies does not ignore copies from
		// physical registers.
		MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
		if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
		LLVM_DEBUG(
		dbgs()
		<< "... Parameter was not copied into a VReg, cannot tail call.\n");
		return false;
		}

		// Got a copy. Verify that it's the same as the register we want.
		Register CopyRHS = RegDef->getOperand(1).getReg();
		if (CopyRHS != Reg) {
		LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
		"VReg, cannot tail call.\n");
		return false;
		}
		}

		return true;
		}

		/// Return true if the calling convention is one that we can guarantee TCO for.
		static bool canGuaranteeTCO(CallingConv::ID CC) {
		return CC == CallingConv::Fast;
		}

		/// Return true if we might ever do TCO for calls with this calling convention.
		static bool mayTailCallThisCC(CallingConv::ID CC) {
		switch (CC) {
		case CallingConv::C:
		case CallingConv::AMDGPU_Gfx:
		return true;
		default:
		return canGuaranteeTCO(CC);
		}
		}

		bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
		MachineIRBuilder &B, CallLoweringInfo &Info,
		SmallVectorImpl<ArgInfo> &InArgs,
		SmallVectorImpl<ArgInfo> &OutArgs) const {
		// Must pass all target-independent checks in order to tail call optimize.
		if (!Info.IsTailCall)
		return false;

		MachineFunction &MF = B.getMF();
		const Function &CallerF = MF.getFunction();
		CallingConv::ID CalleeCC = Info.CallConv;
		CallingConv::ID CallerCC = CallerF.getCallingConv();

		const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
		const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
		// Kernels aren't callable, and don't have a live in return address so it
		// doesn't make sense to do a tail call with entry functions.
		if (!CallerPreserved)
		return false;

		if (!mayTailCallThisCC(CalleeCC)) {
		LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
		return false;
		}

		if (any_of(CallerF.args(), [](const Argument &A) {
		return A.hasByValAttr() || A.hasSwiftErrorAttr();
		})) {
		LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
		"or swifterror arguments\n");
		return false;
		}

		// If we have -tailcallopt, then we're done.
		if (MF.getTarget().Options.GuaranteedTailCallOpt)
		return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();

		// Verify that the incoming and outgoing arguments from the callee are
		// safe to tail call.
		if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
		LLVM_DEBUG(
		dbgs()
		<< "... Caller and callee have incompatible calling conventions.\n");
		return false;
		}

		if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
		return false;

		LLVM_DEBUG(
		dbgs() << "... Call is eligible for tail call optimization.\n");
		return true;
		}

		// Insert outgoing implicit arguments for a call, by inserting copies to the
		// implicit argument registers and adding the necessary implicit uses to the
		// call instruction.
		void AMDGPUCallLowering::handleImplicitCallArguments(
		MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
		const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
		ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
		if (!ST.enableFlatScratch()) {
		// Insert copies for the SRD. In the HSA case, this should be an identity
		// copy.
		auto ScratchRSrcReg =
		MIRBuilder.buildCopy(LLT::vector(4, 32), FuncInfo.getScratchRSrcReg());
		MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
		CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
		}

		for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
		MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
		CallInst.addReg(ArgReg.first, RegState::Implicit);
		}
		}

		bool AMDGPUCallLowering::lowerTailCall(
		MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
		SmallVectorImpl<ArgInfo> &OutArgs) const {
		MachineFunction &MF = MIRBuilder.getMF();
		const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
		SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
		const Function &F = MF.getFunction();
		MachineRegisterInfo &MRI = MF.getRegInfo();
		const SITargetLowering &TLI = *getTLI<SITargetLowering>();

		[Inline comment — Flakebi (Unsubmitted, Done)] Can this and the assert below use getStackAlignment() instead of hardcoding 16? Or does this have a different meaning?
		// True when we're tail calling, but without -tailcallopt.
		bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;

		// Find out which ABI gets to decide where things go.
		CallingConv::ID CalleeCC = Info.CallConv;
		CCAssignFn *AssignFnFixed;
		CCAssignFn *AssignFnVarArg;
		std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);

		MachineInstrBuilder CallSeqStart;
		[Inline comment — Flakebi (Unsubmitted, Not Done)] The assert should also use `getStackAlignment()`.
		if (!IsSibCall)
		CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);

		unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
		auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
		if (!addCallTargetOperands(MIB, MIRBuilder, Info))
		return false;

		// Byte offset for the tail call. When we are sibcalling, this will always
		// be 0.
		MIB.addImm(0);

		// Tell the call which registers are clobbered.
		const SIRegisterInfo *TRI = ST.getRegisterInfo();
		const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
		MIB.addRegMask(Mask);

		// FPDiff is the byte offset of the call's argument area from the callee's.
		// Stores to callee stack arguments will be placed in FixedStackSlots offset
		// by this amount for a tail call. In a sibling call it must be 0 because the
		// caller will deallocate the entire stack and the callee still expects its
		// arguments to begin at SP+0.
		int FPDiff = 0;

		// This will be 0 for sibcalls, potentially nonzero for tail calls produced
		// by -tailcallopt. For sibcalls, the memory operands for the call are
		// already available in the caller's incoming argument space.
		unsigned NumBytes = 0;
		if (!IsSibCall) {
		// We aren't sibcalling, so we need to compute FPDiff. We need to do this
		// before handling assignments, because FPDiff must be known for memory
		// arguments.
		unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
		SmallVector<CCValAssign, 16> OutLocs;
		CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
		analyzeArgInfo(OutInfo, OutArgs, AssignFnFixed, AssignFnVarArg);

		// The callee will pop the argument stack as a tail call. Thus, we must
		// keep it 16-byte aligned.
		NumBytes = alignTo(OutInfo.getNextStackOffset(), 16);
		[Inline comment — Flakebi (Unsubmitted, Not Done)] Can this use `getStackAlignment()` instead of hardcoding 16?

		// FPDiff will be negative if this tail call requires more space than we
		// would automatically have in our incoming argument space. Positive if we
		// actually shrink the stack.
		FPDiff = NumReusableBytes - NumBytes;

		// The stack pointer must be 16-byte aligned at all times it's used for a
		// memory operation, which in practice means at all times and in
		// particular across call boundaries. Therefore our own arguments started at
		// a 16-byte aligned SP and the delta applied for the tail call should
		// satisfy the same constraint.
		assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
		[Inline comment — Flakebi (Unsubmitted, Not Done)] Same about `getStackAlignment()`
		}

		SmallVector<CCValAssign, 16> ArgLocs;
		CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

		// We could pass MIB and directly add the implicit uses to the call
		// now. However, as an aesthetic choice, place implicit argument operands
		// after the ordinary user argument registers.
		SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

		if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
		Info.CallConv != CallingConv::AMDGPU_Gfx) {
		// With a fixed ABI, allocate fixed registers before user arguments.
		if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
		return false;
		}

		// Do the actual argument marshalling.
		AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
		AssignFnVarArg, true, FPDiff);
		if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
		return false;

		handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);

		// If we have -tailcallopt, we need to adjust the stack. We'll do the call
		// sequence start and end here.
		if (!IsSibCall) {
		MIB->getOperand(1).setImm(FPDiff);
		CallSeqStart.addImm(NumBytes).addImm(0);
		// End the call sequence before emitting the call. Normally, we would
		// tidy the frame up after the call. However, here, we've laid out the
		// parameters so that when SP is reset, they will be in the correct
		// location.
		MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
		}

		// Now we can add the actual call instruction to the correct basic block.
		MIRBuilder.insertInstr(MIB);

		// If Callee is a reg, since it is used by a target specific
		// instruction, it must have a register class matching the
		// constraint of that instruction.

		// FIXME: We should define regbankselectable call instructions to handle
		// divergent call targets.
		if (MIB->getOperand(0).isReg()) {
		MIB->getOperand(0).setReg(constrainOperandRegClass(
		MF, TRI, MRI, ST.getInstrInfo(),
		ST.getRegBankInfo(), MIB, MIB->getDesc(), MIB->getOperand(0),
		0));
		}

		MF.getFrameInfo().setHasTailCall();
		Info.LoweredTailCall = true;
		return true;
		}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,		bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {		CallLoweringInfo &Info) const {
if (Info.IsVarArg) {		if (Info.IsVarArg) {
LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");		LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
return false;		return false;
}		}

MachineFunction &MF = MIRBuilder.getMF();		MachineFunction &MF = MIRBuilder.getMF();
Show All 21 Lines	bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
for (auto &OrigArg : Info.OrigArgs)		for (auto &OrigArg : Info.OrigArgs)
splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);		splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);

SmallVector<ArgInfo, 8> InArgs;		SmallVector<ArgInfo, 8> InArgs;
if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())		if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);		splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);

// If we can lower as a tail call, do that instead.		// If we can lower as a tail call, do that instead.
bool CanTailCallOpt = false;		bool CanTailCallOpt =
		isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);

// We must emit a tail call if we have musttail.		// We must emit a tail call if we have musttail.
if (Info.IsMustTailCall && !CanTailCallOpt) {		if (Info.IsMustTailCall && !CanTailCallOpt) {
LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");		LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
return false;		return false;
}		}

		if (CanTailCallOpt)
		return lowerTailCall(MIRBuilder, Info, OutArgs);

// Find out which ABI gets to decide where things go.		// Find out which ABI gets to decide where things go.
CCAssignFn *AssignFnFixed;		CCAssignFn *AssignFnFixed;
CCAssignFn *AssignFnVarArg;		CCAssignFn *AssignFnVarArg;
std::tie(AssignFnFixed, AssignFnVarArg) =		std::tie(AssignFnFixed, AssignFnVarArg) =
getAssignFnsForCC(Info.CallConv, TLI);		getAssignFnsForCC(Info.CallConv, TLI);

MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)		MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
.addImm(0)		.addImm(0)
Show All 32 Lines	bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<Register, 8> PhysRegs;		SmallVector<Register, 8> PhysRegs;
AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,		AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
AssignFnVarArg, false);		AssignFnVarArg, false);
if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))		if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
return false;		return false;

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();		const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

if (!ST.enableFlatScratch()) {		handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
MFI->getScratchRSrcReg());
MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
}

for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
MIB.addReg(ArgReg.first, RegState::Implicit);
}

// Get a count of how many bytes are to be pushed on the stack.		// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();		unsigned NumBytes = CCInfo.getNextStackOffset();

// If Callee is a reg, since it is used by a target specific		// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the		// instruction, it must have a register class matching the
// constraint of that instruction.		// constraint of that instruction.

Show All 37 Lines

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
				; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
				; This is a copy of sibling-call.ll, but stops after the IRTranslator.

				define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
				; GCN-LABEL: name: i32_fastcc_i32_i32
				; GCN: bb.1 (%ir-block.0):
				; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
				; GCN: $vgpr0 = COPY [[ADD]](s32)
				; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
				; GCN: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
				%add0 = add i32 %arg0, %arg1
				ret i32 %add0
				}

				define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
				; GCN-LABEL: name: i32_fastcc_i32_i32_stack_object
				; GCN: bb.1 (%ir-block.0):
				; GCN: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (volatile store 4 into %ir.gep, addrspace 5)
				; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
				; GCN: $vgpr0 = COPY [[ADD]](s32)
				; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
				; GCN: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
				%alloca = alloca [16 x i32], align 4, addrspace(5)
				%gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
				store volatile i32 9, i32 addrspace(5)* %gep
				%add0 = add i32 %arg0, %arg1
				ret i32 %add0
				}

				define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32
				; GCN: frameInfo:
				; GCN: hasCalls: false
				; GCN: hasTailCall: true
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY15:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY12]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY13]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY14]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY15]](s64)
				; GCN: $sgpr12 = COPY [[COPY16]](s32)
				; GCN: $sgpr13 = COPY [[COPY17]](s32)
				; GCN: $sgpr14 = COPY [[COPY18]](s32)
				; GCN: $vgpr31 = COPY [[COPY19]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
				ret i32 %ret
				}

				define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_stack_object
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (volatile store 4 into %ir.gep, addrspace 5)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY15:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY12]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY13]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY14]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY15]](s64)
				; GCN: $sgpr12 = COPY [[COPY16]](s32)
				; GCN: $sgpr13 = COPY [[COPY17]](s32)
				; GCN: $sgpr14 = COPY [[COPY18]](s32)
				; GCN: $vgpr31 = COPY [[COPY19]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%alloca = alloca [16 x i32], align 4, addrspace(5)
				%gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
				store volatile i32 9, i32 addrspace(5)* %gep
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
				ret i32 %ret
				}

				define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_callee_stack_object
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (volatile store 4 into %ir.gep, addrspace 5)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_stack_object
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY15:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY12]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY13]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY14]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY15]](s64)
				; GCN: $sgpr12 = COPY [[COPY16]](s32)
				; GCN: $sgpr13 = COPY [[COPY17]](s32)
				; GCN: $sgpr14 = COPY [[COPY18]](s32)
				; GCN: $vgpr31 = COPY [[COPY19]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_stack_object, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%alloca = alloca [16 x i32], align 4, addrspace(5)
				%gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
				store volatile i32 9, i32 addrspace(5)* %gep
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
				ret i32 %ret
				}

				define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_unused_result
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY15:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY12]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY13]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY14]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY15]](s64)
				; GCN: $sgpr12 = COPY [[COPY16]](s32)
				; GCN: $sgpr13 = COPY [[COPY17]](s32)
				; GCN: $sgpr14 = COPY [[COPY18]](s32)
				; GCN: $vgpr31 = COPY [[COPY19]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
				ret void
				}

				; It doesn't make sense to do a tail call from a kernel, so a regular call is emitted instead.
				define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
				; GCN-LABEL: name: kernel_call_i32_fastcc_i32_i32_unused_result
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
				; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
				; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16
				; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15
				; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
				; GCN: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
				; GCN: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load 8 from %ir.0, align 16, addrspace 4)
				; GCN: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<2 x s32>), [[C]](s32)
				; GCN: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[LOAD]](<2 x s32>), [[C1]](s32)
				; GCN: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[INT]], [[C2]](s64)
				; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
				; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
				; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4)
				; GCN: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
				; GCN: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C3]](s64)
				; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]]
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]]
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
				; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
				; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C4]](s32)
				; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]]
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
				; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32)
				; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
				; GCN: $vgpr0 = COPY [[EVEC]](s32)
				; GCN: $vgpr1 = COPY [[EVEC1]](s32)
				; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64)
				; GCN: $sgpr12 = COPY [[COPY14]](s32)
				; GCN: $sgpr13 = COPY [[COPY15]](s32)
				; GCN: $sgpr14 = COPY [[COPY16]](s32)
				; GCN: $vgpr31 = COPY [[OR1]](s32)
				; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @i32_fastcc_i32_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
				; GCN: S_ENDPGM 0
				entry:
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
				ret void
				}

				define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval(i32) align 4 %arg1) #1 {
				; GCN-LABEL: name: i32_fastcc_i32_byval_i32
				; GCN: bb.1 (%ir-block.0):
				; GCN: liveins: $vgpr0, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: [[COPY1:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
				; GCN: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p5) :: (dereferenceable load 4 from %ir.arg1, addrspace 5)
				; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[LOAD]]
				; GCN: $vgpr0 = COPY [[ADD]](s32)
				; GCN: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
				; GCN: S_SETPC_B64_return [[COPY3]], implicit $vgpr0
				%arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
				%add0 = add i32 %arg0, %arg1.load
				ret i32 %add0
				}

				; Tail call disallowed with byval in parent.
				define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval(i32) %b.byval, i32 %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_byval_i32_byval_parent
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: [[COPY9:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5)
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY11:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_byval_i32
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY15:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: [[COPY20:%[0-9]+]]:_(p5) = COPY $sgpr32
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C]](s32)
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
				; GCN: G_MEMCPY [[PTR_ADD]](p5), [[COPY9]](p5), [[C1]](s32), 0 :: (dereferenceable store 4 into stack, addrspace 5), (dereferenceable load 4 from %ir.b.byval, addrspace 5)
				; GCN: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY12]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY13]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY14]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY15]](s64)
				; GCN: $sgpr12 = COPY [[COPY16]](s32)
				; GCN: $sgpr13 = COPY [[COPY17]](s32)
				; GCN: $sgpr14 = COPY [[COPY18]](s32)
				; GCN: $vgpr31 = COPY [[COPY19]](s32)
				; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @i32_fastcc_i32_byval_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: ADJCALLSTACKDOWN 0, 4, implicit-def $scc
				; GCN: $vgpr0 = COPY [[COPY22]](s32)
				; GCN: [[COPY23:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY11]]
				; GCN: S_SETPC_B64_return [[COPY23]], implicit $vgpr0
				entry:
				%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) %b.byval)
				ret i32 %ret
				}

				; Tail call disallowed with byval in parent, not callee. The stack
				; usage of incoming arguments must be <= the outgoing stack
				; arguments.
				[Inline review comment, Flakebi (not done): The parent does not use byval. Is this comment correct?]
				define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_byval_i32
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.2, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.1, addrspace 5)
				; GCN: [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
				; GCN: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[C]](s32)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_byval_i32
				; GCN: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
				; GCN: G_MEMCPY [[FRAME_INDEX2]](p5), [[INTTOPTR]](p5), [[C1]](s32), 0 :: (dereferenceable store 4 into %fixed-stack.0, align 16, addrspace 5), (dereferenceable load 4 from `i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*)`, align 16, addrspace 5)
				; GCN: [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY40]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY41]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY42]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY43]](s64)
				; GCN: $sgpr12 = COPY [[COPY44]](s32)
				; GCN: $sgpr13 = COPY [[COPY45]](s32)
				; GCN: $sgpr14 = COPY [[COPY46]](s32)
				; GCN: $vgpr31 = COPY [[COPY47]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_byval_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) inttoptr (i32 16 to i32 addrspace(5)*))
				ret i32 %ret
				}

				define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
				; GCN-LABEL: name: i32_fastcc_i32_i32_a32i32
				; GCN: bb.1 (%ir-block.0):
				; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.2, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.1, addrspace 5)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.0, align 8, addrspace 5)
				; GCN: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
				; GCN: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[LOAD1]]
				; GCN: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[LOAD2]]
				; GCN: $vgpr0 = COPY [[ADD2]](s32)
				; GCN: [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]]
				; GCN: S_SETPC_B64_return [[COPY32]], implicit $vgpr0
				%val_firststack = extractvalue [32 x i32] %large, 30
				%val_laststack = extractvalue [32 x i32] %large, 31
				%add0 = add i32 %arg0, %arg1
				%add1 = add i32 %add0, %val_firststack
				%add2 = add i32 %add1, %val_laststack
				ret i32 %add2
				}

				; Caller and callee have the same fastcc signature (i32, i32, [32 x i32]).
				; The checks expect the three stack-passed values to be loaded from the
				; caller's incoming fixed stack objects (%fixed-stack.5 to %fixed-stack.3),
				; stored to %fixed-stack.2 to %fixed-stack.0, and the call to be emitted
				; as SI_TCRETURN.
				define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_a32i32
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.5, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.4, addrspace 5)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
				; GCN: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.3, align 8, addrspace 5)
				; GCN: [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
				; GCN: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: $vgpr2 = COPY [[COPY10]](s32)
				; GCN: $vgpr3 = COPY [[COPY11]](s32)
				; GCN: $vgpr4 = COPY [[COPY12]](s32)
				; GCN: $vgpr5 = COPY [[COPY13]](s32)
				; GCN: $vgpr6 = COPY [[COPY14]](s32)
				; GCN: $vgpr7 = COPY [[COPY15]](s32)
				; GCN: $vgpr8 = COPY [[COPY16]](s32)
				; GCN: $vgpr9 = COPY [[COPY17]](s32)
				; GCN: $vgpr10 = COPY [[COPY18]](s32)
				; GCN: $vgpr11 = COPY [[COPY19]](s32)
				; GCN: $vgpr12 = COPY [[COPY20]](s32)
				; GCN: $vgpr13 = COPY [[COPY21]](s32)
				; GCN: $vgpr14 = COPY [[COPY22]](s32)
				; GCN: $vgpr15 = COPY [[COPY23]](s32)
				; GCN: $vgpr16 = COPY [[COPY24]](s32)
				; GCN: $vgpr17 = COPY [[COPY25]](s32)
				; GCN: $vgpr18 = COPY [[COPY26]](s32)
				; GCN: $vgpr19 = COPY [[COPY27]](s32)
				; GCN: $vgpr20 = COPY [[COPY28]](s32)
				; GCN: $vgpr21 = COPY [[COPY29]](s32)
				; GCN: $vgpr22 = COPY [[COPY30]](s32)
				; GCN: $vgpr23 = COPY [[COPY31]](s32)
				; GCN: $vgpr24 = COPY [[COPY32]](s32)
				; GCN: $vgpr25 = COPY [[COPY33]](s32)
				; GCN: $vgpr26 = COPY [[COPY34]](s32)
				; GCN: $vgpr27 = COPY [[COPY35]](s32)
				; GCN: $vgpr28 = COPY [[COPY36]](s32)
				; GCN: $vgpr29 = COPY [[COPY37]](s32)
				; GCN: $vgpr30 = COPY [[COPY38]](s32)
				; GCN: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: G_STORE [[LOAD]](s32), [[FRAME_INDEX3]](p5) :: (store 4 into %fixed-stack.2, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: G_STORE [[LOAD1]](s32), [[FRAME_INDEX4]](p5) :: (store 4 into %fixed-stack.1, addrspace 5)
				; GCN: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: G_STORE [[LOAD2]](s32), [[FRAME_INDEX5]](p5) :: (store 4 into %fixed-stack.0, align 8, addrspace 5)
				; GCN: [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY40]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY41]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY42]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY43]](s64)
				; GCN: $sgpr12 = COPY [[COPY44]](s32)
				; GCN: $sgpr13 = COPY [[COPY45]](s32)
				; GCN: $sgpr14 = COPY [[COPY46]](s32)
				; GCN: $vgpr31 = COPY [[COPY47]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
				ret i32 %ret
				}

				; Same caller/callee signature as sibling_call_i32_fastcc_i32_i32_a32i32, but
				; the caller also has a local stack object: a [16 x i32] alloca whose element
				; 5 is written with a volatile store. The checks still expect the call to be
				; emitted as SI_TCRETURN.
				define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.5, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.4, addrspace 5)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
				; GCN: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.3, align 8, addrspace 5)
				; GCN: [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (volatile store 4 into %ir.gep, addrspace 5)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
				; GCN: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: $vgpr2 = COPY [[COPY10]](s32)
				; GCN: $vgpr3 = COPY [[COPY11]](s32)
				; GCN: $vgpr4 = COPY [[COPY12]](s32)
				; GCN: $vgpr5 = COPY [[COPY13]](s32)
				; GCN: $vgpr6 = COPY [[COPY14]](s32)
				; GCN: $vgpr7 = COPY [[COPY15]](s32)
				; GCN: $vgpr8 = COPY [[COPY16]](s32)
				; GCN: $vgpr9 = COPY [[COPY17]](s32)
				; GCN: $vgpr10 = COPY [[COPY18]](s32)
				; GCN: $vgpr11 = COPY [[COPY19]](s32)
				; GCN: $vgpr12 = COPY [[COPY20]](s32)
				; GCN: $vgpr13 = COPY [[COPY21]](s32)
				; GCN: $vgpr14 = COPY [[COPY22]](s32)
				; GCN: $vgpr15 = COPY [[COPY23]](s32)
				; GCN: $vgpr16 = COPY [[COPY24]](s32)
				; GCN: $vgpr17 = COPY [[COPY25]](s32)
				; GCN: $vgpr18 = COPY [[COPY26]](s32)
				; GCN: $vgpr19 = COPY [[COPY27]](s32)
				; GCN: $vgpr20 = COPY [[COPY28]](s32)
				; GCN: $vgpr21 = COPY [[COPY29]](s32)
				; GCN: $vgpr22 = COPY [[COPY30]](s32)
				; GCN: $vgpr23 = COPY [[COPY31]](s32)
				; GCN: $vgpr24 = COPY [[COPY32]](s32)
				; GCN: $vgpr25 = COPY [[COPY33]](s32)
				; GCN: $vgpr26 = COPY [[COPY34]](s32)
				; GCN: $vgpr27 = COPY [[COPY35]](s32)
				; GCN: $vgpr28 = COPY [[COPY36]](s32)
				; GCN: $vgpr29 = COPY [[COPY37]](s32)
				; GCN: $vgpr30 = COPY [[COPY38]](s32)
				; GCN: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: G_STORE [[LOAD]](s32), [[FRAME_INDEX4]](p5) :: (store 4 into %fixed-stack.2, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: G_STORE [[LOAD1]](s32), [[FRAME_INDEX5]](p5) :: (store 4 into %fixed-stack.1, addrspace 5)
				; GCN: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: G_STORE [[LOAD2]](s32), [[FRAME_INDEX6]](p5) :: (store 4 into %fixed-stack.0, align 8, addrspace 5)
				; GCN: [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY40]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY41]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY42]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY43]](s64)
				; GCN: $sgpr12 = COPY [[COPY44]](s32)
				; GCN: $sgpr13 = COPY [[COPY45]](s32)
				; GCN: $sgpr14 = COPY [[COPY46]](s32)
				; GCN: $vgpr31 = COPY [[COPY47]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%alloca = alloca [16 x i32], align 4, addrspace(5)
				%gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
				store volatile i32 9, i32 addrspace(5)* %gep
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
				ret i32 %ret
				}

				; If the callee requires more stack argument space than the caller,
				; don't do a tail call.
				; TODO: Do we really need this restriction?
				; Here the caller takes only (i32, i32), while the callee additionally takes
				; a [32 x i32] (passed as zeroinitializer). The checks expect an ordinary
				; SI_CALL bracketed by ADJCALLSTACKUP/ADJCALLSTACKDOWN, with three 4-byte
				; stores at offsets 0, 4 and 8 from $sgpr32 for the stack-passed values.
				define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
				; GCN-LABEL: name: no_sibling_call_callee_more_stack_space
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
				; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
				; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: $vgpr2 = COPY [[C]](s32)
				; GCN: $vgpr3 = COPY [[C]](s32)
				; GCN: $vgpr4 = COPY [[C]](s32)
				; GCN: $vgpr5 = COPY [[C]](s32)
				; GCN: $vgpr6 = COPY [[C]](s32)
				; GCN: $vgpr7 = COPY [[C]](s32)
				; GCN: $vgpr8 = COPY [[C]](s32)
				; GCN: $vgpr9 = COPY [[C]](s32)
				; GCN: $vgpr10 = COPY [[C]](s32)
				; GCN: $vgpr11 = COPY [[C]](s32)
				; GCN: $vgpr12 = COPY [[C]](s32)
				; GCN: $vgpr13 = COPY [[C]](s32)
				; GCN: $vgpr14 = COPY [[C]](s32)
				; GCN: $vgpr15 = COPY [[C]](s32)
				; GCN: $vgpr16 = COPY [[C]](s32)
				; GCN: $vgpr17 = COPY [[C]](s32)
				; GCN: $vgpr18 = COPY [[C]](s32)
				; GCN: $vgpr19 = COPY [[C]](s32)
				; GCN: $vgpr20 = COPY [[C]](s32)
				; GCN: $vgpr21 = COPY [[C]](s32)
				; GCN: $vgpr22 = COPY [[C]](s32)
				; GCN: $vgpr23 = COPY [[C]](s32)
				; GCN: $vgpr24 = COPY [[C]](s32)
				; GCN: $vgpr25 = COPY [[C]](s32)
				; GCN: $vgpr26 = COPY [[C]](s32)
				; GCN: $vgpr27 = COPY [[C]](s32)
				; GCN: $vgpr28 = COPY [[C]](s32)
				; GCN: $vgpr29 = COPY [[C]](s32)
				; GCN: $vgpr30 = COPY [[C]](s32)
				; GCN: [[COPY19:%[0-9]+]]:_(p5) = COPY $sgpr32
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C1]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (store 4 into stack, align 16, addrspace 5)
				; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
				; GCN: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C2]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store 4 into stack + 4, addrspace 5)
				; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
				; GCN: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C3]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD2]](p5) :: (store 4 into stack + 8, align 8, addrspace 5)
				; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY12]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY13]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY14]](s64)
				; GCN: $sgpr12 = COPY [[COPY15]](s32)
				; GCN: $sgpr13 = COPY [[COPY16]](s32)
				; GCN: $sgpr14 = COPY [[COPY17]](s32)
				; GCN: $vgpr31 = COPY [[COPY18]](s32)
				; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @i32_fastcc_i32_i32_a32i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: ADJCALLSTACKDOWN 0, 12, implicit-def $scc
				; GCN: $vgpr0 = COPY [[COPY21]](s32)
				; GCN: [[COPY22:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
				; GCN: S_SETPC_B64_return [[COPY22]], implicit $vgpr0
				entry:
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
				ret i32 %ret
				}

				; Have another non-tail call in the function.
				; The first call is marked `tail` in the IR, but its result is passed as an
				; argument to the second call. The checks expect SI_CALL (with
				; ADJCALLSTACKUP/ADJCALLSTACKDOWN) for the first call and SI_TCRETURN only
				; for the final call.
				define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
				; GCN-LABEL: name: sibling_call_i32_fastcc_i32_i32_other_call
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32
				; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY14:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY15:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY12]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY13]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY14]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY15]](s64)
				; GCN: $sgpr12 = COPY [[COPY16]](s32)
				; GCN: $sgpr13 = COPY [[COPY17]](s32)
				; GCN: $sgpr14 = COPY [[COPY18]](s32)
				; GCN: $vgpr31 = COPY [[COPY19]](s32)
				; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @i32_fastcc_i32_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
				; GCN: [[GV1:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @sibling_call_i32_fastcc_i32_i32
				; GCN: [[COPY22:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY23:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY24:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY25:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: $vgpr2 = COPY [[COPY21]](s32)
				; GCN: [[COPY30:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY30]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY22]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY23]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY24]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY25]](s64)
				; GCN: $sgpr12 = COPY [[COPY26]](s32)
				; GCN: $sgpr13 = COPY [[COPY27]](s32)
				; GCN: $sgpr14 = COPY [[COPY28]](s32)
				; GCN: $vgpr31 = COPY [[COPY29]](s32)
				; GCN: SI_TCRETURN [[GV1]](p0), @sibling_call_i32_fastcc_i32_i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
				%ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
				ret i32 %ret
				}

				; Have stack object in caller and stack passed arguments. SP should be
				; in same place at function exit.
				; The checks contain no ADJCALLSTACKUP/ADJCALLSTACKDOWN pair and end in
				; SI_TCRETURN.
				; NOTE(review): the IR body is the same as in
				; sibling_call_i32_fastcc_i32_i32_a32i32_stack_object; confirm that both tests
				; are intentionally kept.
				define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
				; GCN-LABEL: name: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.5, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.4, addrspace 5)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
				; GCN: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.3, align 8, addrspace 5)
				; GCN: [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (volatile store 4 into %ir.gep, addrspace 5)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
				; GCN: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: $vgpr2 = COPY [[COPY10]](s32)
				; GCN: $vgpr3 = COPY [[COPY11]](s32)
				; GCN: $vgpr4 = COPY [[COPY12]](s32)
				; GCN: $vgpr5 = COPY [[COPY13]](s32)
				; GCN: $vgpr6 = COPY [[COPY14]](s32)
				; GCN: $vgpr7 = COPY [[COPY15]](s32)
				; GCN: $vgpr8 = COPY [[COPY16]](s32)
				; GCN: $vgpr9 = COPY [[COPY17]](s32)
				; GCN: $vgpr10 = COPY [[COPY18]](s32)
				; GCN: $vgpr11 = COPY [[COPY19]](s32)
				; GCN: $vgpr12 = COPY [[COPY20]](s32)
				; GCN: $vgpr13 = COPY [[COPY21]](s32)
				; GCN: $vgpr14 = COPY [[COPY22]](s32)
				; GCN: $vgpr15 = COPY [[COPY23]](s32)
				; GCN: $vgpr16 = COPY [[COPY24]](s32)
				; GCN: $vgpr17 = COPY [[COPY25]](s32)
				; GCN: $vgpr18 = COPY [[COPY26]](s32)
				; GCN: $vgpr19 = COPY [[COPY27]](s32)
				; GCN: $vgpr20 = COPY [[COPY28]](s32)
				; GCN: $vgpr21 = COPY [[COPY29]](s32)
				; GCN: $vgpr22 = COPY [[COPY30]](s32)
				; GCN: $vgpr23 = COPY [[COPY31]](s32)
				; GCN: $vgpr24 = COPY [[COPY32]](s32)
				; GCN: $vgpr25 = COPY [[COPY33]](s32)
				; GCN: $vgpr26 = COPY [[COPY34]](s32)
				; GCN: $vgpr27 = COPY [[COPY35]](s32)
				; GCN: $vgpr28 = COPY [[COPY36]](s32)
				; GCN: $vgpr29 = COPY [[COPY37]](s32)
				; GCN: $vgpr30 = COPY [[COPY38]](s32)
				; GCN: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: G_STORE [[LOAD]](s32), [[FRAME_INDEX4]](p5) :: (store 4 into %fixed-stack.2, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: G_STORE [[LOAD1]](s32), [[FRAME_INDEX5]](p5) :: (store 4 into %fixed-stack.1, addrspace 5)
				; GCN: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: G_STORE [[LOAD2]](s32), [[FRAME_INDEX6]](p5) :: (store 4 into %fixed-stack.0, align 8, addrspace 5)
				; GCN: [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY40]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY41]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY42]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY43]](s64)
				; GCN: $sgpr12 = COPY [[COPY44]](s32)
				; GCN: $sgpr13 = COPY [[COPY45]](s32)
				; GCN: $sgpr14 = COPY [[COPY46]](s32)
				; GCN: $vgpr31 = COPY [[COPY47]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%alloca = alloca [16 x i32], align 4, addrspace(5)
				%gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
				store volatile i32 9, i32 addrspace(5)* %gep
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
				ret i32 %ret
				}

				; Sibling call from a caller whose incoming array argument ([36 x i32]) is
				; larger than the [32 x i32] it passes to the callee, and which also owns a
				; local stack object (%alloca).
				; The checks below expect seven loads of incoming stack-passed values
				; (%fixed-stack.9 through %fixed-stack.3), three stores of the zero constant
				; to outgoing stack slots (%fixed-stack.2 through %fixed-stack.0), and the
				; call itself emitted as SI_TCRETURN.
				define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
				; GCN-LABEL: name: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.9, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.8, addrspace 5)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7
				; GCN: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.7, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6
				; GCN: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load 4 from %fixed-stack.6, addrspace 5)
				; GCN: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5
				; GCN: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load 4 from %fixed-stack.5, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
				; GCN: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load 4 from %fixed-stack.4, addrspace 5)
				; GCN: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
				; GCN: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load 4 from %fixed-stack.3, align 8, addrspace 5)
				; GCN: [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
				; GCN: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
				; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX7]], [[C2]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (volatile store 4 into %ir.gep, addrspace 5)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @i32_fastcc_i32_i32_a32i32
				; GCN: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: $vgpr1 = COPY [[COPY9]](s32)
				; GCN: $vgpr2 = COPY [[C1]](s32)
				; GCN: $vgpr3 = COPY [[C1]](s32)
				; GCN: $vgpr4 = COPY [[C1]](s32)
				; GCN: $vgpr5 = COPY [[C1]](s32)
				; GCN: $vgpr6 = COPY [[C1]](s32)
				; GCN: $vgpr7 = COPY [[C1]](s32)
				; GCN: $vgpr8 = COPY [[C1]](s32)
				; GCN: $vgpr9 = COPY [[C1]](s32)
				; GCN: $vgpr10 = COPY [[C1]](s32)
				; GCN: $vgpr11 = COPY [[C1]](s32)
				; GCN: $vgpr12 = COPY [[C1]](s32)
				; GCN: $vgpr13 = COPY [[C1]](s32)
				; GCN: $vgpr14 = COPY [[C1]](s32)
				; GCN: $vgpr15 = COPY [[C1]](s32)
				; GCN: $vgpr16 = COPY [[C1]](s32)
				; GCN: $vgpr17 = COPY [[C1]](s32)
				; GCN: $vgpr18 = COPY [[C1]](s32)
				; GCN: $vgpr19 = COPY [[C1]](s32)
				; GCN: $vgpr20 = COPY [[C1]](s32)
				; GCN: $vgpr21 = COPY [[C1]](s32)
				; GCN: $vgpr22 = COPY [[C1]](s32)
				; GCN: $vgpr23 = COPY [[C1]](s32)
				; GCN: $vgpr24 = COPY [[C1]](s32)
				; GCN: $vgpr25 = COPY [[C1]](s32)
				; GCN: $vgpr26 = COPY [[C1]](s32)
				; GCN: $vgpr27 = COPY [[C1]](s32)
				; GCN: $vgpr28 = COPY [[C1]](s32)
				; GCN: $vgpr29 = COPY [[C1]](s32)
				; GCN: $vgpr30 = COPY [[C1]](s32)
				; GCN: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: G_STORE [[C1]](s32), [[FRAME_INDEX8]](p5) :: (store 4 into %fixed-stack.2, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: G_STORE [[C1]](s32), [[FRAME_INDEX9]](p5) :: (store 4 into %fixed-stack.1, addrspace 5)
				; GCN: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: G_STORE [[C1]](s32), [[FRAME_INDEX10]](p5) :: (store 4 into %fixed-stack.0, align 8, addrspace 5)
				; GCN: [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY40]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY41]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY42]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY43]](s64)
				; GCN: $sgpr12 = COPY [[COPY44]](s32)
				; GCN: $sgpr13 = COPY [[COPY45]](s32)
				; GCN: $sgpr14 = COPY [[COPY46]](s32)
				; GCN: $vgpr31 = COPY [[COPY47]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @i32_fastcc_i32_i32_a32i32, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				; Local stack object of the caller: volatile store of 9 to element 5 of a
				; [16 x i32] alloca, i.e. byte offset 20 as seen in the G_PTR_ADD check.
				%alloca = alloca [16 x i32], align 4, addrspace(5)
				%gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
				store volatile i32 9, i32 addrspace(5)* %gep
				; %c is not forwarded; the callee receives an all-zero [32 x i32] instead.
				%ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
				ret i32 %ret
				}

				; Callee taking an i32 followed by two byval pointers in addrspace(5):
				; a [3 x i32] with align 16 and a [2 x i64] with no explicit alignment.
				; NOTE(review): this declaration carries no `fastcc`, while the tail call in
				; @sibling_call_fastcc_multi_byval uses `fastcc` - confirm the calling
				; convention mismatch is intended.
				declare hidden void @void_fastcc_multi_byval(i32 %a, [3 x i32] addrspace(5)* byval([3 x i32]) align 16, [2 x i64] addrspace(5)* byval([2 x i64]))

				; Sibling call that passes two byval arguments taken from local allocas.
				; The caller itself receives an i32 and an unnamed [64 x i32]; the checks
				; below expect 34 loads of incoming stack-passed values (%fixed-stack.35
				; through %fixed-stack.2).
				; For the outgoing call the checks expect %a in $vgpr0, a 12-byte G_MEMCPY
				; into %fixed-stack.1, a 16-byte G_MEMCPY into %fixed-stack.0, and the call
				; emitted as SI_TCRETURN.
				define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
				; GCN-LABEL: name: sibling_call_fastcc_multi_byval
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.35
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.35, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.34
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.34, addrspace 5)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.33
				; GCN: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.33, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.32
				; GCN: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load 4 from %fixed-stack.32, addrspace 5)
				; GCN: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.31
				; GCN: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load 4 from %fixed-stack.31, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.30
				; GCN: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load 4 from %fixed-stack.30, addrspace 5)
				; GCN: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.29
				; GCN: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load 4 from %fixed-stack.29, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.28
				; GCN: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load 4 from %fixed-stack.28, addrspace 5)
				; GCN: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.27
				; GCN: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load 4 from %fixed-stack.27, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.26
				; GCN: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load 4 from %fixed-stack.26, addrspace 5)
				; GCN: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.25
				; GCN: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load 4 from %fixed-stack.25, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.24
				; GCN: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load 4 from %fixed-stack.24, addrspace 5)
				; GCN: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.23
				; GCN: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load 4 from %fixed-stack.23, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.22
				; GCN: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load 4 from %fixed-stack.22, addrspace 5)
				; GCN: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.21
				; GCN: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load 4 from %fixed-stack.21, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.20
				; GCN: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load 4 from %fixed-stack.20, addrspace 5)
				; GCN: [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.19
				; GCN: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load 4 from %fixed-stack.19, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX17:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.18
				; GCN: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX17]](p5) :: (invariant load 4 from %fixed-stack.18, addrspace 5)
				; GCN: [[FRAME_INDEX18:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.17
				; GCN: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX18]](p5) :: (invariant load 4 from %fixed-stack.17, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX19:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.16
				; GCN: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX19]](p5) :: (invariant load 4 from %fixed-stack.16, addrspace 5)
				; GCN: [[FRAME_INDEX20:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15
				; GCN: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX20]](p5) :: (invariant load 4 from %fixed-stack.15, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX21:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14
				; GCN: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX21]](p5) :: (invariant load 4 from %fixed-stack.14, addrspace 5)
				; GCN: [[FRAME_INDEX22:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13
				; GCN: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX22]](p5) :: (invariant load 4 from %fixed-stack.13, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX23:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12
				; GCN: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX23]](p5) :: (invariant load 4 from %fixed-stack.12, addrspace 5)
				; GCN: [[FRAME_INDEX24:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11
				; GCN: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX24]](p5) :: (invariant load 4 from %fixed-stack.11, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX25:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10
				; GCN: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX25]](p5) :: (invariant load 4 from %fixed-stack.10, addrspace 5)
				; GCN: [[FRAME_INDEX26:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9
				; GCN: [[LOAD26:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX26]](p5) :: (invariant load 4 from %fixed-stack.9, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX27:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8
				; GCN: [[LOAD27:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX27]](p5) :: (invariant load 4 from %fixed-stack.8, addrspace 5)
				; GCN: [[FRAME_INDEX28:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7
				; GCN: [[LOAD28:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX28]](p5) :: (invariant load 4 from %fixed-stack.7, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX29:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6
				; GCN: [[LOAD29:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX29]](p5) :: (invariant load 4 from %fixed-stack.6, addrspace 5)
				; GCN: [[FRAME_INDEX30:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5
				; GCN: [[LOAD30:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX30]](p5) :: (invariant load 4 from %fixed-stack.5, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX31:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
				; GCN: [[LOAD31:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX31]](p5) :: (invariant load 4 from %fixed-stack.4, addrspace 5)
				; GCN: [[FRAME_INDEX32:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
				; GCN: [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX32]](p5) :: (invariant load 4 from %fixed-stack.3, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX33:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: [[LOAD33:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX33]](p5) :: (invariant load 4 from %fixed-stack.2, addrspace 5)
				; GCN: [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
				; GCN: [[FRAME_INDEX34:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca0
				; GCN: [[FRAME_INDEX35:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1.alloca1
				; GCN: G_STORE [[C]](s32), [[FRAME_INDEX34]](p5) :: (store 4 into %ir.alloca0, addrspace 5)
				; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C2]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (store 4 into %ir.alloca0 + 4, addrspace 5)
				; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
				; GCN: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store 4 into %ir.alloca0 + 8, addrspace 5)
				; GCN: G_STORE [[C1]](s64), [[FRAME_INDEX35]](p5) :: (store 8 into %ir.alloca1, addrspace 5)
				; GCN: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX35]], [[C3]](s32)
				; GCN: G_STORE [[C1]](s64), [[PTR_ADD2]](p5) :: (store 8 into %ir.alloca1 + 8, addrspace 5)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_fastcc_multi_byval
				; GCN: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: $vgpr0 = COPY [[COPY8]](s32)
				; GCN: [[FRAME_INDEX36:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
				; GCN: G_MEMCPY [[FRAME_INDEX36]](p5), [[FRAME_INDEX34]](p5), [[C4]](s32), 0 :: (dereferenceable store 12 into %fixed-stack.1, align 16, addrspace 5), (dereferenceable load 12 from %ir.alloca0, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX37:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
				; GCN: G_MEMCPY [[FRAME_INDEX37]](p5), [[FRAME_INDEX35]](p5), [[C5]](s32), 0 :: (dereferenceable store 16 into %fixed-stack.0, addrspace 5), (dereferenceable load 16 from %ir.alloca1, align 8, addrspace 5)
				; GCN: [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY40]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY41]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY42]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY43]](s64)
				; GCN: $sgpr12 = COPY [[COPY44]](s32)
				; GCN: $sgpr13 = COPY [[COPY45]](s32)
				; GCN: $sgpr14 = COPY [[COPY46]](s32)
				; GCN: $vgpr31 = COPY [[COPY47]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @void_fastcc_multi_byval, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				; Backing storage for the two byval arguments: a 16-byte-aligned [3 x i32]
				; filled with 9s and an 8-byte-aligned [2 x i64] filled with zeros.
				%alloca0 = alloca [3 x i32], align 16, addrspace(5)
				%alloca1 = alloca [2 x i64], align 8, addrspace(5)
				store [3 x i32] [i32 9, i32 9, i32 9], [3 x i32] addrspace(5)* %alloca0
				store [2 x i64] zeroinitializer, [2 x i64] addrspace(5)* %alloca1
				; Only %a and the two byval pointers are passed; the unnamed [64 x i32]
				; parameter is not forwarded.
				tail call fastcc void @void_fastcc_multi_byval(i32 %a, [3 x i32] addrspace(5)* byval([3 x i32]) %alloca0, [2 x i64] addrspace(5)* byval([2 x i64]) %alloca1)
				ret void
				}

				; Callee taking a byval [3 x i32] pointer in addrspace(5) with align 16,
				; followed by a [32 x i32] array and a trailing i32.
				; NOTE(review): declared without `fastcc` despite the name - confirm this
				; matches the calling convention used at its call sites.
				declare hidden void @void_fastcc_byval_and_stack_passed([3 x i32] addrspace(5)* byval([3 x i32]) align 16, [32 x i32], i32)

				; Callee has a byval and non-byval stack passed argument
				define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
				; GCN-LABEL: name: sibling_call_byval_and_stack_passed
				; GCN: bb.1.entry:
				; GCN: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; GCN: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; GCN: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
				; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
				; GCN: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
				; GCN: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
				; GCN: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
				; GCN: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
				; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
				; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr7
				; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr8
				; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr9
				; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr10
				; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr11
				; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr12
				; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr13
				; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr14
				; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr15
				; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr16
				; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr17
				; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr18
				; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr19
				; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr20
				; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr21
				; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr22
				; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr23
				; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr24
				; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr25
				; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr26
				; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr27
				; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr28
				; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr29
				; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr30
				; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.36
				; GCN: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.36, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.35
				; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.35, addrspace 5)
				; GCN: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.34
				; GCN: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.34, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.33
				; GCN: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load 4 from %fixed-stack.33, addrspace 5)
				; GCN: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.32
				; GCN: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load 4 from %fixed-stack.32, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.31
				; GCN: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load 4 from %fixed-stack.31, addrspace 5)
				; GCN: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.30
				; GCN: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load 4 from %fixed-stack.30, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.29
				; GCN: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load 4 from %fixed-stack.29, addrspace 5)
				; GCN: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.28
				; GCN: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load 4 from %fixed-stack.28, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.27
				; GCN: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load 4 from %fixed-stack.27, addrspace 5)
				; GCN: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.26
				; GCN: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load 4 from %fixed-stack.26, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.25
				; GCN: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load 4 from %fixed-stack.25, addrspace 5)
				; GCN: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.24
				; GCN: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load 4 from %fixed-stack.24, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.23
				; GCN: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load 4 from %fixed-stack.23, addrspace 5)
				; GCN: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.22
				; GCN: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load 4 from %fixed-stack.22, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.21
				; GCN: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load 4 from %fixed-stack.21, addrspace 5)
				; GCN: [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.20
				; GCN: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load 4 from %fixed-stack.20, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX17:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.19
				; GCN: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX17]](p5) :: (invariant load 4 from %fixed-stack.19, addrspace 5)
				; GCN: [[FRAME_INDEX18:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.18
				; GCN: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX18]](p5) :: (invariant load 4 from %fixed-stack.18, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX19:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.17
				; GCN: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX19]](p5) :: (invariant load 4 from %fixed-stack.17, addrspace 5)
				; GCN: [[FRAME_INDEX20:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.16
				; GCN: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX20]](p5) :: (invariant load 4 from %fixed-stack.16, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX21:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15
				; GCN: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX21]](p5) :: (invariant load 4 from %fixed-stack.15, addrspace 5)
				; GCN: [[FRAME_INDEX22:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14
				; GCN: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX22]](p5) :: (invariant load 4 from %fixed-stack.14, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX23:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13
				; GCN: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX23]](p5) :: (invariant load 4 from %fixed-stack.13, addrspace 5)
				; GCN: [[FRAME_INDEX24:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12
				; GCN: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX24]](p5) :: (invariant load 4 from %fixed-stack.12, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX25:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11
				; GCN: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX25]](p5) :: (invariant load 4 from %fixed-stack.11, addrspace 5)
				; GCN: [[FRAME_INDEX26:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10
				; GCN: [[LOAD26:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX26]](p5) :: (invariant load 4 from %fixed-stack.10, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX27:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9
				; GCN: [[LOAD27:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX27]](p5) :: (invariant load 4 from %fixed-stack.9, addrspace 5)
				; GCN: [[FRAME_INDEX28:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8
				; GCN: [[LOAD28:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX28]](p5) :: (invariant load 4 from %fixed-stack.8, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX29:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7
				; GCN: [[LOAD29:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX29]](p5) :: (invariant load 4 from %fixed-stack.7, addrspace 5)
				; GCN: [[FRAME_INDEX30:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6
				; GCN: [[LOAD30:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX30]](p5) :: (invariant load 4 from %fixed-stack.6, align 8, addrspace 5)
				; GCN: [[FRAME_INDEX31:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5
				; GCN: [[LOAD31:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX31]](p5) :: (invariant load 4 from %fixed-stack.5, addrspace 5)
				; GCN: [[FRAME_INDEX32:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
				; GCN: [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX32]](p5) :: (invariant load 4 from %fixed-stack.4, align 16, addrspace 5)
				; GCN: [[FRAME_INDEX33:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
				; GCN: [[LOAD33:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX33]](p5) :: (invariant load 4 from %fixed-stack.3, addrspace 5)
				; GCN: [[COPY39:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
				; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
				; GCN: [[FRAME_INDEX34:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.alloca
				; GCN: G_STORE [[C]](s32), [[FRAME_INDEX34]](p5) :: (store 4 into %ir.alloca, addrspace 5)
				; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
				; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C2]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD]](p5) :: (store 4 into %ir.alloca + 4, addrspace 5)
				; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
				; GCN: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX34]], [[C3]](s32)
				; GCN: G_STORE [[C]](s32), [[PTR_ADD1]](p5) :: (store 4 into %ir.alloca + 8, addrspace 5)
				; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_fastcc_byval_and_stack_passed
				; GCN: [[COPY40:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; GCN: [[COPY41:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; GCN: [[COPY42:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; GCN: [[COPY43:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; GCN: [[FRAME_INDEX35:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
				; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
				; GCN: G_MEMCPY [[FRAME_INDEX35]](p5), [[FRAME_INDEX34]](p5), [[C4]](s32), 0 :: (dereferenceable store 12 into %fixed-stack.2, align 16, addrspace 5), (dereferenceable load 12 from %ir.alloca, align 16, addrspace 5)
				; GCN: $vgpr0 = COPY [[C1]](s32)
				; GCN: $vgpr1 = COPY [[C1]](s32)
				; GCN: $vgpr2 = COPY [[C1]](s32)
				; GCN: $vgpr3 = COPY [[C1]](s32)
				; GCN: $vgpr4 = COPY [[C1]](s32)
				; GCN: $vgpr5 = COPY [[C1]](s32)
				; GCN: $vgpr6 = COPY [[C1]](s32)
				; GCN: $vgpr7 = COPY [[C1]](s32)
				; GCN: $vgpr8 = COPY [[C1]](s32)
				; GCN: $vgpr9 = COPY [[C1]](s32)
				; GCN: $vgpr10 = COPY [[C1]](s32)
				; GCN: $vgpr11 = COPY [[C1]](s32)
				; GCN: $vgpr12 = COPY [[C1]](s32)
				; GCN: $vgpr13 = COPY [[C1]](s32)
				; GCN: $vgpr14 = COPY [[C1]](s32)
				; GCN: $vgpr15 = COPY [[C1]](s32)
				; GCN: $vgpr16 = COPY [[C1]](s32)
				; GCN: $vgpr17 = COPY [[C1]](s32)
				; GCN: $vgpr18 = COPY [[C1]](s32)
				; GCN: $vgpr19 = COPY [[C1]](s32)
				; GCN: $vgpr20 = COPY [[C1]](s32)
				; GCN: $vgpr21 = COPY [[C1]](s32)
				; GCN: $vgpr22 = COPY [[C1]](s32)
				; GCN: $vgpr23 = COPY [[C1]](s32)
				; GCN: $vgpr24 = COPY [[C1]](s32)
				; GCN: $vgpr25 = COPY [[C1]](s32)
				; GCN: $vgpr26 = COPY [[C1]](s32)
				; GCN: $vgpr27 = COPY [[C1]](s32)
				; GCN: $vgpr28 = COPY [[C1]](s32)
				; GCN: $vgpr29 = COPY [[C1]](s32)
				; GCN: $vgpr30 = COPY [[C1]](s32)
				; GCN: [[FRAME_INDEX36:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
				; GCN: G_STORE [[C1]](s32), [[FRAME_INDEX36]](p5) :: (store 4 into %fixed-stack.1, addrspace 5)
				; GCN: [[FRAME_INDEX37:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
				; GCN: G_STORE [[COPY8]](s32), [[FRAME_INDEX37]](p5) :: (store 4 into %fixed-stack.0, align 16, addrspace 5)
				; GCN: [[COPY48:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY48]](<4 x s32>)
				; GCN: $sgpr4_sgpr5 = COPY [[COPY40]](p4)
				; GCN: $sgpr6_sgpr7 = COPY [[COPY41]](p4)
				; GCN: $sgpr8_sgpr9 = COPY [[COPY42]](p4)
				; GCN: $sgpr10_sgpr11 = COPY [[COPY43]](s64)
				; GCN: $sgpr12 = COPY [[COPY44]](s32)
				; GCN: $sgpr13 = COPY [[COPY45]](s32)
				; GCN: $sgpr14 = COPY [[COPY46]](s32)
				; GCN: $vgpr31 = COPY [[COPY47]](s32)
				; GCN: SI_TCRETURN [[GV]](p0), @void_fastcc_byval_and_stack_passed, 0, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				entry:
				%alloca = alloca [3 x i32], align 16, addrspace(5)
				store [3 x i32] [i32 9, i32 9, i32 9], [3 x i32] addrspace(5)* %alloca
				tail call fastcc void @void_fastcc_byval_and_stack_passed([3 x i32] addrspace(5)* byval([3 x i32]) %alloca, [32 x i32] zeroinitializer, i32 %stack.out.arg)
				ret void
				}

				attributes #0 = { nounwind }
				attributes #1 = { nounwind noinline }

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-tail-call.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
				; RUN: llc -global-isel -amdgpu-fixed-function-abi -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s

				declare hidden void @external_void_func_void()

				; Lowering test for a `tail call` of the hidden, argument-less
				; @external_void_func_void from a function that uses the default calling
				; convention. The autogenerated assertions below expect the call to be
				; emitted as SI_CALL bracketed by ADJCALLSTACKUP/ADJCALLSTACKDOWN and
				; followed by S_SETPC_B64_return, rather than as SI_TCRETURN.
				; NOTE(review): presumably tail-call lowering is intentionally not applied
				; here -- confirm against the call-lowering eligibility checks.
				define void @tail_call_void_func_void() {
				; CHECK-LABEL: name: tail_call_void_func_void
				; CHECK: bb.1 (%ir-block.0):
				; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
				; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
				; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14
				; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13
				; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12
				; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
				; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
				; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
				; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
				; CHECK: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
				; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc
				; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_void
				; CHECK: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
				; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY6]]
				; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY5]]
				; CHECK: [[COPY12:%[0-9]+]]:_(s64) = COPY [[COPY4]]
				; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY3]]
				; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY2]]
				; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY1]]
				; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
				; CHECK: [[COPY17:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
				; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY17]](<4 x s32>)
				; CHECK: $sgpr4_sgpr5 = COPY [[COPY9]](p4)
				; CHECK: $sgpr6_sgpr7 = COPY [[COPY10]](p4)
				; CHECK: $sgpr8_sgpr9 = COPY [[COPY11]](p4)
				; CHECK: $sgpr10_sgpr11 = COPY [[COPY12]](s64)
				; CHECK: $sgpr12 = COPY [[COPY13]](s32)
				; CHECK: $sgpr13 = COPY [[COPY14]](s32)
				; CHECK: $sgpr14 = COPY [[COPY15]](s32)
				; CHECK: $vgpr31 = COPY [[COPY16]](s32)
				; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
				; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
				; CHECK: [[COPY18:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
				; CHECK: S_SETPC_B64_return [[COPY18]]
				tail call void @external_void_func_void()
				ret void
				}

llvm/test/CodeGen/AMDGPU/call-constant.ll

Show All 13 Lines	define amdgpu_kernel void @test_call_undef() #0 {
store volatile i32 %op, i32 addrspace(1)* undef		store volatile i32 %op, i32 addrspace(1)* undef
ret void		ret void
}		}

; GCN-LABEL: {{^}}test_tail_call_undef:		; GCN-LABEL: {{^}}test_tail_call_undef:
; SDAG: s_waitcnt		; SDAG: s_waitcnt
; SDAG-NEXT: .Lfunc_end		; SDAG-NEXT: .Lfunc_end

; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}		; GISEL: s_setpc_b64 s{{\[[0-9]+:[0-9]+\]}}
define i32 @test_tail_call_undef() #0 {		define i32 @test_tail_call_undef() #0 {
%call = tail call i32 undef(i32 1)		%call = tail call i32 undef(i32 1)
ret i32 %call		ret i32 %call
}		}

; GCN-LABEL: {{^}}test_call_null:		; GCN-LABEL: {{^}}test_call_null:
; SDAG: s_mov_b32 flat_scratch_lo, s5		; SDAG: s_mov_b32 flat_scratch_lo, s5
; SDAG: s_add_u32 s4, s4, s7		; SDAG: s_add_u32 s4, s4, s7
; SDAG: s_lshr_b32		; SDAG: s_lshr_b32

; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}		; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm		; GCN: s_endpgm
define amdgpu_kernel void @test_call_null() #0 {		define amdgpu_kernel void @test_call_null() #0 {
%val = call i32 null(i32 1)		%val = call i32 null(i32 1)
%op = add i32 %val, 1		%op = add i32 %val, 1
store volatile i32 %op, i32 addrspace(1)* null		store volatile i32 %op, i32 addrspace(1)* null
ret void		ret void
}		}

; GCN-LABEL: {{^}}test_tail_call_null:		; GCN-LABEL: {{^}}test_tail_call_null:
; SDAG: s_waitcnt		; SDAG: s_waitcnt
; SDAG-NEXT: .Lfunc_end		; SDAG-NEXT: .Lfunc_end

; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}		; GISEL: s_setpc_b64 s{{\[[0-9]+:[0-9]+\]$}}
define i32 @test_tail_call_null() #0 {		define i32 @test_tail_call_null() #0 {
%call = tail call i32 null(i32 1)		%call = tail call i32 null(i32 1)
ret i32 %call		ret i32 %call
}		}

llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s			; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s
				; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -enable-var-scope %s

	; Callee with SGPR and VGPR arguments			; Callee with SGPR and VGPR arguments
	define hidden amdgpu_gfx float @callee(float %v.arg0, float inreg %s.arg1) {			define hidden amdgpu_gfx float @callee(float %v.arg0, float inreg %s.arg1) {
	; GCN-LABEL: callee:			; GCN-LABEL: callee:
	; GCN: ; %bb.0:			; GCN: ; %bb.0:
	; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_add_f32_e32 v0, s4, v0			; GCN-NEXT: v_add_f32_e32 v0, s4, v0
	; GCN-NEXT: s_setpc_b64 s[30:31]			; GCN-NEXT: s_setpc_b64 s[30:31]
	Show All 18 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU/GlobalISel: Implement tail calls (Closed, Public)

Details

Diff Detail

Event Timeline