diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -62,6 +62,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/PatternMatch.h"
@@ -2380,6 +2381,8 @@
     Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
     return CLI->lowerCall(MIRBuilder, Info);
   }
+  case Intrinsic::amdgcn_cs_chain:
+    return translateCallBase(CI, MIRBuilder);
   case Intrinsic::fptrunc_round: {
     uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -76,6 +76,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -7362,6 +7363,54 @@
     setValue(&I, Val);
     return;
   }
+  case Intrinsic::amdgcn_cs_chain: {
+    assert(I.arg_size() == 5 && "Additional args not supported yet");
+    assert(cast<ConstantInt>(I.getOperand(4))->isZero() &&
+           "Non-zero flags not supported yet");
+
+    // At this point we don't care if it's amdgpu_cs_chain or
+    // amdgpu_cs_chain_preserve.
+    CallingConv::ID CC = CallingConv::AMDGPU_CS_Chain;
+
+    Type *RetTy = I.getType();
+    assert(RetTy->isVoidTy() && "Should not return");
+
+    SDValue Callee = getValue(I.getOperand(0));
+
+    // We only have 2 actual args: one for the SGPRs and one for the VGPRs.
+    TargetLowering::ArgListTy Args;
+    Args.reserve(2);
+
+    for (unsigned i : {2, 3}) {
+      TargetLowering::ArgListEntry Arg;
+      Arg.Node = getValue(I.getOperand(i));
+      Arg.Ty = I.getOperand(i)->getType();
+      Arg.setAttributes(&I, i);
+      Args.push_back(Arg);
+    }
+
+    Args[0].IsInReg = true; // Make sure these go in the SGPRs.
+    assert(!Args[1].IsInReg && "VGPR args should not be marked inreg");
+
+    // TODO: Set exec.
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(getRoot())
+        .setCallee(CC, RetTy, Callee, std::move(Args))
+        .setNoReturn(true)
+        .setTailCall(true)
+        .setConvergent(I.isConvergent());
+    CLI.CB = &I;
+    std::pair<SDValue, SDValue> Result =
+        lowerInvokable(CLI, /*EHPadBB*/ nullptr);
+    (void)Result;
+    assert(!Result.first.getNode() && !Result.second.getNode() &&
+           "Should've lowered as tail call");
+
+    HasTailCall = true;
+    return;
+  }
   case Intrinsic::ptrmask: {
     SDValue Ptr = getValue(I.getOperand(0));
     SDValue Const = getValue(I.getOperand(1));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -75,10 +75,13 @@
   void handleImplicitCallArguments(
       MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
       const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
+      CallingConv::ID CalleeCC,
      ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;
 
   bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
                      SmallVectorImpl<ArgInfo> &OutArgs) const;
 
+  bool lowerChainCall(MachineIRBuilder &MIRBuilder,
+                      CallLoweringInfo &Info) const;
   bool lowerCall(MachineIRBuilder &MIRBuilder,
                  CallLoweringInfo &Info) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -959,8 +959,10 @@
 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                               bool IsTailCall, CallingConv::ID CC) {
-  assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
-                                        "because the address can be divergent");
+  // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
+  assert((AMDGPU::isChainCC(CC) || !(IsIndirect && IsTailCall)) &&
+         "Indirect calls can't be tail calls, "
+         "because the address can be divergent");
   if (!IsTailCall)
     return AMDGPU::G_SI_CALL;
 
@@ -1151,14 +1153,20 @@
 void AMDGPUCallLowering::handleImplicitCallArguments(
     MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
     const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
+    CallingConv::ID CalleeCC,
     ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
   if (!ST.enableFlatScratch()) {
     // Insert copies for the SRD. In the HSA case, this should be an identity
     // copy.
     auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
                                                FuncInfo.getScratchRSrcReg());
-    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-    CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+
+    auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
+                             ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+                             : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+    MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
+    CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
   }
 
   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
@@ -1254,7 +1262,8 @@
   // after the ordinary user argument registers.
   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
 
-  if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+      !AMDGPU::isChainCC(Info.CallConv)) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
       return false;
   }
 
@@ -1270,7 +1279,8 @@
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
+                              ImplicitArgRegs);
 
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   // sequence start and end here.
@@ -1304,8 +1314,61 @@
   return true;
 }
 
+/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
+bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
+                                        CallLoweringInfo &Info) const {
+  ArgInfo Callee = Info.OrigArgs[0];
+  ArgInfo Exec = Info.OrigArgs[1];
+  ArgInfo SGPRArgs = Info.OrigArgs[2];
+  ArgInfo VGPRArgs = Info.OrigArgs[3];
+  ArgInfo Flags = Info.OrigArgs[4];
+
+  assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
+         "Non-zero flags aren't supported yet.");
+  assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = MF.getFunction();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // The function to jump to is actually the first argument, so we'll change the
+  // Callee and other info to match that before using our existing helper.
+  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
+  if (const Function *F = dyn_cast<Function>(CalleeV)) {
+    Info.Callee = MachineOperand::CreateGA(F, 0);
+    Info.CallConv = F->getCallingConv();
+  } else {
+    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
+    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
+    Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
+                                                  // behaves the same here.
+  }
+
+  // The function that we're calling cannot be vararg (only the intrinsic is).
+  Info.IsVarArg = false;
+
+  // Mark SGPRArgs with `inreg`.
+  for (auto &F : SGPRArgs.Flags)
+    F.setInReg();
+  assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
+                      [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "VGPR arguments should not be marked inreg");
+
+  SmallVector<ArgInfo, 8> OutArgs;
+  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
+  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
+
+  return lowerTailCall(MIRBuilder, Info, OutArgs);
+  // TODO: Copy %exec into EXEC
+}
+
 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                    CallLoweringInfo &Info) const {
+  if (Function *F = Info.CB->getCalledFunction())
+    if (F->isIntrinsic())
+      return F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
+             lowerChainCall(MIRBuilder, Info);
+
   if (Info.IsVarArg) {
     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
     return false;
@@ -1393,7 +1456,8 @@
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
+                              ImplicitArgRegs);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getStackSize();
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3078,6 +3078,9 @@
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     const SmallVectorImpl<SDValue> &OutVals,
     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (AMDGPU::isChainCC(CalleeCC))
+    return true;
+
   if (!mayTailCallThisCC(CalleeCC))
     return false;
 
@@ -3174,6 +3177,7 @@
   bool IsVarArg = CLI.IsVarArg;
   bool IsSibCall = false;
   bool IsThisReturn = false;
+  bool IsChain = AMDGPU::isChainCC(CallConv);
   MachineFunction &MF = DAG.getMachineFunction();
 
   if (Callee.isUndef() || isNullConstant(Callee)) {
@@ -3226,7 +3230,7 @@
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  if (CallConv != CallingConv::AMDGPU_Gfx) {
+  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
   }
@@ -3252,16 +3256,19 @@
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  if (!IsSibCall) {
+  if (!IsSibCall)
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
 
+  if (!IsSibCall || IsChain) {
     if (!Subtarget->enableFlatScratch()) {
      SmallVector<SDValue, 4> CopyFromChains;
 
       // In the HSA case, this should be an identity copy.
       SDValue ScratchRSrcReg =
           DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
-      RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+      RegsToPass.emplace_back(IsChain ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+                                      : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
+                              ScratchRSrcReg);
       CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
       Chain = DAG.getTokenFactor(DL, CopyFromChains);
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -421,6 +421,10 @@
   case CallingConv::AMDGPU_Gfx:
     return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                                : CSR_AMDGPU_SI_Gfx_RegMask;
+  case CallingConv::AMDGPU_CS_Chain:
+  case CallingConv::AMDGPU_CS_ChainPreserve:
+    // TODO: This is a lie, we need a new CSR.
+    return CSR_AMDGPU_RegMask;
   default:
     return nullptr;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu-cs-chain.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu-cs-chain.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX11
+; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1030 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX10
+
+declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
+declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
noreturn + +define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GFX11-LABEL: name: chain_call + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX11-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX11-NEXT: SI_TCRETURN [[GV1]](p0), @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GFX10-LABEL: name: chain_call + ; GFX10: bb.1 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX10-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>) + ; GFX10-NEXT: SI_TCRETURN [[GV1]](p0), @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GFX11-LABEL: name: chain_preserve_call + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX11-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX11-NEXT: SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GFX10-LABEL: name: chain_preserve_call + ; GFX10: bb.1 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX10-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>) + ; GFX10-NEXT: SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, 
implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + + diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-intrinsic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-intrinsic.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s + +declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) +declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) +declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
noreturn + +define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: chain_to_chain + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX10-LABEL: name: chain_to_chain + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX11-LABEL: name: chain_to_chain + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, 
$vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; DAGISEL-GFX10-LABEL: name: chain_to_chain + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: cs_to_chain + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX10-LABEL: name: cs_to_chain + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX11-LABEL: name: cs_to_chain + ; DAGISEL-GFX11: 
bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; DAGISEL-GFX10-LABEL: name: cs_to_chain + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, csr_amdgpu, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: chain_to_chain_preserve + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, 
implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr 
addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: cs_to_chain_preserve + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit 
$vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, csr_amdgpu, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, 
{ i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: indirect + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX11-NEXT: SI_TCRETURN [[REG_SEQUENCE]], 0, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX10-LABEL: name: indirect + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; GISEL-GFX10-NEXT: SI_TCRETURN [[REG_SEQUENCE]], 0, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX11-LABEL: name: indirect + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, csr_amdgpu, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; DAGISEL-GFX10-LABEL: name: indirect + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, csr_amdgpu, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +}
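
Usage sketch (not part of the patch; the function names below are illustrative): the tests above all follow the same pattern, which the intrinsic's lowering in this patch implements. The operands are, in order: the callee, the EXEC mask to install for it (i32 in these tests; -1 enables all lanes), the arguments destined for SGPRs (which the lowering marks inreg), the arguments destined for VGPRs, and a flags word that both the GlobalISel and SelectionDAG paths currently assert to be zero. The call is lowered as a tail call and never returns, so it must be followed by unreachable:

  declare amdgpu_cs_chain void @next_stage(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
  declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn

  define amdgpu_cs_chain void @hand_off(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) {
    ; Jump to @next_stage with all lanes enabled, %sgpr in SGPRs and %vgpr in VGPRs.
    call void (ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @next_stage, i32 -1, <3 x i32> %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0)
    unreachable
  }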