diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -62,6 +62,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" @@ -2372,6 +2373,8 @@ Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0}; return CLI->lowerCall(MIRBuilder, Info); } + case Intrinsic::amdgcn_cs_chain: + return translateCallBase(CI, MIRBuilder); case Intrinsic::fptrunc_round: { uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -76,6 +76,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -7380,6 +7381,59 @@ setValue(&I, Val); return; } + case Intrinsic::amdgcn_cs_chain: { + assert(I.arg_size() == 5 && "Additional args not supported yet"); + assert(cast(I.getOperand(4))->isZero() && + "Non-zero flags not supported yet"); + + // At this point we don't care if it's amdgpu_cs_chain or + // amdgpu_cs_chain_preserve. + CallingConv::ID CC = CallingConv::AMDGPU_CS_Chain; + + Type *RetTy = I.getType(); + assert(RetTy->isVoidTy() && "Should not return"); + + SDValue Callee = getValue(I.getOperand(0)); + + // We only have 2 actual args: one for the SGPRs and one for the VGPRs. 
+ TargetLowering::ArgListTy Args; + Args.reserve(2); + + for (unsigned Idx : {2, 3}) { + TargetLowering::ArgListEntry Arg; + Arg.Node = getValue(I.getOperand(Idx)); + Arg.Ty = I.getOperand(Idx)->getType(); + Arg.setAttributes(&I, Idx); + Args.push_back(Arg); + } + + assert(Args[0].IsInReg && "SGPR args should be marked inreg"); + assert(!Args[1].IsInReg && "VGPR args should not be marked inreg"); + + // We're also going to pass the EXEC mask as the last argument. + TargetLowering::ArgListEntry Arg; + Arg.Node = getValue(I.getOperand(1)); + Arg.Ty = I.getOperand(1)->getType(); + Arg.IsInReg = true; + Args.push_back(Arg); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(getCurSDLoc()) + .setChain(getRoot()) + .setCallee(CC, RetTy, Callee, std::move(Args)) + .setNoReturn(true) + .setTailCall(true) + .setConvergent(I.isConvergent()); + CLI.CB = &I; + std::pair Result = + lowerInvokable(CLI, /*EHPadBB*/ nullptr); + (void)Result; + assert(!Result.first.getNode() && !Result.second.getNode() && + "Should've lowered as tail call"); + + HasTailCall = true; + return; + } case Intrinsic::ptrmask: { SDValue Ptr = getValue(I.getOperand(0)); SDValue Const = getValue(I.getOperand(1)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -75,10 +75,13 @@ void handleImplicitCallArguments( MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI, + CallingConv::ID CalleeCC, ArrayRef> ImplicitArgRegs) const; bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, SmallVectorImpl &OutArgs) const; + bool lowerChainCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const; bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -956,8 +956,10 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall, CallingConv::ID CC) { - assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, " - "because the address can be divergent"); + // For calls to amdgpu_cs_chain functions, the address is known to be uniform. + assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) && + "Indirect calls can't be tail calls, " + "because the address can be divergent"); if (!IsTailCall) return AMDGPU::G_SI_CALL; @@ -1148,14 +1150,20 @@ void AMDGPUCallLowering::handleImplicitCallArguments( MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst, const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo, + CallingConv::ID CalleeCC, ArrayRef> ImplicitArgRegs) const { if (!ST.enableFlatScratch()) { // Insert copies for the SRD. In the HSA case, this should be an identity // copy. auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32), FuncInfo.getScratchRSrcReg()); - MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); - CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit); + + auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC) + ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 + : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + + MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg); + CallInst.addReg(CalleeRSrcReg, RegState::Implicit); } for (std::pair ArgReg : ImplicitArgRegs) { @@ -1251,7 +1259,8 @@ // after the ordinary user argument registers. SmallVector, 12> ImplicitArgRegs; - if (Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx && + !AMDGPU::isChainCC(Info.CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. 
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false;
@@ -1267,7 +1276,8 @@
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
+                              ImplicitArgRegs);
 
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   // sequence start and end here.
@@ -1281,6 +1291,23 @@
     MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
   }
 
+  // If this is a chain call, we need to set EXEC right before the call.
+  if (AMDGPU::isChainCC(Info.CallConv)) {
+    ArgInfo ExecArg = Info.OrigArgs[1];
+    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
+
+    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
+      return false;
+
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    MCRegister Exec = TRI->getExec();
+    auto SetExec =
+        MIRBuilder.buildInstr(MovOpc).addDef(Exec).addReg(ExecArg.Regs[0]);
+    SetExec->getOperand(1).setReg(constrainOperandRegClass(
+        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *SetExec,
+        SetExec->getDesc(), SetExec->getOperand(1), 1));
+  }
+
   // Now we can add the actual call instruction to the correct basic block.
   MIRBuilder.insertInstr(MIB);
 
@@ -1301,8 +1328,68 @@
   return true;
 }
 
+/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
+///
+/// The intrinsic's operands are, in order: the callee, the EXEC mask, the SGPR
+/// arguments, the VGPR arguments and a flags value. \p Info is rewritten in
+/// place to describe a call to the callee operand and is then lowered as a
+/// tail call that passes only the SGPR and VGPR arguments. The EXEC operand is
+/// not forwarded as an argument; lowerTailCall reads it from Info.OrigArgs[1].
+///
+/// \returns the result of lowerTailCall for the rewritten \p Info.
+bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
+                                        CallLoweringInfo &Info) const {
+  // Validate the operand count before indexing into OrigArgs.
+  assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
+  assert(cast<ConstantInt>(Info.OrigArgs[4].OrigValue)->isZero() &&
+         "Non-zero flags aren't supported yet.");
+
+  // OrigArgs is not resized in this function, so binding by reference is safe
+  // and avoids copying the Regs and Flags containers held by each ArgInfo.
+  const ArgInfo &Callee = Info.OrigArgs[0];
+  const ArgInfo &SGPRArgs = Info.OrigArgs[2];
+  const ArgInfo &VGPRArgs = Info.OrigArgs[3];
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
+
+  // The function to jump to is actually the first argument, so we'll change the
+  // Callee and other info to match that before using our existing helper.
+  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
+  if (const Function *CalleeF = dyn_cast<Function>(CalleeV)) {
+    Info.Callee = MachineOperand::CreateGA(CalleeF, 0);
+    Info.CallConv = CalleeF->getCallingConv();
+  } else {
+    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
+    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
+    // amdgpu_cs_chain_preserve behaves the same here.
+    Info.CallConv = CallingConv::AMDGPU_CS_Chain;
+  }
+
+  // The function that we're calling cannot be vararg (only the intrinsic is).
+  Info.IsVarArg = false;
+
+  assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
+                     [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "SGPR arguments should be marked inreg");
+  assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
+                      [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "VGPR arguments should not be marked inreg");
+
+  SmallVector<ArgInfo, 8> OutArgs;
+  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
+  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
+
+  return lowerTailCall(MIRBuilder, Info, OutArgs);
+}
+
 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                    CallLoweringInfo &Info) const {
+  if (Function *F = Info.CB->getCalledFunction())
+    if (F->isIntrinsic())
+      return F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
+             lowerChainCall(MIRBuilder, Info);
+
   if (Info.IsVarArg) {
     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
     return false;
@@ -1393,7 +1480,8 @@
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
+                              ImplicitArgRegs);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getStackSize();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3088,6 +3088,9 @@
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     const SmallVectorImpl<SDValue> &OutVals,
     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (AMDGPU::isChainCC(CalleeCC))
+    return true;
+
   if (!mayTailCallThisCC(CalleeCC))
     return false;
 
@@ -3172,7 +3175,36 @@ // The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { + CallingConv::ID CallConv = CLI.CallConv; + bool IsChain = AMDGPU::isChainCC(CallConv); + SelectionDAG &DAG = CLI.DAG; + + TargetLowering::ArgListEntry RequestedExec; + if (IsChain) { + // The last argument should be the value that we need to put in EXEC. + // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we + // don't treat it like the rest of the arguments. + RequestedExec = CLI.Args.back(); + assert(RequestedExec.Node && "No node for EXEC"); + + if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) + return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC"); + + assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg"); + CLI.Outs.pop_back(); + CLI.OutVals.pop_back(); + + if (RequestedExec.Ty->isIntegerTy(64)) { + assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up"); + CLI.Outs.pop_back(); + CLI.OutVals.pop_back(); + } + + assert(CLI.Outs.back().OrigArgIndex != 2 && + "Haven't popped all the pieces of the EXEC mask"); + } + const SDLoc &DL = CLI.DL; SmallVector &Outs = CLI.Outs; SmallVector &OutVals = CLI.OutVals; @@ -3180,7 +3212,6 @@ SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; bool IsSibCall = false; bool IsThisReturn = false; @@ -3236,7 +3267,7 @@ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (CallConv != CallingConv::AMDGPU_Gfx) { + if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3262,16 +3293,19 @@ // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) { + if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); + if (!IsSibCall || IsChain) { if (!Subtarget->enableFlatScratch()) { SmallVector CopyFromChains; // In the HSA case, this should be an identity copy. SDValue ScratchRSrcReg = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); - RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + RegsToPass.emplace_back(IsChain ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 + : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, + ScratchRSrcReg); CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); Chain = DAG.getTokenFactor(DL, CopyFromChains); } @@ -3397,6 +3431,15 @@ InGlue = Chain.getValue(1); } + auto *TRI = static_cast(Subtarget->getRegisterInfo()); + + if (IsChain) { + // Set EXEC right before the call. + MCRegister ExecReg = TRI->getExec(); + Chain = DAG.getCopyToReg(Chain, DL, ExecReg, RequestedExec.Node, InGlue); + InGlue = Chain.getValue(1); + } + std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); @@ -3425,7 +3468,6 @@ // Add a register mask operand representing the call-preserved registers. - auto *TRI = static_cast(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -421,6 +421,11 @@ case CallingConv::AMDGPU_Gfx: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask : CSR_AMDGPU_SI_Gfx_RegMask; + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: + // Calls to these functions never return, so we can pretend everything is + // preserved. 
+ return AMDGPU_AllVGPRs_RegMask; default: return nullptr; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu-cs-chain.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu-cs-chain.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX11 +; RUN: llc --global-isel=1 -march=amdgcn -mcpu=gfx1030 -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=GFX10 + +declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) +declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) +declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
noreturn + +define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GFX11-LABEL: name: chain_call + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee + ; GFX11-NEXT: [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1 + ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX11-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[C]](s32) + ; GFX11-NEXT: SI_TCRETURN [[GV1]](p0), @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GFX10-LABEL: name: chain_call + ; GFX10: bb.1 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = 
COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee + ; GFX10-NEXT: [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1 + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX10-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>) + ; GFX10-NEXT: $exec_lo = S_MOV_B32 [[C]](s32) + ; GFX10-NEXT: SI_TCRETURN [[GV1]](p0), @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GFX11-LABEL: name: chain_preserve_call + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX11-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX11-NEXT: [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1 + ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX11-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[C]](s32) + ; GFX11-NEXT: SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GFX10-LABEL: name: chain_preserve_call + ; GFX10: bb.1 (%ir-block.0): + ; GFX10-NEXT: liveins: 
$sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(p5) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX10-NEXT: [[C:%[0-9]+]]:sreg_32(s32) = G_CONSTANT i32 -1 + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) + ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32) + ; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5) + ; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32) + ; GFX10-NEXT: $vgpr11 = COPY [[COPY6]](s32) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]](<4 x s32>) + ; GFX10-NEXT: $exec_lo = S_MOV_B32 [[C]](s32) + ; GFX10-NEXT: SI_TCRETURN [[GV1]](p0), @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + + diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -31,6 +31,8 @@ ret void } +; FIXME: Setup s32. + define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, <4 x i32> %vgpr) { ; GISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call: ; GISEL-GFX11: ; %bb.0: @@ -374,3 +376,425 @@ call amdgpu_gfx void @use(<24 x i32> %sgprs, <24 x i32> %vgprs) ret void } + +define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: cs_to_chain: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: cs_to_chain: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_mov_b32 s100, SCRATCH_RSRC_DWORD0 +; GISEL-GFX10-NEXT: s_mov_b32 s101, SCRATCH_RSRC_DWORD1 +; GISEL-GFX10-NEXT: s_mov_b32 s102, -1 +; GISEL-GFX10-NEXT: s_mov_b32 s103, 0x31c16000 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GISEL-GFX10-NEXT: s_add_u32 s100, s100, s3 +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART 
+; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: cs_to_chain: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: cs_to_chain: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_mov_b32 s100, SCRATCH_RSRC_DWORD0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s101, SCRATCH_RSRC_DWORD1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s102, -1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s103, 0x31c16000 +; DAGISEL-GFX10-NEXT: s_add_u32 s100, s100, s3 +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; 
DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v9, v1 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v10, v2 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_to_chain: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_to_chain: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: 
s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_to_chain: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_to_chain: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: 
s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_to_chain_fewer_args: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[2:3] +; GISEL-GFX11-NEXT: s_add_u32 s2, s2, chain_callee_2@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s3, s3, chain_callee_2@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[2:3] +; +; GISEL-GFX10-LABEL: chain_to_chain_fewer_args: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s2 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[2:3] +; GISEL-GFX10-NEXT: s_add_u32 s2, s2, chain_callee_2@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s3, s3, chain_callee_2@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[2:3] +; +; DAGISEL-GFX11-LABEL: chain_to_chain_fewer_args: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, 
chain_callee_2@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee_2@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s2 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_to_chain_fewer_args: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee_2@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee_2@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s2 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> + %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <2 x i32> + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v2i32(ptr @chain_callee_2, i32 -1, <2 x i32> inreg %s, <2 x i32> %v, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_to_chain_more_args: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_mov_b32 s3, 0 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee_2@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee_2@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_to_chain_more_args: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: s_mov_b32 s3, 0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee_2@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee_2@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_to_chain_more_args: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee_2@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee_2@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, 0 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v11, 0 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_to_chain_more_args: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee_2@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee_2@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, 0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, 0 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <4 x i32> + %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <4 x i32> + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <4 x i32>, <4 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v4i32(ptr @chain_callee_2, i32 -1, <4 x i32> inreg %s, <4 x i32> %v, i32 0) + unreachable +} + +declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) +declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) +declare void @llvm.amdgcn.cs.chain.v4i32(ptr, i32, <4 x i32>, <4 x i32>, i32, ...) +declare amdgpu_cs_chain void @chain_callee_2(<2 x i32> inreg, <2 x i32>) +declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>) +declare amdgpu_cs_chain void @chain_callee_4(<4 x i32> inreg, <4 x i32>) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -4,8 +4,6 @@ ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s -declare amdgpu_gfx void @use(...) - ; FIXME: The values of the counters are undefined on entry to amdgpu_cs_chain_preserve functions, so these waits are unnecessary. 
define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_no_stack({ptr, i32, <4 x i32>} inreg %a, {ptr, i32, <4 x i32>} %b) { @@ -30,3 +28,502 @@ ; DAGISEL-GFX10-NEXT: s_endpgm ret void } + +define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: cs_to_chain_preserve: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: cs_to_chain_preserve: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_mov_b32 s100, SCRATCH_RSRC_DWORD0 +; GISEL-GFX10-NEXT: s_mov_b32 s101, SCRATCH_RSRC_DWORD1 +; GISEL-GFX10-NEXT: s_mov_b32 s102, -1 +; GISEL-GFX10-NEXT: s_mov_b32 s103, 0x31c16000 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GISEL-GFX10-NEXT: s_add_u32 s100, s100, s3 +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, 
s4, chain_preserve_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: cs_to_chain_preserve: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v10, v2 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: cs_to_chain_preserve: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_mov_b32 s100, SCRATCH_RSRC_DWORD0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s101, SCRATCH_RSRC_DWORD1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s102, -1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s103, 0x31c16000 +; DAGISEL-GFX10-NEXT: s_add_u32 s100, s100, s3 +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v3, v0 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: 
s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v9, v1 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v10, v2 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_to_chain_preserve: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_to_chain_preserve: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, 
chain_preserve_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_to_chain_preserve: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_to_chain_preserve: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +; FIXME: Preserve things (i.e. v16)! +; FIXME: Setup s32. + +define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_preserve_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_preserve_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v3i32(ptr @chain_preserve_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_preserve_to_chain: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_preserve_to_chain: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_preserve_to_chain: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, 
chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_preserve_to_chain: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; GISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; 
DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_callee@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_callee@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void asm "s_nop", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + +define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_args(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s0, s2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_getpc_b64 s[2:3] +; GISEL-GFX11-NEXT: s_add_u32 s2, s2, chain_preserve_callee_2@gotpcrel32@lo+4 +; GISEL-GFX11-NEXT: s_addc_u32 s3, s3, chain_preserve_callee_2@gotpcrel32@hi+12 +; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_setpc_b64 s[2:3] +; +; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s0, s2 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_getpc_b64 s[2:3] +; GISEL-GFX10-NEXT: s_add_u32 s2, s2, chain_preserve_callee_2@gotpcrel32@lo+4 +; GISEL-GFX10-NEXT: s_addc_u32 s3, s3, chain_preserve_callee_2@gotpcrel32@hi+12 +; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_setpc_b64 s[2:3] +; +; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, chain_preserve_callee_2@gotpcrel32@lo+4 +; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, chain_preserve_callee_2@gotpcrel32@hi+12 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s2 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[4:5] +; DAGISEL-GFX10-NEXT: s_add_u32 s4, s4, chain_preserve_callee_2@gotpcrel32@lo+4 +; DAGISEL-GFX10-NEXT: s_addc_u32 s5, s5, chain_preserve_callee_2@gotpcrel32@hi+12 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; DAGISEL-GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s2 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + %s = shufflevector <3 x i32> %a, <3 x i32> zeroinitializer, <2 x i32> + %v = shufflevector <3 x i32> %b, <3 x i32> zeroinitializer, <2 x i32> + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + call void(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v2i32(ptr @chain_preserve_callee_2, i32 -1, <2 x i32> inreg %s, <2 x i32> %v, i32 0) + unreachable +} + +; Note that amdgpu_cs_chain_preserve functions are not allowed to call +; llvm.amdgcn.cs.chain with more vgpr args than they received as parameters. + +declare void @llvm.amdgcn.cs.chain.v3i32(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) +declare amdgpu_cs_chain_preserve void @chain_preserve_callee(<3 x i32> inreg, <3 x i32>) +declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>) + +declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) +declare amdgpu_cs_chain_preserve void @chain_preserve_callee_2(<2 x i32> inreg, <2 x i32>) diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-w32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-w32.ll @@ -0,0 +1,637 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s + +declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) +declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { 
i32, ptr addrspace(5), i32, i32 }) +declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) noreturn + +define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: chain_to_chain + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: chain_to_chain + ; GISEL-GFX10: 
bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX10-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: chain_to_chain + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; 
DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: chain_to_chain + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: cs_to_chain + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: cs_to_chain + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; 
GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX10-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: cs_to_chain + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; 
DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: cs_to_chain + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; 
DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: chain_to_chain_preserve + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: 
chain_to_chain_preserve + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX10-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: 
$sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; 
DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: cs_to_chain_preserve + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: 
cs_to_chain_preserve + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX10-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, 
$sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX10-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee_preserve, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: indirect + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX11-NEXT: SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: indirect + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; 
GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GISEL-GFX10-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] + ; GISEL-GFX10-NEXT: SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: indirect + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: indirect + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec_lo = COPY [[S_MOV_B32_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr %callee, i32 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: non_imm_exec + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY7]] + ; GISEL-GFX11-NEXT: $exec_lo = S_MOV_B32 [[COPY]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: non_imm_exec + ; GISEL-GFX10: bb.1 
(%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]] + ; GISEL-GFX10-NEXT: $exec_lo = S_MOV_B32 [[COPY]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: non_imm_exec + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: 
{{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec_lo = COPY [[COPY7]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: non_imm_exec + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; 
DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec_lo = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-w64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-intrinsic-w64.ll @@ -0,0 +1,645 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s + +declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) +declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) +declare void @llvm.amdgcn.cs.chain(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
noreturn + +define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: chain_to_chain + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX11-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: chain_to_chain + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; 
GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX10-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: chain_to_chain + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; 
DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: chain_to_chain + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; 
DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: cs_to_chain + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX11-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: cs_to_chain + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; 
GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX10-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: cs_to_chain + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; 
DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: cs_to_chain + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX10-NEXT: 
[[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: chain_to_chain_preserve + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX11-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: 
chain_to_chain_preserve + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX10-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: 
$sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: chain_to_chain_preserve + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; 
DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: cs_to_chain_preserve + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX11-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: 
cs_to_chain_preserve + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX10-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, 
$sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: cs_to_chain_preserve + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-GFX10-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee_preserve + 4, target-flags(amdgpu-gotprel32-hi) @callee_preserve + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee_preserve, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr @callee_preserve, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: indirect + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX11-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX11-NEXT: SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: indirect + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; 
GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; GISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; GISEL-GFX10-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]] + ; GISEL-GFX10-NEXT: SI_TCRETURN [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: indirect + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: indirect + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec = COPY [[S_MOV_B64_]] + ; DAGISEL-GFX10-NEXT: SI_TCRETURN killed [[REG_SEQUENCE]], 0, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain(ptr %callee, i64 -1, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +} + +define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { + ; GISEL-GFX11-LABEL: name: non_imm_exec + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX11-NEXT: $exec = S_MOV_B64 [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX11-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, 
implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; GISEL-GFX10-LABEL: name: non_imm_exec + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY8]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; GISEL-GFX10-NEXT: $exec = S_MOV_B64 [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GISEL-GFX10-NEXT: SI_TCRETURN [[S_LOAD_DWORDX2_IMM]], @callee, 0, 
amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: non_imm_exec + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: $exec = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX11-NEXT: SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr0, 
implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; + ; DAGISEL-GFX10-LABEL: name: non_imm_exec + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr11 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; DAGISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:ccr_sgpr_64 = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] + ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] + ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: $exec = COPY [[REG_SEQUENCE]] + ; DAGISEL-GFX10-NEXT: 
SI_TCRETURN killed [[S_LOAD_DWORDX2_IMM]], @callee, 0, amdgpu_allvgprs, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + call void(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr @callee, i64 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 0) + unreachable +}