diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -117,6 +117,8 @@
     /// vreg that the swifterror should be copied into after the call.
     Register SwiftErrorVReg;
 
+    Register ConvergenceCtrlToken;
+
     /// Original IR callsite corresponding to this call, if available.
     const CallBase *CB = nullptr;
@@ -580,6 +582,7 @@
   bool lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &Call,
                  ArrayRef<Register> ResRegs,
                  ArrayRef<ArrayRef<Register>> ArgRegs, Register SwiftErrorVReg,
+                 Register ConvergenceCtrlToken,
                  std::function<unsigned()> GetCalleeReg) const;
 
   /// For targets which want to use big-endian can enable it with
diff --git a/llvm/include/llvm/CodeGen/LowLevelType.h b/llvm/include/llvm/CodeGen/LowLevelType.h
--- a/llvm/include/llvm/CodeGen/LowLevelType.h
+++ b/llvm/include/llvm/CodeGen/LowLevelType.h
@@ -45,6 +45,14 @@
                /*AddressSpace=*/0};
   }
 
+  /// Get a low-level token; just a scalar with zero bits (or no size).
+  static constexpr LLT token() {
+    return LLT{/*isPointer=*/false,
+               /*isVector=*/false,
+               /*isScalar=*/true, ElementCount::getFixed(0), 0,
+               /*AddressSpace=*/0};
+  }
+
   /// Get a low-level pointer in the given address space.
   static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
     assert(SizeInBits > 0 && "invalid pointer size");
@@ -289,6 +297,28 @@
   /// described in static const *Field variables. Each of these variables
   /// is a 2-element array, with the first element describing the bitfield size
   /// and the second element describing the bitfield offset.
+  ///
+  /// +--------+---------+--------+----------+----------------------+
+  /// |isScalar|isPointer|isVector| RawData  |Notes                 |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   0    |    0     |Invalid               |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   1    |    0     |Tombstone Key         |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   0    |    0     |Empty Key             |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   1    |    0    |   0    |    0     |Token                 |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   1    |    0    |   0    | non-zero |Scalar                |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   0    | non-zero |Pointer               |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   1    | non-zero |Vector of non-pointer |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   1    | non-zero |Vector of pointer     |
+  /// +--------+---------+--------+----------+----------------------+
+  ///
+  /// Everything else is reserved.
   typedef int BitFieldInfo[2];
   ///
   /// This is how the bitfields are packed per Kind:
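
The table above encodes a token as the scalar kind with a zero-size RawData field, so no extra flag bit is needed. As a minimal sketch of what that means in practice (the helper name below is invented for illustration and is not part of the patch): a token LLT compares equal to LLT::token() and to nothing else, since every valid scalar, pointer, or vector carries a non-zero RawData field; in MIR the token is printed and parsed as "s0", which is what the MIParser change further down accepts.

#include "llvm/CodeGen/LowLevelType.h"

// Hypothetical helper, shown only to illustrate the encoding: comparing
// against LLT::token() uniquely identifies the zero-sized token type.
static bool isTokenLLT(llvm::LLT Ty) {
  return Ty == llvm::LLT::token();
}
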
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
@@ -87,10 +88,20 @@
   });
 }
 
+static bool isEntryTokenIfPresent(const CallBase &CB) {
+  auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl);
+  if (!Bundle)
+    return true;
+  auto *Token = Bundle->Inputs[0].get();
+  auto *Def = cast<IntrinsicInst>(Token);
+  return Def->getIntrinsicID() == Intrinsic::experimental_convergence_entry;
+}
+
 bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
                              ArrayRef<Register> ResRegs,
                              ArrayRef<ArrayRef<Register>> ArgRegs,
                              Register SwiftErrorVReg,
+                             Register ConvergenceCtrlToken,
                              std::function<unsigned()> GetCalleeReg) const {
   CallLoweringInfo Info;
   const DataLayout &DL = MIRBuilder.getDataLayout();
@@ -119,6 +130,8 @@
       CanBeTailCalled = false;
   }
 
+  if (!isEntryTokenIfPresent(CB))
+    CanBeTailCalled = false;
 
   // First step is to marshall all the function's parameters into the correct
   // physregs and memory locations. Gather the sequence of argument types that
@@ -174,6 +187,7 @@
   Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
   Info.CallConv = CallConv;
   Info.SwiftErrorVReg = SwiftErrorVReg;
+  Info.ConvergenceCtrlToken = ConvergenceCtrlToken;
   Info.IsMustTailCall = CB.isMustTailCall();
   Info.IsTailCall = CanBeTailCalled;
   Info.IsVarArg = IsVarArg;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -210,7 +210,7 @@
   auto *VRegs = VMap.getVRegs(Val);
   auto *Offsets = VMap.getOffsets(Val);
 
-  assert(Val.getType()->isSized() &&
+  assert((Val.getType()->isSized() || Val.getType()->isTokenTy()) &&
          "Don't know how to create an empty vreg");
 
   SmallVector<LLT, 4> SplitTys;
@@ -2456,12 +2456,18 @@
     }
   }
 
+  Register ConvergenceCtrlToken = 0;
+  if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+    auto *Token = Bundle->Inputs[0].get();
+    ConvergenceCtrlToken = getOrCreateVReg(*Token);
+  }
+
   // We don't set HasCalls on MFI here yet because call lowering may decide to
   // optimize into tail calls. Instead, we defer that to selection where a final
   // scan is done to check if any instructions are calls.
-  bool Success =
-      CLI->lowerCall(MIRBuilder, CB, Res, Args, SwiftErrorVReg,
-                     [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
+  bool Success = CLI->lowerCall(
+      MIRBuilder, CB, Res, Args, SwiftErrorVReg, ConvergenceCtrlToken,
+      [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
 
   // Check if we just inserted a tail call.
   if (Success) {
@@ -2509,8 +2515,13 @@
 
   assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");
 
-  if (translateKnownIntrinsic(CI, ID, MIRBuilder))
+  // Note: Known intrinsics are target-independent, and not expected to be
+  // convergent. Hence we don't look for a convergencectrl operand bundle if we
+  // are calling a known intrinsic.
+  if (translateKnownIntrinsic(CI, ID, MIRBuilder)) {
+    assert(!CI.countOperandBundlesOfType(LLVMContext::OB_convergencectrl));
     return true;
+  }
 
   ArrayRef<Register> ResultRegs;
   if (!CI.getType()->isVoidTy())
@@ -2575,6 +2586,14 @@
         MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment,
                                  CI.getAAMetadata()));
   }
 
+  if (CI.isConvergent()) {
+    if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+      auto *Token = Bundle->Inputs[0].get();
+      Register TokenReg = getOrCreateVReg(*Token);
+      MIB.addUse(TokenReg, RegState::Implicit);
+    }
+  }
+
   return true;
 }
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -594,6 +594,14 @@
     }
   }
 
+  if (auto Bundle = Call.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+    auto *Token = Bundle->Inputs[0].get();
+    ArrayRef<Register> SourceRegs = GetOrCreateVRegs(*Token);
+    assert(SourceRegs.size() == 1 &&
+           "Expected the control token to fit into a single virtual register");
+    Inst.addUse(SourceRegs[0], RegState::Implicit);
+  }
+
   if (const MDNode *SrcLoc = Call.getMetadata("srcloc"))
     Inst.addMetadata(SrcLoc);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -119,8 +119,27 @@
 
   MIRBuilder.setInstrAndDebugLoc(MI);
 
-  if (isa<GIntrinsic>(MI))
+  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
+    auto ID = GI->getIntrinsicID();
+    assert(ID != Intrinsic::not_intrinsic);
+    switch (ID) {
+    default:
+      break;
+    case Intrinsic::experimental_convergence_anchor:
+    case Intrinsic::experimental_convergence_entry:
+    case Intrinsic::experimental_convergence_loop:
+      assert(MI.getNumDefs() == 1);
+      Register Token = MI.defs().begin()->getReg();
+      for (auto &Use : MRI.use_operands(Token)) {
+        auto *UserInstr = Use.getParent();
+        UserInstr->removeOperand(Use.getOperandNo());
+      }
+      MI.eraseFromParent();
+      return Legalized;
+    }
+    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
+  }
 
   auto Step = LI.getAction(MI, MRI);
   switch (Step.Action) {
   case Legal:
diff --git a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
--- a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
+++ b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
@@ -39,6 +39,10 @@
     return LLT::scalar(SizeInBits);
   }
 
+  if (Ty.isTokenTy()) {
+    return LLT::token();
+  }
+
   return LLT();
 }
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1916,10 +1916,13 @@
   if (Token.range().front() == 's') {
     auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
-    if (!verifyScalarSize(ScalarSize))
-      return error("invalid size for scalar type");
-
-    Ty = LLT::scalar(ScalarSize);
+    if (ScalarSize) {
+      if (!verifyScalarSize(ScalarSize))
+        return error("invalid size for scalar type");
+      Ty = LLT::scalar(ScalarSize);
+    } else {
+      Ty = LLT::token();
+    }
     lex();
     return false;
   } else if (Token.range().front() == 'p') {
@@ -1960,7 +1963,7 @@
   if (Token.range().front() == 's') {
     auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
     if (!verifyScalarSize(ScalarSize))
-      return error("invalid size for scalar type");
+      return error("invalid size for scalar element in vector");
     Ty = LLT::scalar(ScalarSize);
   } else if (Token.range().front() == 'p') {
     const DataLayout &DL = MF.getDataLayout();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1267,6 +1267,9 @@
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
+  if (Info.ConvergenceCtrlToken) {
+    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
+  }
   handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
 
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
@@ -1390,6 +1393,9 @@
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
+  if (Info.ConvergenceCtrlToken) {
+    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
+  }
   handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
 
   // Get a count of how many bytes are to be pushed on the stack.
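
The two AMDGPU hunks above show the intended consumption pattern for the new CallLoweringInfo::ConvergenceCtrlToken field: when a token was recorded, it is attached to the call instruction as an implicit use. A minimal sketch of that pattern for any other GlobalISel target follows; the helper name is invented for the example and nothing beyond the field itself comes from this patch.

#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

// Hypothetical helper: attach the convergence-control token (if any) to an
// already-built call as an implicit use, mirroring the AMDGPU code above.
static void addConvergenceTokenUse(llvm::MachineInstrBuilder &MIB,
                                   const llvm::CallLowering::CallLoweringInfo &Info) {
  if (Info.ConvergenceCtrlToken)
    MIB.addUse(Info.ConvergenceCtrlToken, llvm::RegState::Implicit);
}
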
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1022,6 +1022,10 @@
 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
   switch (IntrinsicID) {
+  case Intrinsic::experimental_convergence_anchor:
+  case Intrinsic::experimental_convergence_entry:
+  case Intrinsic::experimental_convergence_loop:
+    return true;
   case Intrinsic::amdgcn_if_break: {
     MachineBasicBlock *BB = I.getParent();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/convergence-tokens.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/convergence-tokens.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+
+define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
+; CHECK-LABEL: test_readfirstlane:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_readfirstlane_b32 s4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v2, s4
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %t = call token @llvm.experimental.convergence.anchor()
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ]
+  store i32 %readfirstlane, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+
+declare token @llvm.experimental.convergence.entry()
+declare token @llvm.experimental.convergence.anchor()
+declare token @llvm.experimental.convergence.loop()
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-convergence-tokens.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-convergence-tokens.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+
+define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
+  ; CHECK-LABEL: name: test_readfirstlane
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s0) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.experimental.convergence.anchor)
+  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY2]](s32), implicit [[INTRINSIC_CONVERGENT]](s0)
+  ; CHECK-NEXT:   G_STORE [[INTRINSIC_CONVERGENT1]](s32), [[MV]](p1) :: (store (s32) into %ir.out, addrspace 1)
+  ; CHECK-NEXT:   SI_RETURN
+  %t = call token @llvm.experimental.convergence.anchor()
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ]
"convergencectrl"(token %t) ] + store i32 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +declare i32 @llvm.amdgcn.readfirstlane(i32) #0 + +declare token @llvm.experimental.convergence.entry() +declare token @llvm.experimental.convergence.anchor() +declare token @llvm.experimental.convergence.loop() + +attributes #0 = { nounwind readnone convergent } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir deleted file mode 100644 --- a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: not llc -mtriple=aarch64-- -run-pass none -o /dev/null %s 2>&1 | FileCheck %s -# When a low-level type is 0 bits ---- -name: test_scalar_size_0 -body: | - bb.0: - liveins: $x0 - ; CHECK: [[@LINE+1]]:10: invalid size for scalar type - %0:_(s0) = G_IMPLICIT_DEF -... diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir --- a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir +++ b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir @@ -5,6 +5,6 @@ body: | bb.0: liveins: $x0 - ; CHECK: [[@LINE+1]]:15: invalid size for scalar type + ; CHECK: [[@LINE+1]]:15: invalid size for scalar element in vector %0:_(<2 x s0>) = G_IMPLICIT_DEF ...