diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -220,12 +220,21 @@ // The 1st parameter is a mask for the types of instructions that may be allowed // to cross the SCHED_BARRIER during scheduling. -// MASK = 0: No instructions may be scheduled across SCHED_BARRIER. -// MASK = 1: Non-memory, non-side-effect producing instructions may be -// scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. +// MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be +// scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. +// MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER. +// MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER. 
def int_amdgcn_sched_barrier : GCCBuiltin<"__builtin_amdgcn_sched_barrier">, - Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, - IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; + Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrConvergent, + IntrWillReturn]>; def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">, Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h rename from llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h rename to llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -14,7 +14,8 @@ namespace llvm { -std::unique_ptr createMFMAIGroupLPDAGMutation(); +std::unique_ptr createIGroupLPDAGMutation(); +std::unique_ptr createSchedBarrierDAGMutation(); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -0,0 +1,439 @@ +//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file This file defines a set of schedule DAG mutations that can be used to +// override default scheduler behavior to enforce specific scheduling patterns. +// They should be used in cases where runtime performance considerations such as +// inter-wavefront interactions, mean that compile-time heuristics cannot +// predict the optimal instruction ordering, or in kernels where optimum +// instruction scheduling is important enough to warrant manual intervention. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUIGroupLP.h" +#include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/TargetOpcodes.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +namespace { + +static cl::opt + EnableIGroupLP("amdgpu-igrouplp", + cl::desc("Enable construction of Instruction Groups and " + "their ordering for scheduling"), + cl::init(false)); + +static cl::opt> + VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in VMEM group.")); + +static cl::opt> + MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in MFMA group.")); + +static cl::opt> + LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds read group.")); + +static cl::opt> + LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None), + cl::Hidden, + cl::desc("The maximum number of instructions to include " + "in lds/gds write group.")); + +typedef function_ref + CanAddMIFn; + +// Classify instructions into groups to enable fine tuned control over the +// scheduler. These groups may be more specific than current SchedModel +// instruction classes. +class SchedGroup { +private: + // Function that returns true if a non-bundle MI may be inserted into this + // group. + const CanAddMIFn canAddMI; + + // Maximum number of SUnits that can be added to this group. + Optional MaxSize; + + // Collection of SUnits that are classified as members of this group. 
+ SmallVector Collection; + + ScheduleDAGInstrs *DAG; + + void tryAddEdge(SUnit *A, SUnit *B) { + if (A != B && DAG->canAddEdge(B, A)) { + DAG->addEdge(B, SDep(A, SDep::Artificial)); + LLVM_DEBUG(dbgs() << "Adding edge...\n" + << "from: SU(" << A->NodeNum << ") " << *A->getInstr() + << "to: SU(" << B->NodeNum << ") " << *B->getInstr()); + } + } + +public: + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If + // MakePred is true, SU will be a predecessor of the SUnits in this + // SchedGroup, otherwise SU will be a successor. + void link(SUnit &SU, bool MakePred = false) { + for (auto A : Collection) { + SUnit *B = &SU; + if (MakePred) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use + // the predicate to determine whether SU should be a predecessor (P = true) + // or a successor (P = false) of this SchedGroup. + void link(SUnit &SU, function_ref P) { + for (auto A : Collection) { + SUnit *B = &SU; + if (P(A, B)) + std::swap(A, B); + + tryAddEdge(A, B); + } + } + + // Add DAG dependencies such that SUnits in this group shall be ordered + // before SUnits in OtherGroup. + void link(SchedGroup &OtherGroup) { + for (auto B : OtherGroup.Collection) + link(*B); + } + + // Returns true if no more instructions may be added to this group. + bool isFull() { return MaxSize.hasValue() && Collection.size() >= *MaxSize; } + + // Returns true if SU can be added to this SchedGroup. + bool canAddSU(SUnit &SU, const SIInstrInfo *TII) { + if (isFull()) + return false; + + MachineInstr &MI = *SU.getInstr(); + if (MI.getOpcode() != TargetOpcode::BUNDLE) + return canAddMI(MI, TII); + + // Special case for bundled MIs. + const MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; + while (E != MBB->end() && E->isBundledWithPred()) + ++E; + + // Return true if all of the bundled MIs can be added to this group. 
+ return std::all_of( + B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); }); + } + + void add(SUnit &SU) { Collection.push_back(&SU); } + + SchedGroup(CanAddMIFn canAddMI, Optional MaxSize, + ScheduleDAGInstrs *DAG) + : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {} +}; + +bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isMFMA(MI); +} + +bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVALU(MI) && !TII->isMFMA(MI); +} + +bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isSALU(MI); +} + +bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)); +} + +bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && + (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))); +} + +bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayStore() && TII->isDS(MI); +} + +bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) { + return MI.mayLoad() && TII->isDS(MI); +} + +class IGroupLPDAGMutation : public ScheduleDAGMutation { +public: + const SIInstrInfo *TII; + ScheduleDAGMI *DAG; + + IGroupLPDAGMutation() = default; + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; + +// DAG mutation that coordinates with the SCHED_BARRIER instruction and +// corresponding builtin. The mutation adds edges from specific instruction +// classes determined by the SCHED_BARRIER mask so that they cannot be +// scheduled around the SCHED_BARRIER. 
+class SchedBarrierDAGMutation : public ScheduleDAGMutation { +private: + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + // Components of the mask that determines which instructions may be + // scheduled across the SCHED_BARRIER. + enum class SchedBarrierMasks { + NONE = 0u, + ALU = 1u << 0, + VALU = 1u << 1, + SALU = 1u << 2, + MFMA = 1u << 3, + VMEM = 1u << 4, + VMEM_READ = 1u << 5, + VMEM_WRITE = 1u << 6, + DS = 1u << 7, + DS_READ = 1u << 8, + DS_WRITE = 1u << 9, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE) + }; + + // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a + // region. + // NOTE(review): not reset between apply() calls -- verify no stale SUnits. + std::unique_ptr MFMASchedGroup = nullptr; + std::unique_ptr VALUSchedGroup = nullptr; + std::unique_ptr SALUSchedGroup = nullptr; + std::unique_ptr VMEMReadSchedGroup = nullptr; + std::unique_ptr VMEMWriteSchedGroup = nullptr; + std::unique_ptr DSWriteSchedGroup = nullptr; + std::unique_ptr DSReadSchedGroup = nullptr; + + // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should + // not be reordered across the SCHED_BARRIER. + void getSchedGroupsFromMask(int32_t Mask, + SmallVectorImpl &SchedGroups); + + // Add DAG edges that enforce SCHED_BARRIER ordering. + void addSchedBarrierEdges(SUnit &SU); + + // Classify instructions and add them to the SchedGroup. + void initSchedGroup(SchedGroup *SG); + + // Remove all existing edges from a SCHED_BARRIER. 
+ void resetSchedBarrierEdges(SUnit &SU); + +public: + void apply(ScheduleDAGInstrs *DAGInstrs) override; + + SchedBarrierDAGMutation() = default; +}; + +void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); + TII = ST.getInstrInfo(); + DAG = static_cast(DAGInstrs); + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); + + // The order of InstructionGroups in this vector defines the + // order in which edges will be added. In other words, given the + // present ordering, we will try to make each VMEMRead instruction + // a predecessor of each DSRead instruction, and so on. + SmallVector PipelineOrderGroups = { + SchedGroup(&isVMEMSGMember, VMEMGroupMaxSize, DAG), + SchedGroup(&isDSReadSGMember, LDRGroupMaxSize, DAG), + SchedGroup(&isMFMASGMember, MFMAGroupMaxSize, DAG), + SchedGroup(&isDSWriteSGMember, LDWGroupMaxSize, DAG)}; + + for (SUnit &SU : DAG->SUnits) { + LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); + for (auto &SG : PipelineOrderGroups) + if (SG.canAddSU(SU, TII)) + SG.add(SU); + } + + for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) { + auto &GroupA = PipelineOrderGroups[i]; + for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) { + auto &GroupB = PipelineOrderGroups[j]; + GroupA.link(GroupB); + } + } +} + +void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAGInstrs->SUnits.empty()) + return; + + LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n"); + + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); + TII = ST.getInstrInfo(); + DAG = static_cast(DAGInstrs); + for (auto &SU : DAG->SUnits) + if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) + addSchedBarrierEdges(SU); +} + +void 
SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { + MachineInstr &MI = *SchedBarrier.getInstr(); + assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); + // Remove all existing edges from the SCHED_BARRIER that were added due to the + // instruction having side effects. + resetSchedBarrierEdges(SchedBarrier); + SmallVector SchedGroups; + int32_t Mask = MI.getOperand(0).getImm(); + getSchedGroupsFromMask(Mask, SchedGroups); + for (auto SG : SchedGroups) + SG->link( + SchedBarrier, (function_ref)[]( + const SUnit *A, const SUnit *B) { + return A->NodeNum > B->NodeNum; + }); +} + +void SchedBarrierDAGMutation::getSchedGroupsFromMask( + int32_t Mask, SmallVectorImpl &SchedGroups) { + SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask; + // See IntrinsicsAMDGPU.td for an explanation of these masks and their + // mappings. + // + if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!VALUSchedGroup) { + VALUSchedGroup = std::make_unique(isVALUSGMember, None, DAG); + initSchedGroup(VALUSchedGroup.get()); + } + + SchedGroups.push_back(VALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!SALUSchedGroup) { + SALUSchedGroup = std::make_unique(isSALUSGMember, None, DAG); + initSchedGroup(SALUSchedGroup.get()); + } + + SchedGroups.push_back(SALUSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { + if (!MFMASchedGroup) { + MFMASchedGroup = std::make_unique(isMFMASGMember, None, DAG); + initSchedGroup(MFMASchedGroup.get()); + } + + SchedGroups.push_back(MFMASchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMReadSchedGroup) { + 
VMEMReadSchedGroup = + std::make_unique(isVMEMReadSGMember, None, DAG); + initSchedGroup(VMEMReadSchedGroup.get()); + } + + SchedGroups.push_back(VMEMReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { + if (!VMEMWriteSchedGroup) { + VMEMWriteSchedGroup = + std::make_unique(isVMEMWriteSGMember, None, DAG); + initSchedGroup(VMEMWriteSchedGroup.get()); + } + + SchedGroups.push_back(VMEMWriteSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSReadSchedGroup) { + DSReadSchedGroup = + std::make_unique(isDSReadSGMember, None, DAG); + initSchedGroup(DSReadSchedGroup.get()); + } + + SchedGroups.push_back(DSReadSchedGroup.get()); + } + + if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE && + (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { + if (!DSWriteSchedGroup) { + DSWriteSchedGroup = + std::make_unique(isDSWriteSGMember, None, DAG); + initSchedGroup(DSWriteSchedGroup.get()); + } + + SchedGroups.push_back(DSWriteSchedGroup.get()); + } +} + +void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) { + assert(SG); + for (auto &SU : DAG->SUnits) + if (SG->canAddSU(SU, TII)) + SG->add(SU); +} + +void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) { + assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER); + const auto Preds = SU.Preds; // Copy: removePred() erases from SU.Preds. + for (const SDep &P : Preds) + SU.removePred(P); + const auto Succs = SU.Succs; // Copy: removePred() erases from SU.Succs. + for (const SDep &S : Succs) { + const auto SuccPreds = S.getSUnit()->Preds; // Copy, for the same reason. + for (const SDep &SP : SuccPreds) + if (SP.getSUnit() == &SU) + S.getSUnit()->removePred(SP); + } +} + +} // namespace + +namespace llvm { + +std::unique_ptr createIGroupLPDAGMutation() { + return EnableIGroupLP ? 
std::make_unique() : nullptr; +} + +std::unique_ptr createSchedBarrierDAGMutation() { + return std::make_unique(); +} + +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp deleted file mode 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMFMAIGroupLP.cpp +++ /dev/null @@ -1,219 +0,0 @@ -//===--- AMDGPUMFMAIGroupLP.cpp - AMDGPU MFMA IGroupLP ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file This file contains a DAG scheduling mutation which tries to coerce -// the scheduler into generating an ordering based on ordering of groups -// of instructions. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMFMAIGroupLP.h" -#include "AMDGPUTargetMachine.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/CodeGen/TargetOpcodes.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-MFMA-IGroupLP" - -namespace { - -static cl::opt - EnableMFMAIGroupLP("amdgpu-mfma-igrouplp", - cl::desc("Enable construction of Instruction Groups and " - "their ordering for scheduling"), - cl::init(false)); - -static cl::opt - VMEMGroupMaxSize("amdgpu-mfma-igrouplp-vmem-group-size", cl::init(-1), - cl::Hidden, - cl::desc("The maximum number of instructions to include " - "in VMEM group.")); - -static cl::opt - MFMAGroupMaxSize("amdgpu-mfma-igrouplp-mfma-group-size", cl::init(-1), - cl::Hidden, - cl::desc("The maximum number of instructions to include " - "in MFMA group.")); - -static cl::opt - LDRGroupMaxSize("amdgpu-mfma-igrouplp-ldr-group-size", cl::init(-1), - cl::Hidden, - 
cl::desc("The maximum number of instructions to include " - "in lds/gds read group.")); - -static cl::opt - LDWGroupMaxSize("amdgpu-mfma-igrouplp-ldw-group-size", cl::init(-1), - cl::Hidden, - cl::desc("The maximum number of instructions to include " - "in lds/gds write group.")); - -typedef function_ref IsInstructionType; - -struct InstructionClass { - SmallVector Collection; - const IsInstructionType isInstructionClass; - // MaxSize is initialized to -1 by default, if MaxSize is < 0, then - // the collection will not have a size limit - const int MaxSize; - - InstructionClass(IsInstructionType IsInstructionClass, int maxSize) - : isInstructionClass(IsInstructionClass), MaxSize(maxSize){}; - - bool IsFull() { return !(MaxSize <= 0) && (int)Collection.size() >= MaxSize; } -}; - -class MFMAIGroupLPDAGMutation : public ScheduleDAGMutation { -public: - const SIInstrInfo *TII; - ScheduleDAGMI *DAG; - - MFMAIGroupLPDAGMutation() = default; - void apply(ScheduleDAGInstrs *DAGInstrs) override; -}; - -static void collectSUnits(SmallVectorImpl &PipelineOrder, - const SIInstrInfo *TII, ScheduleDAGInstrs *DAG) { - for (SUnit &SU : DAG->SUnits) { - LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); - - // Presently, a bundle only counts as one instruction towards - // the group's maximum size - if (SU.getInstr()->getOpcode() == TargetOpcode::BUNDLE) { - MachineInstr *MI = SU.getInstr(); - MachineBasicBlock::instr_iterator BundledMI = MI->getIterator(); - ++BundledMI; - - LLVM_DEBUG(dbgs() << "Checking bundled insts\n";); - - InstructionClass *MatchingStage = nullptr; - for (auto Stage : PipelineOrder) { - if (Stage->isInstructionClass(*BundledMI) && !Stage->IsFull()) { - MatchingStage = Stage; - break; - } - } - - if (MatchingStage != nullptr) { - while (MatchingStage->isInstructionClass(*BundledMI)) { - if (!BundledMI->isBundledWithSucc()) - break; - ++BundledMI; - } - - if (!BundledMI->isBundledWithSucc()) { - LLVM_DEBUG(dbgs() << "Bundle is all of same type\n";); - 
MatchingStage->Collection.push_back(&SU); - } - } - } - - for (InstructionClass *Stage : PipelineOrder) { - if (Stage->isInstructionClass(*SU.getInstr()) && !Stage->IsFull()) { - Stage->Collection.push_back(&SU); - } - } - } -} - -static void -addPipelineEdges(const llvm::ArrayRef PipelineOrder, - ScheduleDAGInstrs *DAG) { - for (int i = 0; i < (int)PipelineOrder.size() - 1; i++) { - auto StageA = PipelineOrder[i]; - for (int j = i + 1; j < (int)PipelineOrder.size(); j++) { - auto StageB = PipelineOrder[j]; - for (auto SUnitA : StageA->Collection) { - LLVM_DEBUG(dbgs() << "Adding edges for: "; DAG->dumpNode(*SUnitA);); - for (auto SUnitB : StageB->Collection) { - if (DAG->canAddEdge(SUnitB, SUnitA)) { - DAG->addEdge(SUnitB, SDep(SUnitA, SDep::Artificial)); - LLVM_DEBUG(dbgs() << "Added edge to: "; DAG->dumpNode(*SUnitB);); - } else { - LLVM_DEBUG(dbgs() << "Can't add edge to: "; - DAG->dumpNode(*SUnitB);); - } - } - } - } - } -} - -void MFMAIGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { - const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); - TII = ST.getInstrInfo(); - if (!ST.hasMAIInsts()) - return; - DAG = static_cast(DAGInstrs); - const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); - if (!TSchedModel || DAG->SUnits.empty()) - return; - - const IsInstructionType isMFMAFn = [this](const MachineInstr &MI) { - if (TII->isMAI(MI) && MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && - MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64) { - LLVM_DEBUG(dbgs() << "Found MFMA\n";); - return true; - } - return false; - }; - InstructionClass MFMASUnits(isMFMAFn, MFMAGroupMaxSize); - - const IsInstructionType isVMEMReadFn = [this](const MachineInstr &MI) { - if (((TII->isFLAT(MI) && !TII->isDS(MI)) || TII->isVMEM(MI)) && - MI.mayLoad()) { - LLVM_DEBUG(dbgs() << "Found VMEM read\n";); - return true; - } - return false; - }; - InstructionClass VMEMReadSUnits(isVMEMReadFn, VMEMGroupMaxSize); - - const IsInstructionType isDSWriteFn = [this](const 
MachineInstr &MI) { - if (TII->isDS(MI) && MI.mayStore()) { - LLVM_DEBUG(dbgs() << "Found DS Write\n";); - return true; - } - return false; - }; - InstructionClass DSWriteSUnits(isDSWriteFn, LDWGroupMaxSize); - - const IsInstructionType isDSReadFn = [this](const MachineInstr &MI) { - if (TII->isDS(MI) && MI.mayLoad()) { - LLVM_DEBUG(dbgs() << "Found DS Read\n";); - return true; - } - return false; - }; - InstructionClass DSReadSUnits(isDSReadFn, LDRGroupMaxSize); - - // The order of InstructionClasses in this vector defines the - // order in which edges will be added. In other words, given the - // present ordering, we will try to make each VMEMRead instruction - // a predecessor of each DSRead instruction, and so on. - SmallVector PipelineOrder = { - &VMEMReadSUnits, &DSReadSUnits, &MFMASUnits, &DSWriteSUnits}; - - collectSUnits(PipelineOrder, TII, DAG); - - addPipelineEdges(PipelineOrder, DAG); -} - -} // namespace - -namespace llvm { - -std::unique_ptr createMFMAIGroupLPDAGMutation() { - return EnableMFMAIGroupLP ? 
std::make_unique() - : nullptr; -} - -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -16,7 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUAliasAnalysis.h" #include "AMDGPUExportClustering.h" -#include "AMDGPUMFMAIGroupLP.h" +#include "AMDGPUIGroupLP.h" #include "AMDGPUMacroFusion.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" @@ -399,7 +399,8 @@ ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createMFMAIGroupLPDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createSchedBarrierDAGMutation()); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -898,7 +899,8 @@ const GCNSubtarget &ST = C->MF->getSubtarget(); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); - DAG->addMutation(createMFMAIGroupLPDAGMutation()); + DAG->addMutation(createIGroupLPDAGMutation()); + DAG->addMutation(createSchedBarrierDAGMutation()); return DAG; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -75,7 +75,7 @@ AMDGPUMachineModuleInfo.cpp AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp - AMDGPUMFMAIGroupLP.cpp + AMDGPUIGroupLP.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp AMDGPUPerfHintAnalysis.cpp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -define amdgpu_kernel void @test_wave_barrier() #0 { -; GCN-LABEL: test_wave_barrier: +define amdgpu_kernel void @test_sched_barrier() #0 { +; GCN-LABEL: test_sched_barrier: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: ; sched_barrier mask(0x00000000) ; GCN-NEXT: ; sched_barrier mask(0x00000001) diff --git a/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir b/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir --- a/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir +++ b/llvm/test/CodeGen/AMDGPU/mfma-igrouplp-dag-mutation.mir @@ -181,3 +181,41 @@ $agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec DS_WRITE_B32 $vgpr9, $vgpr24, 0, 16, implicit $m0, implicit $exec ... 
+ +--- +name: block_ends_in_bundle +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; DEFAULT-LABEL: name: block_ends_in_bundle + ; DEFAULT: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; DEFAULT-NEXT: {{ $}} + ; DEFAULT-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { + ; DEFAULT-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; DEFAULT-NEXT: $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; DEFAULT-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; DEFAULT-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; DEFAULT-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; DEFAULT-NEXT: } + ; DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; PIPELINE-LABEL: name: block_ends_in_bundle + ; PIPELINE: liveins: $vgpr0, $vgpr1, $vgpr7, $agpr0_agpr1_agpr2_agpr3 + ; PIPELINE-NEXT: {{ $}} + ; PIPELINE-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit killed $vgpr7, implicit $exec { + ; PIPELINE-NEXT: $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + ; PIPELINE-NEXT: 
$vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + ; PIPELINE-NEXT: $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + ; PIPELINE-NEXT: $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + ; PIPELINE-NEXT: $vgpr16 = DS_READ_U16_gfx9 killed $vgpr7, 0, 2048, implicit $exec + ; PIPELINE-NEXT: } + ; PIPELINE-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr1, killed $vgpr0, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr7, implicit $exec { + $vgpr10 = DS_READ_U16_gfx9 $vgpr7, 0, 512, implicit $exec + $vgpr11 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + $vgpr12 = DS_READ_U16_gfx9 $vgpr7, 0, 1024, implicit $exec + $vgpr15 = DS_READ_U16_gfx9 $vgpr7, 0, 4096, implicit $exec + $vgpr16 = DS_READ_U16_gfx9 $vgpr7, 0, 2048, implicit $exec + } +... 
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir @@ -0,0 +1,122 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=postmisched -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... + +--- +name: no_sched_barrier +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: no_sched_barrier + ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { + ; CHECK-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: } + ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec + ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec + ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed 
$vgpr2 { + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: } + ; CHECK-NEXT: S_ENDPGM 0 + renamable $sgpr0_sgpr1 = IMPLICIT_DEF + renamable $vgpr0 = IMPLICIT_DEF + BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { + renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + } + renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec + renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec + BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 { + GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + } + S_ENDPGM 0 +... + +# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. 
+ +--- +name: sched_barrier_mask_0 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_0 + ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { + ; CHECK-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: } + ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 0 + ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec + ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 { + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: } + ; CHECK-NEXT: S_ENDPGM 0 + renamable $sgpr0_sgpr1 = IMPLICIT_DEF + renamable $vgpr0 = IMPLICIT_DEF + BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { + renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + } + renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec + SCHED_BARRIER 0 + renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec + BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 { + GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + } + S_ENDPGM 0 +... + +# MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be +# scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. 
+ +--- +name: sched_barrier_mask_1 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_1 + ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { + ; CHECK-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: } + ; CHECK-NEXT: SCHED_BARRIER 1 + ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec + ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec + ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 { + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: } + ; CHECK-NEXT: S_ENDPGM 0 + renamable $sgpr0_sgpr1 = IMPLICIT_DEF + renamable $vgpr0 = IMPLICIT_DEF + BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { + renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, 
implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + } + renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec + SCHED_BARRIER 1 + renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec + BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1, implicit $exec, implicit killed $vgpr2 { + GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + } + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir @@ -0,0 +1,570 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_2(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_4(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + 
define amdgpu_kernel void @sched_barrier_mask_8(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_16(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_64(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_128(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_256(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_512(i32 addrspace(3)* noalias %out, i32 addrspace(3)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_masks_8_12(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_4_bundle(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_barrier_mask_0_bundle(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... 
+ +--- +name: no_sched_barrier +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: no_sched_barrier + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... 
+ +# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_0 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_0 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: SCHED_BARRIER 0 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + SCHED_BARRIER 0 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 
512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +# MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be +# scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. + +--- +name: sched_barrier_mask_1 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_1 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: SCHED_BARRIER 1 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + SCHED_BARRIER 1 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 
512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +# MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_2 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_2 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: SCHED_BARRIER 2 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]] + 
%0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %1, implicit $exec + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %1, implicit $exec + S_NOP 0 + SCHED_BARRIER 2 + %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %7:vgpr_32 = nsw V_MUL_LO_U32_e64 %6, %6, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0, implicit %4, implicit %5 +... + +# MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_4 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw 
V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 4 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:areg_128 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) 
into %ir.out, !noalias !0, addrspace 1) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + S_NOP 0 + %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec + %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec + %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec + %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec + SCHED_BARRIER 4 + %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0, implicit %5, implicit %6, implicit %11 +... + +# MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER. 
+ +--- +name: sched_barrier_mask_8 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_8 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 8 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], 
[[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:areg_128 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + S_NOP 0 + %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec + %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec + %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec + %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec + SCHED_BARRIER 8 + %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, 
!noalias !0, addrspace 1) + S_ENDPGM 0, implicit %5, implicit %6, implicit %11 +... + +# MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_16 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_16 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 16 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + SCHED_BARRIER 16 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + 
%6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +# MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_32 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_32 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: SCHED_BARRIER 32 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + SCHED_BARRIER 32 + %5:vgpr_32 = 
GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +# MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_64 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_64 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 64 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, 
implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_NOP 0 + SCHED_BARRIER 64 + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +# MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_128 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_128 + ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 128 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec + ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec + DS_WRITE_B32 %3, %1, 0, 16, 
implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + S_NOP 0 + SCHED_BARRIER 128 + %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec + DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + S_ENDPGM 0 +... + +# MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_256 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_256 + ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec + ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + ; CHECK-NEXT: SCHED_BARRIER 256 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec + ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec + DS_WRITE_B32 %3, %1, 0, 
16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + S_NOP 0 + SCHED_BARRIER 256 + %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec + DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + S_ENDPGM 0 +... + +# MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_512 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_512 + ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 512 + ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec + ; CHECK-NEXT: dead %0:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + %3:vgpr_32 = nsw V_MUL_LO_U32_e64 %2, %2, implicit $exec + DS_WRITE_B32 %3, %1, 
0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + S_NOP 0 + SCHED_BARRIER 512 + %4:vgpr_32 = DS_READ_U16_gfx9 %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %4, %4, implicit $exec + DS_WRITE_B32 %5, %3, 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3) + S_ENDPGM 0 +... + +# MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER. +# MASK = 0x0000 000C: MFMA and SALU may be scheduled across SCHED_BARRIER. +# Check that S_NOP can be moved before the first SCHED_BARRIER but not the second. + +--- +name: sched_barrier_masks_8_12 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_masks_8_12 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: 
GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: SCHED_BARRIER 12 + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 8 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:areg_128 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + SCHED_BARRIER 12 + 
%7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec + %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec + %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec + %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec + S_NOP 0 + SCHED_BARRIER 8 + S_NOP 0 + %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0, implicit %5, implicit %6, implicit %11 +... + +# MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. + +--- +name: sched_barrier_mask_4_bundle +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_4_bundle + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: BUNDLE [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]] { + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: 
S_NOP 0 + ; CHECK-NEXT: } + ; CHECK-NEXT: SCHED_BARRIER 4 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + SCHED_BARRIER 4 + BUNDLE implicit %3, %5 { + S_NOP 0 + S_NOP 0 + } + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... + +# MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. 
+ +--- +name: sched_barrier_mask_0_bundle +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_barrier_mask_0_bundle + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 0 + ; CHECK-NEXT: BUNDLE [[GLOBAL_LOAD_DWORD_SADDR1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]] { + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: } + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec + SCHED_BARRIER 0 + BUNDLE implicit %3, %5 { + S_NOP 0 + S_NOP 0 + } + GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, 
implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/sched_barrier.mir b/llvm/test/CodeGen/AMDGPU/sched_barrier.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/sched_barrier.mir +++ /dev/null @@ -1,99 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s - ---- | - define amdgpu_kernel void @no_sched_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } - define amdgpu_kernel void @sched_barrier_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } - define amdgpu_kernel void @sched_barrier_1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } - - !0 = distinct !{!0} - !1 = !{!1, !0} -... 
- ---- -name: no_sched_barrier -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: no_sched_barrier - ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec - ; CHECK-NEXT: S_NOP 0 - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %0:sreg_64 = IMPLICIT_DEF - %1:vgpr_32 = IMPLICIT_DEF - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec - GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - S_NOP 0 - %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec - GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - S_ENDPGM 0 -... 
- ---- -name: sched_barrier_0 -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: sched_barrier_0 - ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; CHECK-NEXT: S_NOP 0 - ; CHECK-NEXT: SCHED_BARRIER 0 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %0:sreg_64 = IMPLICIT_DEF - %1:vgpr_32 = IMPLICIT_DEF - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec - GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - S_NOP 0 - SCHED_BARRIER 0 - %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec - GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - S_ENDPGM 0 
-... - ---- -name: sched_barrier_1 -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: name: sched_barrier_1 - ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; CHECK-NEXT: SCHED_BARRIER 1 - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec - ; CHECK-NEXT: S_NOP 0 - ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - ; CHECK-NEXT: S_ENDPGM 0 - %0:sreg_64 = IMPLICIT_DEF - %1:vgpr_32 = IMPLICIT_DEF - %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec - GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - S_NOP 0 - SCHED_BARRIER 1 - %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) - %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %5, %5, implicit $exec - GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) - 
S_ENDPGM 0 -...