diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,6 +63,7 @@ BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") +BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -409,6 +409,19 @@ __builtin_amdgcn_sched_barrier(15); } +// CHECK-LABEL: @test_sched_group_barrier +// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 1, i32 2) +// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 2, i32 4) +// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 8, i32 16) +// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 15, i32 10000, i32 -1) +void test_sched_group_barrier() +{ + __builtin_amdgcn_sched_group_barrier(0, 1, 2); + __builtin_amdgcn_sched_group_barrier(1, 2, 4); + __builtin_amdgcn_sched_group_barrier(4, 8, 16); + __builtin_amdgcn_sched_group_barrier(15, 10000, -1); +} + // CHECK-LABEL: @test_s_sleep // CHECK: call void @llvm.amdgcn.s.sleep(i32 1) // CHECK: call void @llvm.amdgcn.s.sleep(i32 15) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl @@ -65,6 +65,11 @@ __builtin_amdgcn_sched_barrier(x); // expected-error {{argument to '__builtin_amdgcn_sched_barrier' must be a constant integer}} } +void test_sched_group_barrier(int x) +{ + __builtin_amdgcn_sched_group_barrier(x, 0, 1); // expected-error {{argument to '__builtin_amdgcn_sched_group_barrier' must be a constant integer}} +} + void test_sicmp_i32(global ulong* out, int a, int b, uint c) { *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -236,6 +236,19 @@ Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; +// The first parameter is a mask that determines the types of instructions that +// you would like to synchronize around and add to a scheduling group. The +// values of the mask are defined above for sched_barrier. These instructions +// will be selected from the bottom up starting from the sched_group_barrier's +// location during instruction scheduling. The second parameter is the number of +// matching instructions that will be associated with this sched_group_barrier. +// The third parameter is an identifier which is used to describe what other +// sched_group_barriers should be synchronized with. 
+def int_amdgcn_sched_group_barrier : GCCBuiltin<"__builtin_amdgcn_sched_group_barrier">,
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
+  IntrConvergent, IntrWillReturn]>;
+
 def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
   Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -21,6 +21,7 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 
@@ -60,133 +61,102 @@
                cl::desc("The maximum number of instructions to include "
                         "in lds/gds write group."));
 
-typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
-    CanAddMIFn;
+// Components of the mask that determines which instruction types may be
+// classified into a SchedGroup.
+enum class SchedGroupMask {
+  NONE = 0u,
+  ALU = 1u << 0,
+  VALU = 1u << 1,
+  SALU = 1u << 2,
+  MFMA = 1u << 3,
+  VMEM = 1u << 4,
+  VMEM_READ = 1u << 5,
+  VMEM_WRITE = 1u << 6,
+  DS = 1u << 7,
+  DS_READ = 1u << 8,
+  DS_WRITE = 1u << 9,
+  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
+        DS_READ | DS_WRITE,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
+};
 
 // Classify instructions into groups to enable fine tuned control over the
 // scheduler. These groups may be more specific than current SchedModel
 // instruction classes.
 class SchedGroup {
 private:
-  // Function that returns true if a non-bundle MI may be inserted into this
-  // group.
-  const CanAddMIFn canAddMI;
+  // Mask that defines which instruction types can be classified into this
+  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
+  // and SCHED_GROUP_BARRIER.
+  SchedGroupMask SGMask;
 
   // Maximum number of SUnits that can be added to this group.
   Optional<unsigned> MaxSize;
 
+  // SchedGroups will only synchronize with other SchedGroups that have the same
+  // SyncID.
+  int SyncID = 0;
+
   // Collection of SUnits that are classified as members of this group.
   SmallVector<SUnit *, 32> Collection;
 
   ScheduleDAGInstrs *DAG;
 
-  void tryAddEdge(SUnit *A, SUnit *B) {
-    if (A != B && DAG->canAddEdge(B, A)) {
-      DAG->addEdge(B, SDep(A, SDep::Artificial));
-      LLVM_DEBUG(dbgs() << "Adding edge...\n"
-                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
-                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
-    }
-  }
+  const SIInstrInfo *TII;
+
+  // Try to add an edge from SU A to SU B.
+  bool tryAddEdge(SUnit *A, SUnit *B);
+
+  // Use SGMask to determine whether we can classify MI as a member of this
+  // SchedGroup object.
+  bool canAddMI(const MachineInstr &MI) const;
+
+  // Returns true if SU can be added to this SchedGroup.
+  bool canAddSU(SUnit &SU) const;
+
+  // Returns true if no more instructions may be added to this group.
+  bool isFull() const;
+
+  // Add SU to the SchedGroup.
+  void add(SUnit &SU) { Collection.push_back(&SU); }
 
 public:
   // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
   // MakePred is true, SU will be a predecessor of the SUnits in this
   // SchedGroup, otherwise SU will be a successor.
- void link(SUnit &SU, bool MakePred = false) { - for (auto A : Collection) { - SUnit *B = &SU; - if (MakePred) - std::swap(A, B); - - tryAddEdge(A, B); - } - } + void link(SUnit &SU, bool MakePred = false); // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use // the predicate to determine whether SU should be a predecessor (P = true) // or a successor (P = false) of this SchedGroup. - void link(SUnit &SU, function_ref P) { - for (auto A : Collection) { - SUnit *B = &SU; - if (P(A, B)) - std::swap(A, B); - - tryAddEdge(A, B); - } - } + void link(SUnit &SU, function_ref P); // Add DAG dependencies such that SUnits in this group shall be ordered // before SUnits in OtherGroup. - void link(SchedGroup &OtherGroup) { - for (auto B : OtherGroup.Collection) - link(*B); - } + void link(SchedGroup &OtherGroup); - // Returns true if no more instructions may be added to this group. - bool isFull() { return MaxSize.hasValue() && Collection.size() >= *MaxSize; } + // Identify and add all relevant SUs from the DAG to this SchedGroup. + void initSchedGroup(); - // Returns true if SU can be added to this SchedGroup. - bool canAddSU(SUnit &SU, const SIInstrInfo *TII) { - if (isFull()) - return false; + // Add instructions to the SchedGroup bottom up starting from RIter. + // ConflictedInstrs is a set of instructions that should not be added to the + // SchedGroup even when the other conditions for adding it are satisfied. + // RIter will be added to the SchedGroup as well, and dependencies will be + // added so that RIter will always be scheduled at the end of the group. + void initSchedGroup(std::vector::reverse_iterator RIter, + DenseSet &ConflictedInstrs); - MachineInstr &MI = *SU.getInstr(); - if (MI.getOpcode() != TargetOpcode::BUNDLE) - return canAddMI(MI, TII); + int getSyncID() { return SyncID; } - // Special case for bundled MIs. - const MachineBasicBlock *MBB = MI.getParent(); - MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; - while (E != MBB->end() && E->isBundledWithPred()) - ++E; - - // Return true if all of the bundled MIs can be added to this group. 
-  return std::all_of(
-      B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
-  }
-
-  void add(SUnit &SU) { Collection.push_back(&SU); }
+  SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize,
+             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {}
 
-  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
-             ScheduleDAGInstrs *DAG)
-      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
+  SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize, int SyncID,
+             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {}
 };
 
-bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isMFMA(MI);
-}
-
-bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isVALU(MI) && !TII->isMFMA(MI);
-}
-
-bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isSALU(MI);
-}
-
-bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
-}
-
-bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayLoad() &&
-         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
-}
-
-bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayStore() &&
-         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
-}
-
-bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayStore() && TII->isDS(MI);
-}
-
-bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
-  return MI.mayLoad() && TII->isDS(MI);
-}
-
 class IGroupLPDAGMutation : public ScheduleDAGMutation {
 public:
   const SIInstrInfo *TII;
@@ -199,54 +169,41 @@
 // DAG mutation that coordinates with the SCHED_BARRIER instruction and
 // corresponding builtin. The mutation adds edges from specific instruction
 // classes determined by the SCHED_BARRIER mask so that they cannot be
-// scheduled around the SCHED_BARRIER.
 class SchedBarrierDAGMutation : public ScheduleDAGMutation {
 private:
   const SIInstrInfo *TII;
 
   ScheduleDAGMI *DAG;
 
-  // Components of the mask that determines which instructions may not be
-  // scheduled across the SCHED_BARRIER.
-  enum class SchedBarrierMasks {
-    NONE = 0u,
-    ALU = 1u << 0,
-    VALU = 1u << 1,
-    SALU = 1u << 2,
-    MFMA = 1u << 3,
-    VMEM = 1u << 4,
-    VMEM_READ = 1u << 5,
-    VMEM_WRITE = 1u << 6,
-    DS = 1u << 7,
-    DS_READ = 1u << 8,
-    DS_WRITE = 1u << 9,
-    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
-  };
-
-  // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
-  // region.
-  //
-  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
-  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
-  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
-  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
-  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
-  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
-  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;
+  // Organize lists of SchedGroups by their SyncID. SchedGroups /
+  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
+  // between them.
+  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroupsMap;
 
-  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
-  // not be reordered accross the SCHED_BARRIER.
-  void getSchedGroupsFromMask(int32_t Mask,
-                              SmallVectorImpl<SchedGroup *> &SchedGroups);
+  // Used to track instructions that are already added to a different
+  // SchedGroup with the same SyncID.
+  DenseMap<int, DenseSet<SUnit *>> SyncedInstrsMap;
 
   // Add DAG edges that enforce SCHED_BARRIER ordering.
   void addSchedBarrierEdges(SUnit &SU);
 
-  // Classify instructions and add them to the SchedGroup.
-  void initSchedGroup(SchedGroup *SG);
-
-  // Remove all existing edges from a SCHED_BARRIER.
-  void resetSchedBarrierEdges(SUnit &SU);
+  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
+  // not be reordered across the SCHED_BARRIER. This is used for the base
+  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
+  // SCHED_BARRIER will always block all instructions that can be classified
+  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
+  // and may only synchronize with some SchedGroups. Returns the inverse of
+  // Mask. SCHED_BARRIER's mask describes which instruction types should be
+  // allowed to be scheduled across it. Invert the mask to get the
+  // SchedGroupMask of instructions that should be barred.
+  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
+
+  // Create SchedGroups for a SCHED_GROUP_BARRIER.
+  void initSchedGroupBarrier(std::vector<SUnit>::reverse_iterator RIter);
+
+  // Add DAG edges that try to enforce ordering defined by SCHED_GROUP_BARRIER
+  // instructions.
+  void addSchedGroupBarrierEdges();
 
 public:
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
@@ -254,6 +211,175 @@
   SchedBarrierDAGMutation() = default;
 };
 
+bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
+  if (A != B && DAG->canAddEdge(B, A)) {
+    DAG->addEdge(B, SDep(A, SDep::Artificial));
+    LLVM_DEBUG(dbgs() << "Adding edge...\n"
+                      << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
+                      << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
+    return true;
+  }
+  return false;
+}
+
+bool SchedGroup::canAddMI(const MachineInstr &MI) const {
+  bool Result = false;
+  if (MI.isMetaInstruction())
+    Result = false;
+
+  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
+           (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI)))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
+           TII->isVALU(MI) && !TII->isMFMA(MI))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
+           TII->isSALU(MI))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
+           TII->isMFMA(MI))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
+           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
+           MI.mayLoad() &&
+           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
+           MI.mayStore() &&
+           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
+           TII->isDS(MI))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
+           MI.mayLoad() && TII->isDS(MI))
+    Result = true;
+
+  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
+           MI.mayStore() && TII->isDS(MI))
+    Result = true;
+
+  LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
+                    << format_hex((int)SGMask, 10, true)
+                    << (Result ?
" added " : " unable to add ") << MI); + + return Result; +} + +void SchedGroup::link(SUnit &SU, bool MakePred) { + for (auto A : Collection) { + SUnit *B = &SU; + if (MakePred) + std::swap(A, B); + + tryAddEdge(A, B); + } +} + +void SchedGroup::link(SUnit &SU, + function_ref P) { + for (auto A : Collection) { + SUnit *B = &SU; + if (P(A, B)) + std::swap(A, B); + + tryAddEdge(A, B); + } +} + +void SchedGroup::link(SchedGroup &OtherGroup) { + for (auto B : OtherGroup.Collection) + link(*B); +} + +bool SchedGroup::isFull() const { + return MaxSize.hasValue() && Collection.size() >= *MaxSize; +} + +bool SchedGroup::canAddSU(SUnit &SU) const { + MachineInstr &MI = *SU.getInstr(); + if (MI.getOpcode() != TargetOpcode::BUNDLE) + return canAddMI(MI); + + // Special case for bundled MIs. + const MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; + while (E != MBB->end() && E->isBundledWithPred()) + ++E; + + // Return true if all of the bundled MIs can be added to this group. + return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); +} + +void SchedGroup::initSchedGroup() { + for (auto &SU : DAG->SUnits) { + if (isFull()) + break; + + if (canAddSU(SU)) + add(SU); + } +} + +void SchedGroup::initSchedGroup(std::vector::reverse_iterator RIter, + DenseSet &UsedInstrs) { + SUnit &InitSU = *RIter; + for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { + auto &SU = *RIter; + if (isFull()) + break; + + if (!UsedInstrs.count(&SU) && canAddSU(SU)) { + add(SU); + UsedInstrs.insert(&SU); + } + } + + add(InitSU); + assert(MaxSize.hasValue()); + (*MaxSize)++; + + link(InitSU); +} + +// Create a pipeline from the SchedGroups in PipelineOrderGroups such that we +// try to enforce the relative ordering of instructions in each group. +static void makePipeline(SmallVectorImpl &PipelineOrderGroups) { + auto I = PipelineOrderGroups.begin(); + auto E = PipelineOrderGroups.end(); + for (; I != E; ++I) { + auto &GroupA = *I; + for (auto J = std::next(I); J != E; ++J) { + auto &GroupB = *J; + GroupA.link(GroupB); + } + } +} + +// Same as makePipeline but with reverse ordering. +static void +makeReversePipeline(SmallVectorImpl &PipelineOrderGroups) { + auto I = PipelineOrderGroups.rbegin(); + auto E = PipelineOrderGroups.rend(); + for (; I != E; ++I) { + auto &GroupA = *I; + for (auto J = std::next(I); J != E; ++J) { + auto &GroupB = *J; + GroupA.link(GroupB); + } + } +} + void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -269,25 +395,31 @@ // present ordering, we will try to make each VMEMRead instruction // a predecessor of each DSRead instruction, and so on. 
SmallVector PipelineOrderGroups = { - SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG), - SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG), - SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG), - SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)}; - - for (SUnit &SU : DAG->SUnits) { - LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU)); - for (auto &SG : PipelineOrderGroups) - if (SG.canAddSU(SU, TII)) - SG.add(SU); - } + SchedGroup(SchedGroupMask::VMEM, VMEMGroupMaxSize, DAG, TII), + SchedGroup(SchedGroupMask::DS_READ, LDRGroupMaxSize, DAG, TII), + SchedGroup(SchedGroupMask::MFMA, MFMAGroupMaxSize, DAG, TII), + SchedGroup(SchedGroupMask::DS_WRITE, LDWGroupMaxSize, DAG, TII)}; - for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) { - auto &GroupA = PipelineOrderGroups[i]; - for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) { - auto &GroupB = PipelineOrderGroups[j]; - GroupA.link(GroupB); - } - } + for (auto &SG : PipelineOrderGroups) + SG.initSchedGroup(); + + makePipeline(PipelineOrderGroups); +} + +// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. +static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { + assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || + SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); + + while (!SU.Preds.empty()) + for (auto &P : SU.Preds) + SU.removePred(P); + + while (!SU.Succs.empty()) + for (auto &S : SU.Succs) + for (auto &SP : S.getSUnit()->Preds) + if (SP.getSUnit() == &SU) + S.getSUnit()->removePred(SP); } void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { @@ -300,9 +432,17 @@ const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); TII = ST.getInstrInfo(); DAG = static_cast(DAGInstrs); - for (auto &SU : DAG->SUnits) - if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) - addSchedBarrierEdges(SU); + for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { + if (R->getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) + addSchedBarrierEdges(*R); + + else if (R->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) + initSchedGroupBarrier(R); + } + + // SCHED_GROUP_BARRIER edges can only be added after we have found and + // initialized all of the SCHED_GROUP_BARRIER SchedGroups. + addSchedGroupBarrierEdges(); } void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { @@ -310,118 +450,80 @@ assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); // Remove all existing edges from the SCHED_BARRIER that were added due to the // instruction having side effects. - resetSchedBarrierEdges(SchedBarrier); - SmallVector SchedGroups; - int32_t Mask = MI.getOperand(0).getImm(); - getSchedGroupsFromMask(Mask, SchedGroups); - for (auto SG : SchedGroups) - SG->link( - SchedBarrier, (function_ref)[]( - const SUnit *A, const SUnit *B) { - return A->NodeNum > B->NodeNum; - }); + resetEdges(SchedBarrier, DAG); + auto InvertedMask = + invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); + SchedGroup SG(InvertedMask, None, DAG, TII); + SG.initSchedGroup(); + // Preserve original instruction ordering relative to the SCHED_BARRIER. + SG.link( + SchedBarrier, + (function_ref)[]( + const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); } -void SchedBarrierDAGMutation::getSchedGroupsFromMask( - int32_t Mask, SmallVectorImpl &SchedGroups) { - SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask; - // See IntrinsicsAMDGPU.td for an explanation of these masks and their - // mappings. 
- // - if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { - if (!VALUSchedGroup) { - VALUSchedGroup = std::make_unique(isVALUSGMember, None, DAG); - initSchedGroup(VALUSchedGroup.get()); - } - - SchedGroups.push_back(VALUSchedGroup.get()); - } - - if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { - if (!SALUSchedGroup) { - SALUSchedGroup = std::make_unique(isSALUSGMember, None, DAG); - initSchedGroup(SALUSchedGroup.get()); - } - - SchedGroups.push_back(SALUSchedGroup.get()); - } - - if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) { - if (!MFMASchedGroup) { - MFMASchedGroup = std::make_unique(isMFMASGMember, None, DAG); - initSchedGroup(MFMASchedGroup.get()); - } - - SchedGroups.push_back(MFMASchedGroup.get()); - } - - if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { - if (!VMEMReadSchedGroup) { - VMEMReadSchedGroup = - std::make_unique(isVMEMReadSGMember, None, DAG); - initSchedGroup(VMEMReadSchedGroup.get()); - } - - SchedGroups.push_back(VMEMReadSchedGroup.get()); - } - - if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) { - if (!VMEMWriteSchedGroup) { - VMEMWriteSchedGroup = - std::make_unique(isVMEMWriteSGMember, None, DAG); - initSchedGroup(VMEMWriteSchedGroup.get()); - } - - SchedGroups.push_back(VMEMWriteSchedGroup.get()); - } - - if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { - if (!DSReadSchedGroup) { - DSReadSchedGroup = - std::make_unique(isDSReadSGMember, None, DAG); - initSchedGroup(DSReadSchedGroup.get()); - } - - SchedGroups.push_back(DSReadSchedGroup.get()); - } - - if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE && - (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) { - if (!DSWriteSchedGroup) { - DSWriteSchedGroup = - std::make_unique(isDSWriteSGMember, None, DAG); - initSchedGroup(DSWriteSchedGroup.get()); - } - - SchedGroups.push_back(DSWriteSchedGroup.get()); - } +SchedGroupMask +SchedBarrierDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { + // Invert mask and erase bits for types of instructions that are implied to be + // allowed past the SCHED_BARRIER. + SchedGroupMask InvertedMask = ~Mask; + + // ALU implies VALU, SALU, MFMA. + if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) + InvertedMask &= + ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA; + // VALU, SALU, MFMA implies ALU. + else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || + (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || + (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE) + InvertedMask &= ~SchedGroupMask::ALU; + + // VMEM implies VMEM_READ, VMEM_WRITE. + if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) + InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; + // VMEM_READ, VMEM_WRITE implies VMEM. 
+  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
+           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
+    InvertedMask &= ~SchedGroupMask::VMEM;
+
+  // DS implies DS_READ, DS_WRITE.
+  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
+    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
+  // DS_READ, DS_WRITE implies DS.
+  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
+           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
+    InvertedMask &= ~SchedGroupMask::DS;
+
+  return InvertedMask;
 }
 
-void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
-  assert(SG);
-  for (auto &SU : DAG->SUnits)
-    if (SG->canAddSU(SU, TII))
-      SG->add(SU);
+void SchedBarrierDAGMutation::initSchedGroupBarrier(
+    std::vector<SUnit>::reverse_iterator RIter) {
+  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
+  // to the instruction having side effects.
+  resetEdges(*RIter, DAG);
+  MachineInstr &SGB = *RIter->getInstr();
+  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
+  int32_t SGMask = SGB.getOperand(0).getImm();
+  int32_t Size = SGB.getOperand(1).getImm();
+  int32_t SyncID = SGB.getOperand(2).getImm();
+  // Create a new SchedGroup and add it to a list that is mapped to the SyncID.
+  // SchedGroups only enforce ordering between SchedGroups with the same SyncID.
+  auto &SG = SyncedSchedGroupsMap[SyncID].emplace_back((SchedGroupMask)SGMask,
+                                                       Size, SyncID, DAG, TII);
+
+  // SyncedInstrsMap is used here to avoid adding the same SUs to multiple
+  // SchedGroups that have the same SyncID. This only matters for
+  // SCHED_GROUP_BARRIER and not SCHED_BARRIER.
+  SG.initSchedGroup(RIter, SyncedInstrsMap[SG.getSyncID()]);
 }
 
-void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
-  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
-  for (auto &P : SU.Preds)
-    SU.removePred(P);
-
-  for (auto &S : SU.Succs) {
-    for (auto &SP : S.getSUnit()->Preds) {
-      if (SP.getSUnit() == &SU) {
-        S.getSUnit()->removePred(SP);
-      }
-    }
-  }
+void SchedBarrierDAGMutation::addSchedGroupBarrierEdges() {
+  // Since we traversed the DAG in reverse order when initializing
+  // SCHED_GROUP_BARRIERs, we need to reverse the order in the vector to
+  // maintain user intentions and program order.
+ for (auto &SchedGroups : SyncedSchedGroupsMap) + makeReversePipeline(SchedGroups.second); } } // namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -217,6 +217,19 @@ return; } + if (MI->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) { + if (isVerbose()) { + std::string HexString; + raw_string_ostream HexStream(HexString); + HexStream << format_hex(MI->getOperand(0).getImm(), 10, true); + OutStreamer->emitRawComment( + " sched_group_barrier mask(" + HexString + ") size(" + + Twine(MI->getOperand(1).getImm()) + ") SyncID(" + + Twine(MI->getOperand(2).getImm()) + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { if (isVerbose()) OutStreamer->emitRawComment(" divergent unreachable"); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1777,6 +1777,7 @@ case AMDGPU::SI_MASKED_UNREACHABLE: case AMDGPU::WAVE_BARRIER: case AMDGPU::SCHED_BARRIER: + case AMDGPU::SCHED_GROUP_BARRIER: return 0; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -327,6 +327,20 @@ let Size = 0; } +def SCHED_GROUP_BARRIER : SPseudoInstSI< + (outs), + (ins i32imm:$mask, i32imm:$size, i32imm:$syncid), + [(int_amdgcn_sched_group_barrier (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid))]> { + let SchedRW = []; + let hasNoSchedulingInfo = 1; + let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let isConvergent = 1; + let FixedSize = 1; + let Size = 0; +} + // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -149,6 +149,7 @@ case Intrinsic::amdgcn_s_barrier: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: + case Intrinsic::amdgcn_sched_group_barrier: return false; default: break; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @test_sched_group_barrier() #0 { +; GCN-LABEL: test_sched_group_barrier: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: ; sched_group_barrier mask(0x00000000) size(1) SyncID(2) +; GCN-NEXT: ; sched_group_barrier mask(0x00000001) size(2) SyncID(4) +; GCN-NEXT: ; sched_group_barrier mask(0x00000004) size(8) SyncID(16) +; GCN-NEXT: ; sched_group_barrier mask(0x0000000F) size(10000) SyncID(-1) +; GCN-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 1, i32 2) #1 + call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 2, i32 4) #1 + call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 8, i32 16) #1 + call void @llvm.amdgcn.sched.group.barrier(i32 15, i32 10000, i32 -1) #1 + ret void +} + +declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir @@ -0,0 +1,173 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -misched-cluster=false -amdgpu-disable-power-sched=true -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_group_barrier(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + define amdgpu_kernel void @sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) { ret void } + + !0 = distinct !{!0} + !1 = !{!1, !0} +... 
+ +--- +name: no_sched_group_barrier +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: no_sched_group_barrier + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:areg_128 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + S_NOP 0 + %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec + %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec + %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit 
$exec + %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + S_ENDPGM 0, implicit %5, implicit %6, implicit %11 +... + +--- +name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_group_barrier_1_VMEM_READ_1_VALU_5_MFMA_1_VMEM_READ_3_VALU_2_VMEM_WRITE + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 1, 0 + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 5, 0 + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 32, 1, 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 3, 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_2]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: SCHED_GROUP_BARRIER 64, 2, 0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_3]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] + %0:sreg_64 = IMPLICIT_DEF + 
%1:vgpr_32 = IMPLICIT_DEF + %2:areg_128 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + S_NOP 0 + %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec + %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec + %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec + %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; 1 VMEM_READ + SCHED_GROUP_BARRIER 32, 1, 0 + ; 1 VALU + SCHED_GROUP_BARRIER 2, 1, 0 + ; 5 MFMA + SCHED_GROUP_BARRIER 8, 5, 0 + ; 1 VMEM_READ + SCHED_GROUP_BARRIER 32, 1, 0 + ; 3 VALU + SCHED_GROUP_BARRIER 2, 3, 0 + ; 2 VMEM_WRITE + SCHED_GROUP_BARRIER 64, 2, 0 + S_ENDPGM 0, implicit %5, implicit %6, implicit %11 +... + +--- +name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sched_group_barrier_2_VMEM_1000_ALU_5_MFMA_2_VMEM_WRITE + ; CHECK: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF + ; CHECK-NEXT: SCHED_GROUP_BARRIER 16, 2, 0 + ; CHECK-NEXT: S_NOP 0 + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 1, 1000, 0 + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, 
implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 8, 5, 0 + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; CHECK-NEXT: SCHED_GROUP_BARRIER 64, 2, 0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]] + %0:sreg_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:areg_128 = IMPLICIT_DEF + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %4:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %3, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + %5:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + %6:vgpr_32 = nsw V_MUL_LO_U32_e64 %3, %1, implicit $exec + S_NOP 0 + %7:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %2, 0, 0, 0, implicit $mode, implicit $exec + %8:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %7, 0, 0, 0, implicit $mode, implicit $exec + %9:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %8, 0, 0, 0, implicit $mode, implicit $exec + %10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %9, 0, 0, 0, implicit $mode, implicit $exec + %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %1, %3, %10, 0, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1) + %13:vgpr_32 = nsw V_MUL_LO_U32_e64 %12, %12, implicit $exec + GLOBAL_STORE_DWORD_SADDR %1, %13, %0, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1) + ; 2 VMEM + SCHED_GROUP_BARRIER 16, 2, 0 + ; 10 ALU + SCHED_GROUP_BARRIER 1, 1000, 0 + ; 5 MFMA + SCHED_GROUP_BARRIER 8, 5, 0 + ; 2 VMEM_WRITE + SCHED_GROUP_BARRIER 64, 2, 0 + S_ENDPGM 0, implicit %5, implicit %6, implicit %11 +...
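
Usage note (not part of the patch): a minimal OpenCL sketch of how the new builtin could express a load/compute/store grouping from source code, complementing the MIR tests above. The kernel name and body are illustrative assumptions; only the builtin itself and the mask encoding (0x20 = VMEM read, 0x2 = VALU, 0x40 = VMEM write, per the SchedGroupMask enum added in AMDGPUIGroupLP.cpp) come from the patch, and whether the scheduler can honor each group depends on which instructions actually end up in the scheduling region.

// Illustrative sketch only -- not part of this patch.
kernel void sched_group_barrier_example(global int *out, global const int *in) {
  int id = get_global_id(0);
  int v = in[id];      // candidate for the VMEM read group
  v = v * v + id;      // candidates for the VALU group
  out[id] = v;         // candidate for the VMEM write group

  // Ask the scheduler to keep one VMEM read, then up to two VALU ops, then
  // one VMEM write together; all three groups share SyncID 0.
  __builtin_amdgcn_sched_group_barrier(0x20, 1, 0);
  __builtin_amdgcn_sched_group_barrier(0x2, 2, 0);
  __builtin_amdgcn_sched_group_barrier(0x40, 1, 0);
}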