diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -49,17 +49,49 @@
 static cl::opt<uint64_t> MaxBranchesExplored(
     "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
-    cl::desc("The amount of branches that we are willing to explore with"
+    cl::desc("The number of branches that we are willing to explore with "
              "the exact algorithm before giving up."));
 
-static cl::opt<bool> UseCostHeur(
-    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
-    cl::desc("Whether to use the cost heuristic to make choices as we "
-             "traverse the search space using the exact solver. Defaulted "
-             "to on, and if turned off, we will use the node order -- "
-             "attempting to put the later nodes in the later sched groups. "
-             "Experimentally, results are mixed, so this should be set on a "
-             "case-by-case basis."));
+// The heuristic we should use when prioritizing SchedGroups for a given
+// pipeline instruction.
+enum class SGPriority { Order, Cost, Dep };
+
+static SGPriority SGPriorityHeur;
+
+struct SGPriorityHeurParser : public cl::parser<std::string> {
+  SGPriorityHeurParser(cl::Option &O) : cl::parser<std::string>(O) {}
+
+  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
+             std::string &Value) {
+    Value = Arg.str();
+    std::transform(Value.begin(), Value.end(), Value.begin(), ::tolower);
+
+    if (Value == "order") {
+      SGPriorityHeur = SGPriority::Order;
+      return false;
+    }
+
+    if (Value == "cost") {
+      SGPriorityHeur = SGPriority::Cost;
+      return false;
+    }
+
+    if (Value == "dep") {
+      SGPriorityHeur = SGPriority::Dep;
+      return false;
+    }
+
+    return O.error("'" + Arg + "' invalid. Valid options: {order, cost, dep}.");
+  }
+};
+
+static cl::opt<std::string, false, SGPriorityHeurParser> SGHeur(
+    "amdgpu-igrouplp-exact-solver-sg-priority-heur", cl::init("order"),
+    cl::Hidden,
+    cl::desc("The heuristic we should use when prioritizing the sched group "
+             "candidates for a given pipeline instruction. The valid options "
+             "are: order (first available), cost (fewest missed edges), or dep "
+             "(first available after all assigned predecessors)."));
 
 // Components of the mask that determines which instruction types may be
 // classified into a SchedGroup.
@@ -99,7 +131,9 @@
   // SyncID.
   int SyncID = 0;
 
-  // SGID is used to map instructions to candidate SchedGroups
+  // SGID is used to map instructions to candidate SchedGroups. SGID also
+  // represents the bottom-up order of SchedGroups in a given synchronized
+  // pipeline -- the minimum SGID is the last SchedGroup in that pipeline.
   unsigned SGID;
 
   // Count of the number of created SchedGroups, used to initialize SGID.
@@ -208,6 +242,9 @@
 typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
 typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
+typedef DenseMap<SUnit *, SmallVector<SUnit *, 4>> SUnitToSUnitMap;
+typedef DenseMap<SUnit *, SmallVector<std::pair<SUnit *, int>, 4>>
+    SUnitToAssignedSUnitMap;
 
 // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
 // in non-trivial cases.
 // For example, if the requested pipeline is
 // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
@@ -229,6 +266,13 @@
   // The pipeline that has the best solution found so far
   SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
 
+  // Map of SU -> recursive pipeline Preds, for each sync stage
+  SmallVector<SUnitToSUnitMap, 4> PipelinePreds;
+  // Map of SU -> recursive pipeline Succs, for each sync stage
+  SmallVector<SUnitToSUnitMap, 4> PipelineSuccs;
+  // The SchedGroup assignments of the recursive Preds of a given SU
+  SmallVector<SUnitToAssignedSUnitMap, 4> PipelineAssignments;
+
   // Whether or not we actually have any SyncedInstrs to try to solve.
   bool NeedsSolver = false;
@@ -236,6 +280,8 @@
   // the product of each conflictedInst.Matches.size() across all SyncPipelines
   unsigned computeProblemSize();
 
+  void mapDependencies();
+
   // The cost penalty of not assigning a SU to a SchedGroup
   int MissPenalty = 0;
@@ -255,10 +301,10 @@
   uint64_t BranchesExplored = 0;
 
   // Update indices to fit next conflicting instruction
-  void advancePosition();
+  void advancePosition(SUnit *SU = nullptr, int SGID = -1);
   // Recede indices to attempt to find better fit for previous conflicting
   // instruction
-  void retreatPosition();
+  void retreatPosition(SUnit *SU);
 
   // The exponential time algorithm which finds the provably best fit
   bool solveExact();
@@ -436,7 +482,14 @@
   }
 }
 
-void PipelineSolver::advancePosition() {
+void PipelineSolver::advancePosition(SUnit *SU, int CandSGID) {
+  if (SU) {
+    // Record, for every pipeline successor of SU, that this predecessor has
+    // just been assigned to CandSGID. Successors consult these assignments
+    // when prioritizing their own candidate SchedGroups (top-down processing).
+    for (auto &SuccSU : PipelineSuccs[CurrSyncGroupIdx][SU])
+      PipelineAssignments[CurrSyncGroupIdx][SuccSU].push_back(
+          std::make_pair(SU, CandSGID));
+  }
+
   ++CurrConflInstNo;
 
   if (static_cast<size_t>(CurrConflInstNo) >=
@@ -450,7 +503,18 @@
   }
 }
 
-void PipelineSolver::retreatPosition() {
+void PipelineSolver::retreatPosition(SUnit *SU) {
+  int TempSyncGroupIdx =
+      CurrConflInstNo == 0 ? CurrSyncGroupIdx - 1 : CurrSyncGroupIdx;
+  if (SU && TempSyncGroupIdx >= 0) {
+    for (auto &SuccSU : PipelineSuccs[TempSyncGroupIdx][SU]) {
+      auto &Assignments = PipelineAssignments[TempSyncGroupIdx][SuccSU];
+      // The assignment we are trying to remove is always the most recently
+      // pushed one.
+      Assignments.pop_back();
+    }
+  }
+
   assert(CurrConflInstNo >= 0);
   assert(CurrSyncGroupIdx >= 0);
@@ -495,18 +559,38 @@
     SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
     SmallVectorImpl<SchedGroup> &SyncPipeline) {
   assert(CurrSU.second.size() >= 1);
-  auto I = CurrSU.second.rbegin();
-  auto E = CurrSU.second.rend();
-  for (; I != E; ++I) {
-    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
-    int CandSGID = *I;
-    SchedGroup *Match;
-    for (auto &SG : SyncPipeline) {
-      if (SG.getSGID() == CandSGID)
-        Match = &SG;
-    }
-    if (UseCostHeur) {
+  switch (SGPriorityHeur) {
+  case SGPriority::Order: {
+    // Prioritize SchedGroups based on their top-down order in the DAG.
+    // The assumption is that earlier instructions will populate earlier
+    // SchedGroups, and the pipeline assignments will follow the natural
+    // order of the DAG.
+    auto I = CurrSU.second.rbegin();
+    auto E = CurrSU.second.rend();
+    for (; I != E; ++I)
+      ReadyList.push_back(std::make_pair(*I, -1));
+    break;
+  }
+  case SGPriority::Cost: {
+    // Prioritize SchedGroups based on the fewest number of missed edges.
+    // This is an accurate, but computationally expensive, heuristic. The major
+    // downside to this heuristic is that it can't make good decisions at the
+    // top of the search tree due to lack of information.
+    // Moreover, these decisions are the most significant, as we are least
+    // likely to undo them with a standard branch-and-bound algorithm.
+    auto I = CurrSU.second.rbegin();
+    auto E = CurrSU.second.rend();
+
+    for (; I != E; ++I) {
+      std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+      int CandSGID = *I;
+      SchedGroup *Match;
+      for (auto &SG : SyncPipeline) {
+        if (SG.getSGID() == CandSGID)
+          Match = &SG;
+      }
+      if (Match->isFull()) {
        ReadyList.push_back(std::make_pair(*I, MissPenalty));
        continue;
@@ -515,17 +599,92 @@
      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::make_pair(*I, TempCost));
      removeEdges(AddedEdges);
-    } else
-      ReadyList.push_back(std::make_pair(*I, -1));
-  }
 
-  if (UseCostHeur) {
     std::sort(ReadyList.begin(), ReadyList.end(),
               [](std::pair<int, int> A, std::pair<int, int> B) {
                 return A.second < B.second;
               });
+    break;
   }
+  case SGPriority::Dep: {
+    // Prioritize SchedGroups based on the earliest available one that does
+    // not violate dependencies. Assuming that we process instructions in a
+    // top-down manner, we track the predecessors of each instruction, along
+    // with the SchedGroups those predecessors have been assigned to. The
+    // highest priority then goes to the SchedGroups that satisfy all
+    // dependencies, in order from earliest to latest. The next highest
+    // priority goes to those that satisfy all but one dependency, again from
+    // earliest to latest, and so on.
+    SmallVector<int, 4> DepGroups;
+    // Sort the assigned SGs of the predecessors in bottom-up order.
+    for (auto &Assignment :
+         PipelineAssignments[CurrSyncGroupIdx][CurrSU.first]) {
+      if (Assignment.second == -1)
+        continue;
+      if (DepGroups.size() == 0) {
+        DepGroups.push_back(Assignment.second);
+        continue;
+      }
+
+      auto I = DepGroups.begin();
+      while (I != DepGroups.end() && Assignment.second > *I)
+        ++I;
+
+      DepGroups.insert(I, Assignment.second);
+    }
+    // Sort the candidate SGs in top-down order. The candidates should already
+    // be in bottom-up order due to the way they are collected in
+    // InitSchedGroupBarrier; however, it is preferred not to create coupling
+    // based on that assumption, and insertion sort is O(n) if the vector is
+    // already in reverse order.
+    SmallVector<int, 4> CandSGs;
+    for (auto &CandSG : CurrSU.second) {
+      if (CandSGs.size() == 0) {
+        CandSGs.push_back(CandSG);
+        continue;
+      }
+      auto J = CandSGs.begin();
+      while (J != CandSGs.end() && CandSG < *J)
+        ++J;
+
+      CandSGs.insert(J, CandSG);
+    }
+
+    auto InsertPair = [&ReadyList](int CandSGID) {
+      ReadyList.insert(ReadyList.end(), std::make_pair(CandSGID, -1));
+    };
+
+    // If the instruction has no assigned predecessors, then just insert the
+    // candidate SGs in top-down order.
+    if (!DepGroups.size()) {
+      std::for_each(CandSGs.begin(), CandSGs.end(), InsertPair);
+      return;
+    }
+
+    // Walk the predecessor SGs from latest to earliest. For each, add the
+    // candidate SGs that do not precede it, in top-down order.
+    for (auto &Dep : DepGroups) {
+      if (!CandSGs.size())
+        break;
+      auto I = CandSGs.begin();
+      // A larger SGID implies earlier in the pipeline.
+      while (I != CandSGs.end() && *I > Dep)
+        ++I;
+
+      if (I == CandSGs.end())
+        continue;
+      std::for_each(I, CandSGs.end(), InsertPair);
+      CandSGs.erase(I, CandSGs.end());
+    }
+
+    // Add any remaining candidate SGs after processing the predecessor SGs.
+    // These correspond to SGs that violate all dependencies.
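+    // For example (illustrative SGID values only): with candidate SGIDs
+    // {7, 5, 3, 1} and predecessors already assigned to SGIDs 4 and 6, the
+    // loop above emits {3, 1} (which follow both predecessor groups) and then
+    // {5} (which only follows the predecessor in group 6), while this final
+    // step emits {7}, which precedes both predecessor groups.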
+    std::for_each(CandSGs.begin(), CandSGs.end(), InsertPair);
+    break;
+  }
+  }
   assert(ReadyList.size() == CurrSU.second.size());
 }
@@ -577,7 +736,7 @@
     AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
     LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
     CurrCost += AddedCost;
-    advancePosition();
+    advancePosition(CurrSU.first, CandSGID);
     ++BranchesExplored;
     bool FinishedExploring = false;
     // If the Cost after adding edges is greater than a known solution,
@@ -590,7 +749,7 @@
       }
     }
 
-    retreatPosition();
+    retreatPosition(CurrSU.first);
    CurrCost -= AddedCost;
    removeEdges(AddedEdges);
    Match->pop();
@@ -603,7 +762,7 @@
   // Potentially if we omit a problematic instruction from the pipeline,
   // all the other instructions can nicely fit.
   CurrCost += MissPenalty;
-  advancePosition();
+  advancePosition(CurrSU.first, -1);
 
   LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
@@ -616,7 +775,7 @@
     }
   }
 
-  retreatPosition();
+  retreatPosition(CurrSU.first);
   CurrCost -= MissPenalty;
   return FinishedExploring;
 }
@@ -695,6 +854,37 @@
   return ProblemSize;
 }
 
+// For each synchronized stage, record which pipeline instructions are
+// reachable from one another in the DAG, so the solver can reason about
+// dependencies between conflicting instructions.
+void PipelineSolver::mapDependencies() {
+  PipelineSuccs.resize(PipelineInstrs.size());
+  PipelinePreds.resize(PipelineInstrs.size());
+  PipelineAssignments.resize(PipelineInstrs.size());
+
+  int SyncStage = 0;
+  for (auto &SyncPipe : PipelineInstrs) {
+    auto I = SyncPipe.begin();
+    auto E = SyncPipe.end();
+    for (; I != E; I++) {
+      auto PipeSUA = *I;
+      for (auto J = std::next(I); J != E; J++) {
+        auto PipeSUB = *J;
+        SUnit *PotentialPred = PipeSUA.first;
+        SUnit *PotentialSucc = PipeSUB.first;
+        // If SUnitB occurs before SUnitA in the DAG, then SUnitB is the
+        // potential predecessor and SUnitA the potential successor.
+        if (PipeSUB.first->NodeNum < PipeSUA.first->NodeNum) {
+          PotentialPred = PipeSUB.first;
+          PotentialSucc = PipeSUA.first;
+        }
+        if (DAG->IsReachable(PotentialSucc, PotentialPred)) {
+          PipelinePreds[SyncStage][PotentialSucc].push_back(PotentialPred);
+          PipelineSuccs[SyncStage][PotentialPred].push_back(PotentialSucc);
+        }
+      }
+    }
+    ++SyncStage;
+  }
+}
+
 void PipelineSolver::solve() {
   if (!NeedsSolver)
     return;
@@ -705,6 +895,8 @@
   bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
   MissPenalty = (ProblemSize / 2) + 1;
 
+  mapDependencies();
+
   LLVM_DEBUG(DAG->dump());
   if (EnableExactSolver || BelowCutoff) {
     LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-sgheuristic.ll b/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-sgheuristic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-sgheuristic.ll
@@ -0,0 +1,336 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=100000 -amdgpu-igrouplp-exact-solver-sg-priority-heur=dep < %s | FileCheck -check-prefix=DEPHEUR %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=100000 -amdgpu-igrouplp-exact-solver-sg-priority-heur=cost < %s | FileCheck -check-prefix=COSTHEUR %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=100000 -amdgpu-igrouplp-exact-solver-sg-priority-heur=order < %s | FileCheck -check-prefix=ORDERHEUR %s
+
+
+
+define amdgpu_kernel
void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 { +; DEPHEUR-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: +; DEPHEUR: ; %bb.0: +; DEPHEUR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; DEPHEUR-NEXT: v_lshlrev_b32_e32 v16, 7, v0 +; DEPHEUR-NEXT: s_waitcnt lgkmcnt(0) +; DEPHEUR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v13, v13, v13 +; DEPHEUR-NEXT: v_mul_lo_u32 v12, v12, v12 +; DEPHEUR-NEXT: v_mul_lo_u32 v15, v15, v15 +; DEPHEUR-NEXT: v_mul_lo_u32 v14, v14, v14 +; DEPHEUR-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; DEPHEUR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; DEPHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; DEPHEUR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v9, v9, v9 +; DEPHEUR-NEXT: v_mul_lo_u32 v8, v8, v8 +; DEPHEUR-NEXT: v_mul_lo_u32 v11, v11, v11 +; DEPHEUR-NEXT: v_mul_lo_u32 v10, v10, v10 +; DEPHEUR-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; DEPHEUR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:112 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v7, v7, v7 +; DEPHEUR-NEXT: v_mul_lo_u32 v6, v6, v6 +; DEPHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; DEPHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; DEPHEUR-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; DEPHEUR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; DEPHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; DEPHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; DEPHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; DEPHEUR-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80 +; DEPHEUR-NEXT: v_mul_lo_u32 v5, v5, v5 +; DEPHEUR-NEXT: v_mul_lo_u32 v4, v4, v4 +; DEPHEUR-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:112 +; DEPHEUR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:96 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v7, v7, v7 +; DEPHEUR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:64 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; DEPHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; DEPHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; DEPHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; DEPHEUR-NEXT: global_store_dwordx4 
v16, v[0:3], s[2:3] offset:64 +; DEPHEUR-NEXT: v_mul_lo_u32 v6, v6, v6 +; DEPHEUR-NEXT: v_mul_lo_u32 v5, v5, v5 +; DEPHEUR-NEXT: v_mul_lo_u32 v4, v4, v4 +; DEPHEUR-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:96 +; DEPHEUR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: s_waitcnt vmcnt(0) +; DEPHEUR-NEXT: v_mul_lo_u32 v5, v5, v5 +; DEPHEUR-NEXT: v_mul_lo_u32 v4, v4, v4 +; DEPHEUR-NEXT: v_mul_lo_u32 v7, v7, v7 +; DEPHEUR-NEXT: v_mul_lo_u32 v6, v6, v6 +; DEPHEUR-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; DEPHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; DEPHEUR-NEXT: s_endpgm +; +; COSTHEUR-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: +; COSTHEUR: ; %bb.0: +; COSTHEUR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; COSTHEUR-NEXT: v_lshlrev_b32_e32 v12, 7, v0 +; COSTHEUR-NEXT: s_waitcnt lgkmcnt(0) +; COSTHEUR-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:64 +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v11, v11, v11 +; COSTHEUR-NEXT: v_mul_lo_u32 v10, v10, v10 +; COSTHEUR-NEXT: v_mul_lo_u32 v9, v9, v9 +; COSTHEUR-NEXT: v_mul_lo_u32 v8, v8, v8 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:64 +; COSTHEUR-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; COSTHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; COSTHEUR-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:32 +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v9, v9, v9 +; COSTHEUR-NEXT: v_mul_lo_u32 v8, v8, v8 +; COSTHEUR-NEXT: v_mul_lo_u32 v11, v11, v11 +; COSTHEUR-NEXT: v_mul_lo_u32 v10, v10, v10 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:32 +; COSTHEUR-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:112 +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v7, v7, v7 +; COSTHEUR-NEXT: v_mul_lo_u32 v6, v6, v6 +; COSTHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; COSTHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] +; COSTHEUR-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:96 +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; COSTHEUR-NEXT: 
v_mul_lo_u32 v2, v2, v2 +; COSTHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; COSTHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:96 +; COSTHEUR-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:80 +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; COSTHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; COSTHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; COSTHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:80 +; COSTHEUR-NEXT: v_mul_lo_u32 v5, v5, v5 +; COSTHEUR-NEXT: v_mul_lo_u32 v4, v4, v4 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:112 +; COSTHEUR-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:48 +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v5, v5, v5 +; COSTHEUR-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:16 +; COSTHEUR-NEXT: v_mul_lo_u32 v4, v4, v4 +; COSTHEUR-NEXT: s_waitcnt vmcnt(0) +; COSTHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; COSTHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; COSTHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; COSTHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:16 +; COSTHEUR-NEXT: v_mul_lo_u32 v7, v7, v7 +; COSTHEUR-NEXT: v_mul_lo_u32 v6, v6, v6 +; COSTHEUR-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:48 +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; COSTHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; COSTHEUR-NEXT: s_endpgm +; +; ORDERHEUR-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: +; ORDERHEUR: ; %bb.0: +; ORDERHEUR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; ORDERHEUR-NEXT: v_lshlrev_b32_e32 v16, 7, v0 +; ORDERHEUR-NEXT: ; kill: killed $sgpr0_sgpr1 +; ORDERHEUR-NEXT: s_waitcnt lgkmcnt(0) +; ORDERHEUR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: s_waitcnt vmcnt(0) +; ORDERHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; ORDERHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; ORDERHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; ORDERHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; ORDERHEUR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier 
mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: s_waitcnt vmcnt(0) +; ORDERHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; ORDERHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; ORDERHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; ORDERHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 +; ORDERHEUR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: s_waitcnt vmcnt(0) +; ORDERHEUR-NEXT: v_mul_lo_u32 v3, v3, v3 +; ORDERHEUR-NEXT: v_mul_lo_u32 v2, v2, v2 +; ORDERHEUR-NEXT: v_mul_lo_u32 v1, v1, v1 +; ORDERHEUR-NEXT: v_mul_lo_u32 v0, v0, v0 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; ORDERHEUR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: s_waitcnt vmcnt(0) +; ORDERHEUR-NEXT: v_mul_lo_u32 v5, v5, v5 +; ORDERHEUR-NEXT: v_mul_lo_u32 v4, v4, v4 +; ORDERHEUR-NEXT: v_mul_lo_u32 v7, v7, v7 +; ORDERHEUR-NEXT: v_mul_lo_u32 v6, v6, v6 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 +; ORDERHEUR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: s_nop 0 +; ORDERHEUR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; ORDERHEUR-NEXT: s_waitcnt vmcnt(0) +; ORDERHEUR-NEXT: v_mul_lo_u32 v7, v7, v7 +; ORDERHEUR-NEXT: v_mul_lo_u32 v6, v6, v6 +; ORDERHEUR-NEXT: v_mul_lo_u32 v5, v5, v5 +; ORDERHEUR-NEXT: v_mul_lo_u32 v4, v4, v4 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 +; ORDERHEUR-NEXT: v_mul_lo_u32 v9, v9, v9 +; ORDERHEUR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: s_waitcnt vmcnt(0) +; ORDERHEUR-NEXT: v_mul_lo_u32 v13, v13, v13 +; ORDERHEUR-NEXT: v_mul_lo_u32 v12, v12, v12 +; ORDERHEUR-NEXT: v_mul_lo_u32 v15, v15, v15 +; ORDERHEUR-NEXT: v_mul_lo_u32 v14, v14, v14 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 +; ORDERHEUR-NEXT: v_mul_lo_u32 v8, v8, v8 +; ORDERHEUR-NEXT: v_mul_lo_u32 v11, v11, v11 +; ORDERHEUR-NEXT: v_mul_lo_u32 v10, v10, v10 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 +; ORDERHEUR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; ORDERHEUR-NEXT: s_waitcnt vmcnt(0) +; 
ORDERHEUR-NEXT: v_mul_lo_u32 v11, v11, v11 +; ORDERHEUR-NEXT: v_mul_lo_u32 v10, v10, v10 +; ORDERHEUR-NEXT: v_mul_lo_u32 v9, v9, v9 +; ORDERHEUR-NEXT: v_mul_lo_u32 v8, v8, v8 +; ORDERHEUR-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; ORDERHEUR-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; ORDERHEUR-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 + %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid + %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1 + %mul = mul <32 x i32> %load, %load + %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid + store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2 + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #2 +declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1 +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1 + +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" } +attributes #1 = { nounwind } +attributes #2 = { nounwind readnone speculatable } + +
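
Reviewer note (not part of the patch): the following is a minimal standalone C++ sketch of the ordering produced by the new "dep" SchedGroup priority heuristic, mirroring the SGPriority::Dep logic in populateReadyList with standard containers so it can be compiled and experimented with in isolation. The function name depPriorityOrder and the sample SGID values are illustrative only; it assumes, as the patch's comments state, that a larger SGID denotes an earlier pipeline stage.

// Illustrative sketch of the "dep" heuristic's priority order; not LLVM code.
#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

// Returns the candidate SGIDs in the order the "dep" heuristic would try them.
// CandSGs: SchedGroup candidates for the current instruction.
// DepGroups: SGIDs of groups that already hold a predecessor of it.
static std::vector<int> depPriorityOrder(std::vector<int> CandSGs,
                                         std::vector<int> DepGroups) {
  std::vector<int> ReadyList;
  // Predecessor groups in bottom-up order (ascending SGID, latest stage first).
  std::sort(DepGroups.begin(), DepGroups.end());
  // Candidates in top-down order (descending SGID, earliest stage first).
  std::sort(CandSGs.begin(), CandSGs.end(), std::greater<int>());

  if (DepGroups.empty())
    return CandSGs; // No assigned predecessors: plain top-down order.

  for (int Dep : DepGroups) {
    if (CandSGs.empty())
      break;
    // Skip candidates that would precede the predecessor group Dep.
    auto I = std::find_if(CandSGs.begin(), CandSGs.end(),
                          [Dep](int SGID) { return SGID <= Dep; });
    // Emit the candidates that do not precede Dep, earliest stage first.
    ReadyList.insert(ReadyList.end(), I, CandSGs.end());
    CandSGs.erase(I, CandSGs.end());
  }
  // Whatever remains precedes every predecessor group.
  ReadyList.insert(ReadyList.end(), CandSGs.begin(), CandSGs.end());
  return ReadyList;
}

int main() {
  // Candidates {7, 5, 3, 1}; predecessors already placed in groups 4 and 6.
  for (int SGID : depPriorityOrder({7, 5, 3, 1}, {4, 6}))
    std::cout << SGID << ' '; // Prints: 3 1 5 7
  std::cout << '\n';
}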