diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -61,6 +61,15 @@
         "Experimentally, results are mixed, so this should be set on a "
         "case-by-case basis."));
 
+static cl::opt<bool> EnableLowerBound(
+    "amdgpu-igrouplp-exact-solver-lower-bound", cl::Hidden,
+    cl::desc("Whether to use a lower bound when calculating the cost "
+             "for a partial fit using the exact solver. The lower bound "
+             "calculates the cost of assigning the remaining instructions "
+             "under idealized conditions. The LB reduces the overall search "
+             "space but adds time complexity per branch explored."),
+    cl::init(false));
+
 // Components of the mask that determines which instruction types may be may be
 // classified into a SchedGroup.
 enum class SchedGroupMask {
@@ -109,7 +118,11 @@
 
   const SIInstrInfo *TII;
 
-  // Try to add and edge from SU A to SU B.
+  // Try to add an edge from SU A to SU B. This returns false if there is a
+  // dependency which makes adding the A->B edge impossible; otherwise it
+  // returns true. Note that it may return true even when no new edge is
+  // added. For example, if an edge from A to B already exists, this returns
+  // true even though DAG->addEdge does not add another edge.
   bool tryAddEdge(SUnit *A, SUnit *B);
 
   // Use SGMask to determine whether we can classify MI as a member of this
@@ -131,7 +144,7 @@
   // Add DAG dependencies and track which edges are added, and the count of
   // missed edges
   int link(SUnit &SU, bool MakePred,
-           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
+           SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
 
   // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
   // Use the predicate to determine whether SU should be a predecessor (P =
@@ -243,6 +256,9 @@
   int BestCost = -1;
   int CurrCost = 0;
 
+  // A lower bound on the optimal cost for a complete pipeline.
+  int StaticLowerBound = 0;
+
   // Index pointing to the conflicting instruction that is currently being
   // fitted
   int CurrConflInstNo = 0;
@@ -270,14 +286,19 @@
   void populateReadyList(SUToCandSGsPair &CurrSU,
                          SmallVectorImpl<std::pair<int, int>> &ReadyList,
                          SmallVectorImpl<SchedGroup> &SyncPipeline);
+  // Calculate the best-cost assignment of an unassigned SU without assigning
+  // it. The sum of these costs across SUs is a lower bound on the true best
+  // cost for the set of unassigned SUs.
+  int calculateLowerBound();
   // Add edges corresponding to the SchedGroups as assigned by solver
   void makePipeline();
   // Add the edges from the SU to the other SchedGroups in pipeline, and
   // return the number of edges missed.
   int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
-               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
+               SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges,
+               int BestCost = -1);
   // Remove the edges passed via AddedEdges
-  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
+  void removeEdges(SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges);
   // Convert the passed in maps to arrays for bidirectional iterators
   void convertSyncMapsToArrays();
@@ -395,7 +416,7 @@
 
 int PipelineSolver::addEdges(
     SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
-    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+    SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges, int BestCost) {
   int AddedCost = 0;
   bool MakePred = false;
 
@@ -406,6 +427,8 @@
   // linked as a predecessor of the subsequent SchedGroups
   auto GroupNo = (int)SyncPipeline.size() - 1;
   for (; GroupNo >= 0; GroupNo--) {
+    if (BestCost != -1 && AddedCost >= BestCost)
+      return AddedCost;
     if (SyncPipeline[GroupNo].getSGID() == SGID) {
       MakePred = true;
       continue;
     }
@@ -419,15 +442,18 @@
 }
 
 void PipelineSolver::removeEdges(
-    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
+    SmallVectorImpl<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
   // Only remove the edges that we have added when testing
   // the fit.
   for (auto &PredSuccPair : EdgesToRemove) {
     SUnit *Pred = PredSuccPair.first;
     SUnit *Succ = PredSuccPair.second;
 
-    auto Match = llvm::find_if(
-        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
+    auto Match =
+        std::find_if(Succ->Preds.begin(), Succ->Preds.end(), [&Pred](SDep &P) {
+          return P.getSUnit() == Pred && P.isArtificial();
+        });
+
     if (Match != Succ->Preds.end()) {
       assert(Match->isArtificial());
       Succ->removePred(*Match);
@@ -478,7 +504,7 @@
   if (BestCost == -1 || CurrCost < BestCost) {
     BestPipeline = CurrPipeline;
     BestCost = CurrCost;
-    LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
+    LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << '\n');
   }
   assert(BestCost >= 0);
 }
@@ -487,7 +513,7 @@
   if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
     DoneExploring = true;
 
-  return (DoneExploring || BestCost == 0);
+  return (DoneExploring || BestCost == StaticLowerBound);
 }
 
 void PipelineSolver::populateReadyList(
@@ -496,8 +522,9 @@
   assert(CurrSU.second.size() >= 1);
   auto I = CurrSU.second.rbegin();
   auto E = CurrSU.second.rend();
+  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
   for (; I != E; ++I) {
-    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+
     int CandSGID = *I;
     SchedGroup *Match;
     for (auto &SG : SyncPipeline) {
@@ -510,6 +537,7 @@
       ReadyList.push_back(std::pair(*I, MissPenalty));
       continue;
     }
+    AddedEdges.clear();
     int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
     ReadyList.push_back(std::pair(*I, TempCost));
@@ -528,6 +556,52 @@
   assert(ReadyList.size() == CurrSU.second.size());
 }
 
+int PipelineSolver::calculateLowerBound() {
+  if (CurrSyncGroupIdx >= (int)CurrPipeline.size())
+    return 0;
+  int TempConflInstNo = CurrConflInstNo;
+  int TmpSyncGroupIdx = CurrSyncGroupIdx;
+  int MinimumCost = 0;
+  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
+
+  for (; TmpSyncGroupIdx < (int)CurrPipeline.size(); TmpSyncGroupIdx++) {
+    auto SyncPipeline = CurrPipeline[TmpSyncGroupIdx];
+    for (; TempConflInstNo < (int)PipelineInstrs[TmpSyncGroupIdx].size();
+         TempConflInstNo++) {
+      auto CurrSU = PipelineInstrs[TmpSyncGroupIdx][TempConflInstNo];
+      auto I = CurrSU.second.rbegin();
+      auto E = CurrSU.second.rend();
+      int MinCostForSU = -1;
+      for (; I != E; I++) {
+        int CandSGID = *I;
+        SchedGroup *Match;
+        for (auto &SG : SyncPipeline) {
+          if (SG.getSGID() == CandSGID)
+            Match = &SG;
+        }
+
+        if (Match->isFull()) {
+          if (MinCostForSU == -1 || MissPenalty < MinCostForSU)
+            MinCostForSU = MissPenalty;
+          continue;
+        }
+        AddedEdges.clear();
+        int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID,
+                                AddedEdges, MinCostForSU);
+        if (MinCostForSU == -1 || TempCost < MinCostForSU)
+          MinCostForSU = TempCost;
+
+        removeEdges(AddedEdges);
+        if (MinCostForSU == 0)
+          break;
+      }
+      MinimumCost += MinCostForSU;
+    }
+    TempConflInstNo = 0;
+  }
+  return MinimumCost;
+}
+
 bool PipelineSolver::solveExact() {
   if (checkOptimal())
     return true;
@@ -540,12 +614,13 @@
          PipelineInstrs[CurrSyncGroupIdx].size());
   SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
   LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
-                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
+                    << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
 
   // SchedGroup -> Cost pairs
   SmallVector<std::pair<int, int>, 4> ReadyList;
   // Prioritize the candidate sched groups in terms of lowest cost first
   populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
+  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
 
   auto I = ReadyList.begin();
   auto E = ReadyList.end();
@@ -558,7 +633,6 @@
 
     int CandSGID = I->first;
     int AddedCost = 0;
-    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
     SchedGroup *Match;
     for (auto &SG : SyncPipeline) {
@@ -571,19 +645,22 @@
 
     LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
                       << (int)Match->getMask() << "and ID " << CandSGID
-                      << "\n");
+                      << '\n');
     Match->add(*CurrSU.first);
+    AddedEdges.clear();
     AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
-    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
+    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << '\n');
     CurrCost += AddedCost;
     advancePosition();
     ++BranchesExplored;
     bool FinishedExploring = false;
     // If the Cost after adding edges is greater than a known solution,
     // backtrack
-    if (CurrCost < BestCost || BestCost == -1) {
+    int LBCost =
+        (EnableLowerBound && BestCost != -1) ? calculateLowerBound() : 0;
+    if (BestCost == -1 || CurrCost + LBCost < BestCost) {
       if (solveExact()) {
-        FinishedExploring = BestCost != 0;
+        FinishedExploring = BestCost != StaticLowerBound;
         if (!FinishedExploring)
           return true;
       }
@@ -609,7 +686,7 @@
     bool FinishedExploring = false;
     if (CurrCost < BestCost || BestCost == -1) {
       if (solveExact()) {
-        bool FinishedExploring = BestCost != 0;
+        bool FinishedExploring = BestCost != StaticLowerBound;
         if (!FinishedExploring)
           return true;
       }
@@ -622,7 +699,7 @@
 
 bool PipelineSolver::solveGreedy() {
   BestCost = 0;
-  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+  SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
 
   while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
     SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
@@ -632,7 +709,7 @@
     int BestGroupID = -1;
     auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
     LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
-                      << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
+                      << ") in Pipeline # " << CurrSyncGroupIdx << '\n');
 
     // Since we have added the potential SchedGroups from bottom up, but
     // traversed the DAG from top down, parse over the groups from last to
@@ -641,7 +718,7 @@
     auto I = CurrSU.second.rbegin();
     auto E = CurrSU.second.rend();
     for (; I != E; ++I) {
-      std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+      SmallVector<std::pair<SUnit *, SUnit *>, 16> AddedEdges;
       int CandSGID = *I;
       SchedGroup *Match;
       for (auto &SG : SyncPipeline) {
@@ -650,14 +727,15 @@
       }
 
       LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
-                        << (int)Match->getMask() << "\n");
+                        << (int)Match->getMask() << '\n');
 
       if (Match->isFull()) {
         LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
         continue;
       }
-      TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
-      LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
+      TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges,
+                          BestNodeCost);
+      LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << '\n');
       if (TempCost < BestNodeCost || BestNodeCost == -1) {
         BestGroup = Match;
         BestNodeCost = TempCost;
@@ -672,7 +750,7 @@
       BestGroup->add(*CurrSU.first);
       addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
       LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
-                        << (int)BestGroup->getMask() << "\n");
+                        << (int)BestGroup->getMask() << '\n');
       BestCost += TempCost;
     } else
       BestCost += MissPenalty;
@@ -709,11 +787,14 @@
     LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
     solveGreedy();
     reset();
-    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
-    if (BestCost > 0) {
+    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << '\n');
+    StaticLowerBound = calculateLowerBound();
+    LLVM_DEBUG(dbgs() << "Lower Bound on Pipeline Cost is " << StaticLowerBound
+                      << '\n');
+    if (BestCost > StaticLowerBound) {
       LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
       solveExact();
-      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
+      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << '\n');
     }
   } else { // Use the Greedy Algorithm by default
     LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
@@ -897,7 +978,7 @@
 }
 
 int SchedGroup::link(SUnit &SU, bool MakePred,
-                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+                     SmallVectorImpl<std::pair<SUnit *, SUnit *>> &AddedEdges) {
   int MissedEdges = 0;
   for (auto *A : Collection) {
     SUnit *B = &SU;
@@ -906,10 +987,6 @@
     if (MakePred)
       std::swap(A, B);
 
-    if (DAG->IsReachable(B, A))
-      continue;
-    // tryAddEdge returns false if there is a dependency that makes adding
-    // the A->B edge impossible, otherwise it returns true;
     bool Added = tryAddEdge(A, B);
     if (Added)
       AddedEdges.push_back(std::pair(A, B));
diff --git a/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll b/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/igrouplp-pipelinesolver-lowerbound.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACT %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -misched-cluster=0 -amdgpu-igrouplp-exact-solver=1 -amdgpu-igrouplp-exact-solver-max-branches=200000 -amdgpu-igrouplp-exact-solver-cost-heur=1 -amdgpu-igrouplp-exact-solver-lower-bound=1 < %s | FileCheck -check-prefix=LB %s
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) #0 {
+; EXACT-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE:
+; EXACT: ; %bb.0:
+; EXACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; EXACT-NEXT: v_lshlrev_b32_e32 v16, 7, v0
+; EXACT-NEXT: ; kill: killed $sgpr0_sgpr1
+; EXACT-NEXT: s_waitcnt lgkmcnt(0)
+; EXACT-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(1)
+; EXACT-NEXT: v_mul_lo_u32 v13, v13, v13
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
+; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
+; EXACT-NEXT: v_mul_lo_u32 v12, v12, v12
+; EXACT-NEXT: v_mul_lo_u32 v15, v15, v15
+; EXACT-NEXT: v_mul_lo_u32 v14, v14, v14
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
+; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
+; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
+; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
+; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
+; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
+; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
+; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
+; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
+; EXACT-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v3, v3, v3
+; EXACT-NEXT: v_mul_lo_u32 v2, v2, v2
+; EXACT-NEXT: v_mul_lo_u32 v1, v1, v1
+; EXACT-NEXT: v_mul_lo_u32 v0, v0, v0
+; EXACT-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
+; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
+; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
+; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
+; EXACT-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
+; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACT-NEXT: s_waitcnt vmcnt(0)
+; EXACT-NEXT: v_mul_lo_u32 v7, v7, v7
+; EXACT-NEXT: v_mul_lo_u32 v6, v6, v6
+; EXACT-NEXT: v_mul_lo_u32 v5, v5, v5
+; EXACT-NEXT: v_mul_lo_u32 v4, v4, v4
+; EXACT-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64
+; EXACT-NEXT: global_store_dwordx4 v16, 
v[12:15], s[2:3] offset:32 +; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACT-NEXT: s_waitcnt vmcnt(0) +; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 +; EXACT-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACT-NEXT: s_waitcnt vmcnt(0) +; EXACT-NEXT: v_mul_lo_u32 v11, v11, v11 +; EXACT-NEXT: v_mul_lo_u32 v10, v10, v10 +; EXACT-NEXT: v_mul_lo_u32 v9, v9, v9 +; EXACT-NEXT: v_mul_lo_u32 v8, v8, v8 +; EXACT-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; EXACT-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; EXACT-NEXT: s_endpgm +; +; LB-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: +; LB: ; %bb.0: +; LB-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; LB-NEXT: v_lshlrev_b32_e32 v12, 7, v0 +; LB-NEXT: s_waitcnt lgkmcnt(0) +; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:64 +; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v11, v11, v11 +; LB-NEXT: v_mul_lo_u32 v10, v10, v10 +; LB-NEXT: v_mul_lo_u32 v9, v9, v9 +; LB-NEXT: v_mul_lo_u32 v8, v8, v8 +; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:64 +; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v3, v3, v3 +; LB-NEXT: v_mul_lo_u32 v2, v2, v2 +; LB-NEXT: global_load_dwordx4 v[8:11], v12, s[0:1] offset:32 +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v9, v9, v9 +; LB-NEXT: v_mul_lo_u32 v8, v8, v8 +; LB-NEXT: v_mul_lo_u32 v11, v11, v11 +; LB-NEXT: v_mul_lo_u32 v10, v10, v10 +; LB-NEXT: global_store_dwordx4 v12, v[8:11], s[2:3] offset:32 +; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:112 +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: 
; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v7, v7, v7 +; LB-NEXT: v_mul_lo_u32 v6, v6, v6 +; LB-NEXT: v_mul_lo_u32 v1, v1, v1 +; LB-NEXT: v_mul_lo_u32 v0, v0, v0 +; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] +; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:96 +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v3, v3, v3 +; LB-NEXT: v_mul_lo_u32 v2, v2, v2 +; LB-NEXT: v_mul_lo_u32 v1, v1, v1 +; LB-NEXT: v_mul_lo_u32 v0, v0, v0 +; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:96 +; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:80 +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v3, v3, v3 +; LB-NEXT: v_mul_lo_u32 v2, v2, v2 +; LB-NEXT: v_mul_lo_u32 v1, v1, v1 +; LB-NEXT: v_mul_lo_u32 v0, v0, v0 +; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:80 +; LB-NEXT: v_mul_lo_u32 v5, v5, v5 +; LB-NEXT: v_mul_lo_u32 v4, v4, v4 +; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:112 +; LB-NEXT: global_load_dwordx4 v[4:7], v12, s[0:1] offset:48 +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v5, v5, v5 +; LB-NEXT: global_load_dwordx4 v[0:3], v12, s[0:1] offset:16 +; LB-NEXT: v_mul_lo_u32 v4, v4, v4 +; LB-NEXT: s_waitcnt vmcnt(0) +; LB-NEXT: v_mul_lo_u32 v1, v1, v1 +; LB-NEXT: v_mul_lo_u32 v0, v0, v0 +; LB-NEXT: v_mul_lo_u32 v3, v3, v3 +; LB-NEXT: v_mul_lo_u32 v2, v2, v2 +; LB-NEXT: global_store_dwordx4 v12, v[0:3], s[2:3] offset:16 +; LB-NEXT: v_mul_lo_u32 v7, v7, v7 +; LB-NEXT: v_mul_lo_u32 v6, v6, v6 +; LB-NEXT: global_store_dwordx4 v12, v[4:7], s[2:3] offset:48 +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) +; LB-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) +; LB-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep1 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %in, i32 %tid + %load = load <32 x i32>, <32 x i32> addrspace(1)* %gep1 + %mul = mul <32 x i32> %load, %load + %gep2 = getelementptr <32 x i32>, <32 x i32> addrspace(1)* %out, i32 %tid + store <32 x i32> %mul, <32 x i32> addrspace(1)* %gep2 + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 
2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ; 1 VMEM read + call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 1, i32 0) + ; 2 VALU + call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 2, i32 0) + ; 1 VMEM write + call void @llvm.amdgcn.sched.group.barrier(i32 64, i32 1, i32 0) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #0 +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #0 + +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" readnone speculatable}
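
Note (not part of the patch): the following standalone sketch illustrates the branch-and-bound pruning scheme that solveExact() and calculateLowerBound() implement. A branch is only explored when the cost accumulated so far, plus an optimistic lower bound on the cost of placing the remaining items, can still beat the best complete solution found so far, and the static lower bound lets the caller skip the exact search when a greedy result already matches it. All names in the sketch (CostMatrix, Solver, lowerBound, solve) are hypothetical stand-ins and do not appear in AMDGPUIGroupLP.cpp.

// Standalone illustration of lower-bound pruning in a branch-and-bound search.
#include <algorithm>
#include <climits>
#include <cstdio>
#include <vector>

// Cost[I][G]: cost of assigning item I to group G (a stand-in for the edges an
// SU would miss when placed into a particular SchedGroup).
using CostMatrix = std::vector<std::vector<int>>;

struct Solver {
  const CostMatrix &Cost;
  int BestCost = INT_MAX;

  explicit Solver(const CostMatrix &C) : Cost(C) {}

  // Optimistic estimate for items [From, end): give each remaining item its
  // cheapest group and ignore any conflicts. This mirrors the role of
  // calculateLowerBound(), which prices the unassigned SUs under idealized
  // conditions.
  int lowerBound(unsigned From) const {
    int LB = 0;
    for (unsigned I = From, E = Cost.size(); I != E; ++I)
      LB += *std::min_element(Cost[I].begin(), Cost[I].end());
    return LB;
  }

  void solve(unsigned Item, int CurrCost) {
    if (Item == Cost.size()) {
      BestCost = std::min(BestCost, CurrCost);
      return;
    }
    for (int C : Cost[Item]) {
      // Prune: even a perfect completion of the remaining items cannot beat
      // the best known solution. This corresponds to the new
      // "CurrCost + LBCost < BestCost" test in solveExact().
      if (BestCost != INT_MAX && CurrCost + C + lowerBound(Item + 1) >= BestCost)
        continue;
      solve(Item + 1, CurrCost + C);
    }
  }
};

int main() {
  CostMatrix Cost = {{3, 1, 4}, {2, 0, 5}, {7, 2, 1}};
  Solver S(Cost);
  // Analogue of StaticLowerBound: if a greedy solution already matches this,
  // the exact search can be skipped entirely.
  int StaticLB = S.lowerBound(0);
  S.solve(0, 0);
  std::printf("static lower bound = %d, best cost = %d\n", StaticLB, S.BestCost);
  return 0;
}

A tighter bound prunes more branches but is recomputed at every explored node, which is the trade-off the amdgpu-igrouplp-exact-solver-lower-bound option description calls out.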