diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -268,6 +268,11 @@
     return SU->SchedClass;
   }
 
+  /// IsReachable - Checks if SU is reachable from TargetSU.
+  bool IsReachable(SUnit *SU, SUnit *TargetSU) {
+    return Topo.IsReachable(SU, TargetSU);
+  }
+
   /// Returns an iterator to the top of the current scheduling region.
   MachineBasicBlock::iterator begin() const { return RegionBegin; }
 
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1530,7 +1530,10 @@
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
 protected:
-  void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
+  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
+                                ScheduleDAGInstrs *DAG);
+  void collectMemOpRecords(std::vector<SUnit> &SUnits,
+                           SmallVectorImpl<MemOpInfo> &MemOpRecords);
 };
 
 class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1566,63 +1569,53 @@
 } // end namespace llvm
 
+// Sort all the loads/stores first. Then, for each load/store, check the
+// following loads/stores one by one until the first non-dependent one is
+// found, and call the target hook to see whether they can be clustered.
 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
-    ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
-  SmallVector<MemOpInfo, 32> MemOpRecords;
-  for (SUnit *SU : MemOps) {
-    const MachineInstr &MI = *SU->getInstr();
-    SmallVector<const MachineOperand *, 4> BaseOps;
-    int64_t Offset;
-    bool OffsetIsScalable;
-    unsigned Width;
-    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
-                                           OffsetIsScalable, Width, TRI)) {
-      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
-
-      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
-                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
-                        << ", Width: " << Width << "\n");
-    }
-#ifndef NDEBUG
-    for (auto *Op : BaseOps)
-      assert(Op);
-#endif
-  }
-  if (MemOpRecords.size() < 2)
-    return;
-
-  llvm::sort(MemOpRecords);
+    ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
+  // Keep track of the current cluster length and bytes for each SUnit.
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
 
   // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
   // cluster mem ops collected within `MemOpRecords` array.
-  unsigned ClusterLength = 1;
-  unsigned CurrentClusterBytes = MemOpRecords[0].Width;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     // Decision to cluster mem ops is taken based on target dependent logic
     auto MemOpa = MemOpRecords[Idx];
-    auto MemOpb = MemOpRecords[Idx + 1];
-    ++ClusterLength;
-    CurrentClusterBytes += MemOpb.Width;
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
-                                  CurrentClusterBytes)) {
-      // Current mem ops pair could not be clustered, reset cluster length, and
-      // go to next pair
-      ClusterLength = 1;
-      CurrentClusterBytes = MemOpb.Width;
+
+    // Seek the next load/store to cluster with.
+    unsigned NextIdx = Idx + 1;
+    for (; NextIdx < End; ++NextIdx)
+      // Skip if MemOpb has already been clustered or has a dependency with
+      // MemOpa.
+      if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+          !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+          !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+        break;
+    if (NextIdx == End)
       continue;
+
+    auto MemOpb = MemOpRecords[NextIdx];
+    unsigned ClusterLength = 2;
+    unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
+    if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+      ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+      CurrentClusterBytes =
+          SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
     }
 
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes))
+      continue;
+
     SUnit *SUa = MemOpa.SU;
     SUnit *SUb = MemOpb.SU;
     if (SUa->NodeNum > SUb->NodeNum)
       std::swap(SUa, SUb);
 
     // FIXME: Is this check really required?
-    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-      ClusterLength = 1;
-      CurrentClusterBytes = MemOpb.Width;
+    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
       continue;
-    }
 
     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                       << SUb->NodeNum << ")\n");
 
@@ -1656,42 +1649,57 @@
       }
     }
 
+    SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+                                             CurrentClusterBytes};
+
     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
                       << ", Curr cluster bytes: " << CurrentClusterBytes
                       << "\n");
   }
 }
 
-/// Callback from DAG postProcessing to create cluster edges for loads.
-void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
-  // Map DAG NodeNum to a set of dependent MemOps in store chain.
-  DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
-  for (SUnit &SU : DAG->SUnits) {
+void BaseMemOpClusterMutation::collectMemOpRecords(
+    std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
+  for (auto &SU : SUnits) {
     if ((IsLoad && !SU.getInstr()->mayLoad()) ||
         (!IsLoad && !SU.getInstr()->mayStore()))
       continue;
 
-    unsigned ChainPredID = DAG->SUnits.size();
-    for (const SDep &Pred : SU.Preds) {
-      // We only want to cluster the mem ops that have the same ctrl(non-data)
-      // pred so that they didn't have ctrl dependency for each other. But for
-      // store instrs, we can still cluster them if the pred is load instr.
-      if ((Pred.isCtrl() &&
-           (IsLoad ||
-            (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
-          !Pred.isArtificial()) {
-        ChainPredID = Pred.getSUnit()->NodeNum;
-        break;
-      }
+    const MachineInstr &MI = *SU.getInstr();
+    SmallVector<const MachineOperand *, 4> BaseOps;
+    int64_t Offset;
+    bool OffsetIsScalable;
+    unsigned Width;
+    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+                                           OffsetIsScalable, Width, TRI)) {
+      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+                        << ", Width: " << Width << "\n");
     }
-    // Insert the SU to corresponding store chain.
-    auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
-    Chain.push_back(&SU);
+#ifndef NDEBUG
+    for (auto *Op : BaseOps)
+      assert(Op);
+#endif
   }
+}
+
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+  // Collect all the clusterable loads/stores.
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  collectMemOpRecords(DAG->SUnits, MemOpRecords);
+
+  if (MemOpRecords.size() < 2)
+    return;
+
+  // Sort the loads/stores so that clustering can stop as early as
+  // possible.
+  llvm::sort(MemOpRecords);
 
-  // Iterate over the store chains.
-  for (auto &SCD : StoreChains)
-    clusterNeighboringMemOps(SCD.second, DAG);
+  // Try to cluster all the neighboring loads/stores.
+  clusterNeighboringMemOps(MemOpRecords, DAG);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -214,11 +214,11 @@
   ret void
 }
 
-; FIXME - The SU(4) and SU(7) can be clustered even with
+; Verify that SU(4) and SU(7) can be clustered even with
 ; different preds
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: cluster_with_different_preds:%bb.0
-; CHECK-NOT:Cluster ld/st SU(4) - SU(7)
+; CHECK:Cluster ld/st SU(4) - SU(7)
 ; CHECK:SU(3): STRWui %2:gpr32, %0:gpr64common, 0 ::
 ; CHECK:SU(4): %3:gpr32 = LDRWui %1:gpr64common, 0 ::
 ; CHECK:Predecessors:
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -624,9 +624,9 @@
 
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
 ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 
@@ -669,8 +669,8 @@
 ; FIXED-ABI-NOT: v31
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
-; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
 ; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
+; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 
 ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -145,14 +145,14 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_short_d16 v2, v0, s[6:7] offset:4
 ; GFX9-NEXT:    global_load_short_d16 v1, v0, s[0:1] offset:4
-; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_load_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_load_short_d16 v2, v0, s[6:7] offset:4
+; GFX9-NEXT:    global_load_dword v4, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_pk_max_i16 v1, v2, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_pk_max_i16 v3, v3, v4
+; GFX9-NEXT:    v_pk_max_i16 v3, v4, v3
 ; GFX9-NEXT:    global_store_short v0, v1, s[4:5] offset:4
 ; GFX9-NEXT:    global_store_dword v0, v3, s[4:5]
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -160,16 +160,14 @@
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
-; GCN: s_mov_b32 s34, s32
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
+; GCN: s_mov_b32 s34, s32
+; GCN: v_mov_b32_e32 v32, 0
+; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000
 
-; GCN: v_mov_b32_e32 v33, 0
-
-; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024
-
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
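
For reference, the control flow added in clusterNeighboringMemOps above can be
read as the following standalone sketch. This is an illustration only, not the
patch itself: MemRecord, IsReachable and ShouldCluster are stand-ins for
MemOpInfo, ScheduleDAGInstrs::IsReachable and
TargetInstrInfo::shouldClusterMemOps, the real code sorts MemOpInfo by base
operands and offset rather than by offset alone, and the edges it adds are
SDep::Cluster edges on the scheduling DAG.

#include <algorithm>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <utility>
#include <vector>

struct MemRecord {
  unsigned NodeNum; // Stand-in for SUnit::NodeNum.
  int64_t Offset;   // Offset of the access relative to its base.
  unsigned Width;   // Access width in bytes.
};

// Returns the (predecessor, successor) node pairs that received a cluster
// edge.
std::vector<std::pair<unsigned, unsigned>> clusterMemOps(
    std::vector<MemRecord> Records,
    const std::function<bool(unsigned From, unsigned To)> &IsReachable,
    const std::function<bool(unsigned Length, unsigned Bytes)> &ShouldCluster) {
  // Sort the records so that clustering can stop as early as possible.
  std::sort(Records.begin(), Records.end(),
            [](const MemRecord &A, const MemRecord &B) {
              return A.Offset < B.Offset;
            });

  // NodeNum -> (cluster length, cluster bytes), mirroring SUnit2ClusterInfo.
  std::unordered_map<unsigned, std::pair<unsigned, unsigned>> ClusterInfo;
  std::vector<std::pair<unsigned, unsigned>> Edges;

  for (unsigned Idx = 0, End = Records.size(); Idx + 1 < End; ++Idx) {
    const MemRecord &A = Records[Idx];

    // Seek the first later record that is not clustered yet and has no
    // dependency with A in either direction.
    unsigned NextIdx = Idx + 1;
    for (; NextIdx < End; ++NextIdx)
      if (!ClusterInfo.count(Records[NextIdx].NodeNum) &&
          !IsReachable(Records[NextIdx].NodeNum, A.NodeNum) &&
          !IsReachable(A.NodeNum, Records[NextIdx].NodeNum))
        break;
    if (NextIdx == End)
      continue;

    const MemRecord &B = Records[NextIdx];
    unsigned Length = 2;
    unsigned Bytes = A.Width + B.Width;
    auto It = ClusterInfo.find(A.NodeNum);
    if (It != ClusterInfo.end()) {
      // A already belongs to a cluster: grow that cluster instead of starting
      // a new one.
      Length = It->second.first + 1;
      Bytes = It->second.second + B.Width;
    }

    // The target hook decides whether the grown cluster is still profitable.
    if (!ShouldCluster(Length, Bytes))
      continue;

    Edges.emplace_back(A.NodeNum, B.NodeNum);
    ClusterInfo[B.NodeNum] = {Length, Bytes};
  }
  return Edges;
}

Tracking per-node cluster length and bytes is what lets a later pair extend an
existing cluster instead of restarting it, which is why apply() no longer needs
the old per-store-chain grouping.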