diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -129,6 +129,15 @@
 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
                                         cl::desc("Enable memop clustering."),
                                         cl::init(true));
+static cl::opt<bool>
+    ForceFastCluster("force-fast-cluster", cl::Hidden,
+                     cl::desc("Switch to fast cluster algorithm with the loss "
+                              "of some fusion opportunities"),
+                     cl::init(false));
+static cl::opt<unsigned>
+    FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
+                         cl::desc("The threshold for fast cluster"),
+                         cl::init(1000));
 
 // DAG subtrees must have at least this many nodes.
 static const unsigned MinSubtreeSize = 8;
@@ -1530,10 +1539,12 @@
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
 protected:
-  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
+  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
                                 ScheduleDAGInstrs *DAG);
   void collectMemOpRecords(std::vector<SUnit> &SUnits,
                            SmallVectorImpl<MemOpInfo> &MemOpRecords);
+  bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+                   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
 };
 
 class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1572,8 +1583,11 @@
 // Sorting all the loads/stores first, then for each load/store, checking the
 // following load/store one by one, until reach the first non-dependent one and
 // call target hook to see if they can cluster.
+// If FastCluster is enabled, we assume that all the loads/stores have been
+// preprocessed and no longer have dependencies on each other.
 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
-    ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
+    ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
+    ScheduleDAGInstrs *DAG) {
   // Keep track of the current cluster length and bytes for each SUnit.
   DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
 
@@ -1589,8 +1603,9 @@
       // Skip if MemOpb has been clustered already or has dependency with
       // MemOpa.
       if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
-          !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
-          !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+          (FastCluster ||
+           (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+            !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
         break;
     if (NextIdx == End)
       continue;
@@ -1685,6 +1700,36 @@
     }
   }
 }
 
+bool BaseMemOpClusterMutation::groupMemOps(
+    ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
+  bool FastCluster =
+      ForceFastCluster ||
+      MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
+
+  for (const auto &MemOp : MemOps) {
+    unsigned ChainPredID = DAG->SUnits.size();
+    if (FastCluster) {
+      for (const SDep &Pred : MemOp.SU->Preds) {
+        // We only want to cluster mem ops that have the same ctrl (non-data)
+        // pred, so that they have no ctrl dependency on each other. But for
+        // store instrs, we can still cluster them if the pred is a load.
+        if ((Pred.isCtrl() &&
+             (IsLoad ||
+              (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
+            !Pred.isArtificial()) {
+          ChainPredID = Pred.getSUnit()->NodeNum;
+          break;
+        }
+      }
+    } else
+      ChainPredID = 0;
+
+    Groups[ChainPredID].push_back(MemOp);
+  }
+  return FastCluster;
+}
+
 /// Callback from DAG postProcessing to create cluster edges for loads/stores.
 void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
   // Collect all the clusterable loads/stores
@@ -1694,12 +1739,20 @@
   if (MemOpRecords.size() < 2)
     return;
 
-  // Sorting the loads/stores, so that, we can stop the cluster as early as
-  // possible.
-  llvm::sort(MemOpRecords);
+  // Group the loads/stores that have no dependency on each other, using a
+  // heuristic, if the DAG is too complex, to avoid compile-time blowup.
+  // Note that some fusion pairs could be lost this way.
+  DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
+  bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
 
-  // Trying to cluster all the neighboring loads/stores.
-  clusterNeighboringMemOps(MemOpRecords, DAG);
+  for (auto &Group : Groups) {
+    // Sort the loads/stores so that we can stop the clustering as early as
+    // possible.
+    llvm::sort(Group.second);
+
+    // Try to cluster all the neighboring loads/stores.
+    clusterNeighboringMemOps(Group.second, FastCluster, DAG);
+  }
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -force-fast-cluster -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK-FAST
 
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: stp_i64_scale:%bb.0
@@ -227,6 +228,10 @@
 ; CHECK:SU(7):   %5:gpr32 = LDRWui %1:gpr64common, 1 ::
 ; CHECK:Predecessors:
 ; CHECK:SU(6): Ord  Latency=1 Memory
+; CHECK-FAST: cluster_with_different_preds:%bb.0
+; CHECK-FAST-NOT: Cluster ld/st
+; CHECK-FAST:SU(3):   STRWui %2:gpr32, %0:gpr64common, 0 ::
+; CHECK-FAST:SU(4):   %3:gpr32 = LDRWui %1:gpr64common, 0 ::
 define i32 @cluster_with_different_preds(i32* %p, i32* %q) {
 entry:
   store i32 3, i32* %p, align 4
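
Note on the heuristic: groupMemOps() takes the fast path when (number of mem ops) * (number of SUnits) / 1000 exceeds fast-cluster-threshold, and then buckets the mem ops by the NodeNum of their first non-artificial ctrl predecessor so that each bucket can be clustered without the quadratic pairwise IsReachable() queries. The following is a minimal standalone C++ sketch of that decision and bucketing, not LLVM code: the MemOp struct, the op/SUnit counts, and the node numbers are hypothetical stand-ins for MemOpInfo and the SUnit data used in the patch above.

    // Standalone sketch (hypothetical data, not the LLVM implementation).
    #include <cstdio>
    #include <map>
    #include <vector>

    struct MemOp {
      unsigned NodeNum;     // stand-in for SUnit::NodeNum
      unsigned ChainPredID; // NodeNum of the first ctrl (non-data) predecessor
    };

    int main() {
      // Threshold check mirroring groupMemOps(): with 200 clusterable mem ops
      // in a 6000-SUnit DAG, 200 * 6000 / 1000 = 1200 > 1000 (the default
      // fast-cluster-threshold), so the fast path is taken.
      unsigned NumMemOps = 200, NumSUnits = 6000, Threshold = 1000;
      bool FastCluster = NumMemOps * NumSUnits / 1000 > Threshold;
      std::printf("fast cluster: %s\n", FastCluster ? "yes" : "no");

      // Bucketing: mem ops hanging off the same ctrl predecessor share a
      // group; ops in different groups are never compared, so no pairwise
      // reachability queries are needed across groups.
      std::vector<MemOp> Ops = {{3, 1}, {4, 1}, {7, 5}, {8, 5}};
      std::map<unsigned, std::vector<MemOp>> Groups;
      for (const MemOp &Op : Ops)
        Groups[Op.ChainPredID].push_back(Op);
      for (const auto &G : Groups)
        std::printf("group %u: %zu mem ops\n", G.first, G.second.size());
      return 0;
    }

The trade-off is what the CHECK-FAST-NOT line in the test above verifies: mem ops that land in different buckets (here, the store and load with different preds) are never considered for clustering, so some ld/st fusion opportunities are lost on the fast path.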