diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1276,6 +1276,10 @@ /// or /// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); /// to TargetPassConfig::createMachineScheduler() to have an effect. + /// + /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations. + /// \p NumLoads is the number of loads that will be in the cluster if this + /// hook returns true. virtual bool shouldClusterMemOps(ArrayRef BaseOps1, ArrayRef BaseOps2, unsigned NumLoads) const { diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -1584,7 +1584,7 @@ SUnit *SUb = MemOpRecords[Idx+1].SU; if (TII->shouldClusterMemOps(MemOpRecords[Idx].BaseOps, MemOpRecords[Idx + 1].BaseOps, - ClusterLength)) { + ClusterLength + 1)) { if (SUa->NodeNum > SUb->NodeNum) std::swap(SUa, SUb); if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2422,7 +2422,7 @@ return false; // Only cluster up to a single pair. 
- if (NumLoads > 1) + if (NumLoads > 2) return false; if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -457,7 +457,7 @@ if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { - const unsigned MaxGlobalLoadCluster = 6; + const unsigned MaxGlobalLoadCluster = 7; if (NumLoads > MaxGlobalLoadCluster) return false; @@ -497,7 +497,11 @@ ? MRI.getRegClass(Reg) : RI.getPhysRegClass(Reg); - return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; + // FIXME: NumLoads should not have 1 subtracted from it. This is to match + // the behavior of clusterNeighboringMemOps, which was previously passing + // the cluster length minus 1. LoadClusterThreshold should be tuned instead. + return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <= + LoadClusterThreshold; } // FIXME: This behaves strangely. If, for example, you have 32 load + stores,