diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1580,34 +1580,47 @@
     return;
 
   llvm::sort(MemOpRecords);
-  unsigned ClusterLength = 1;
+
+  unsigned ClusterLength = 2;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
-    SUnit *SUa = MemOpRecords[Idx].SU;
-    SUnit *SUb = MemOpRecords[Idx+1].SU;
-    if (TII->shouldClusterMemOps(MemOpRecords[Idx].BaseOps,
-                                 MemOpRecords[Idx + 1].BaseOps,
-                                 ClusterLength + 1)) {
-      if (SUa->NodeNum > SUb->NodeNum)
-        std::swap(SUa, SUb);
-      if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-        LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
-                          << SUb->NodeNum << ")\n");
-        // Copy successor edges from SUa to SUb. Interleaving computation
-        // dependent on SUa can prevent load combining due to register reuse.
-        // Predecessor edges do not need to be copied from SUb to SUa since
-        // nearby loads should have effectively the same inputs.
-        for (const SDep &Succ : SUa->Succs) {
-          if (Succ.getSUnit() == SUb)
-            continue;
-          LLVM_DEBUG(dbgs()
-                     << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
-          DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
-        }
-        ++ClusterLength;
-      } else
-        ClusterLength = 1;
-    } else
-      ClusterLength = 1;
+    // The decision to cluster mem ops is made by target-dependent logic.
+    auto MemOpa = MemOpRecords[Idx];
+    auto MemOpb = MemOpRecords[Idx + 1];
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps,
+                                  ClusterLength)) {
+      // The current pair of mem ops could not be clustered; reset the
+      // cluster length and go to the next pair.
+      ClusterLength = 2;
+      continue;
+    }
+
+    SUnit *SUa = MemOpa.SU;
+    SUnit *SUb = MemOpb.SU;
+    if (SUa->NodeNum > SUb->NodeNum)
+      std::swap(SUa, SUb);
+
+    // FIXME: Is this check really required?
+    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+      ClusterLength = 2;
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
+                      << SUb->NodeNum << ")\n");
+
+    // Copy successor edges from SUa to SUb. Interleaving computation
+    // dependent on SUa can prevent load combining due to register reuse.
+    // Predecessor edges do not need to be copied from SUb to SUa since
+    // nearby loads should have effectively the same inputs.
+    for (const SDep &Succ : SUa->Succs) {
+      if (Succ.getSUnit() == SUb)
+        continue;
+      LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                        << ")\n");
+      DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+    }
+
+    ++ClusterLength;
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -61,6 +61,7 @@
   };
 
   using SetVectorType = SmallSetVector<MachineInstr *, 32>;
+  mutable unsigned NumClusteredBytes = 0;
 
   static unsigned getBranchOpcode(BranchPredicate Cond);
   static BranchPredicate getBranchPredicate(unsigned Opcode);
@@ -134,6 +135,11 @@
 
   Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
 
+  void computeNumClusteredBytes(const MachineInstr &FirstLdSt,
+                                const MachineOperand *FirstDst,
+                                const MachineOperand *SecondDst,
+                                const unsigned NumLoads) const;
+
 protected:
   bool swapSourceModifiers(MachineInstr &MI,
                           MachineOperand &Src0, unsigned Src0OpName,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -428,6 +428,33 @@
   return Base1 == Base2;
 }
 
+void SIInstrInfo::computeNumClusteredBytes(const MachineInstr &FirstLdSt,
+                                           const MachineOperand *FirstDst,
+                                           const MachineOperand *SecondDst,
+                                           const unsigned NumLoads) const {
+  const MachineRegisterInfo &MRI =
+      FirstLdSt.getParent()->getParent()->getRegInfo();
+
+  const Register FirstDstReg = FirstDst->getReg();
+  const Register SecondDstReg = SecondDst->getReg();
+
+  const TargetRegisterClass *FirstDstRC =
+      Register::isVirtualRegister(FirstDstReg)
+          ? MRI.getRegClass(FirstDstReg)
+          : RI.getPhysRegClass(FirstDstReg);
+  const TargetRegisterClass *SecondDstRC =
+      Register::isVirtualRegister(SecondDstReg)
+          ? MRI.getRegClass(SecondDstReg)
+          : RI.getPhysRegClass(SecondDstReg);
+
+  const unsigned FirstNumDestBytes = RI.getRegSizeInBits(*FirstDstRC) / 8;
+  const unsigned SecondNumDestBytes = RI.getRegSizeInBits(*SecondDstRC) / 8;
+
+  NumClusteredBytes = (NumLoads == 2)
+                          ? (FirstNumDestBytes + SecondNumDestBytes)
+                          : (NumClusteredBytes + SecondNumDestBytes);
+}
+
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
                                       unsigned NumLoads) const {
@@ -465,30 +492,11 @@
   if (!FirstDst || !SecondDst)
     return false;
 
-  // Try to limit clustering based on the total number of bytes loaded
-  // rather than the number of instructions. This is done to help reduce
-  // register pressure. The method used is somewhat inexact, though,
-  // because it assumes that all loads in the cluster will load the
-  // same number of bytes as FirstLdSt.
-
-  // The unit of this value is bytes.
-  // FIXME: This needs finer tuning.
-  unsigned LoadClusterThreshold = 16;
-
-  const MachineRegisterInfo &MRI =
-      FirstLdSt.getParent()->getParent()->getRegInfo();
-
-  const Register Reg = FirstDst->getReg();
-
-  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
-                                         ? MRI.getRegClass(Reg)
-                                         : RI.getPhysRegClass(Reg);
-
-  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
-  // of clusterNeighboringMemOps which was previosly passing cluster length
-  // less 1. LoadClusterThreshold should be tuned instead.
-  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
-      LoadClusterThreshold;
+  // Try to limit clustering based on the total number of clustered bytes
+  // loaded rather than the number of clustered instructions.
+  unsigned MaxNumClusteredBytes = 16;
+  computeNumClusteredBytes(FirstLdSt, FirstDst, SecondDst, NumLoads);
+  return (NumClusteredBytes <= MaxNumClusteredBytes);
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
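
For reference, below is a minimal standalone sketch (not part of the patch; shouldCluster, DstBytes, and the driver loop are illustrative stand-ins) of how the new NumClusteredBytes accumulator is intended to behave: the first pair of a cluster charges both destination sizes, every later pair charges only the newly appended destination, and clustering is refused once the running total exceeds the 16-byte budget (MaxNumClusteredBytes).

#include <cstdio>
#include <vector>

// Mirrors the accumulation in computeNumClusteredBytes/shouldClusterMemOps
// above, with register classes replaced by plain byte sizes. All names here
// are hypothetical.
static bool shouldCluster(unsigned &NumClusteredBytes, unsigned FirstBytes,
                          unsigned SecondBytes, unsigned NumLoads,
                          unsigned MaxNumClusteredBytes = 16) {
  // First pair of a cluster: count both destinations. Later pairs: only the
  // newly added destination.
  NumClusteredBytes = (NumLoads == 2) ? (FirstBytes + SecondBytes)
                                      : (NumClusteredBytes + SecondBytes);
  return NumClusteredBytes <= MaxNumClusteredBytes;
}

int main() {
  // Destination sizes of four adjacent 4-byte loads: 4+4, +4, +4 = 16 bytes,
  // so every pair stays within the 16-byte budget.
  std::vector<unsigned> DstBytes = {4, 4, 4, 4};
  unsigned NumClusteredBytes = 0;
  unsigned ClusterLength = 2; // cluster length for the next candidate pair
  for (size_t I = 0; I + 1 < DstBytes.size(); ++I) {
    bool Ok = shouldCluster(NumClusteredBytes, DstBytes[I], DstBytes[I + 1],
                            ClusterLength);
    std::printf("pair %zu: %u bytes -> %s\n", I, NumClusteredBytes,
                Ok ? "cluster" : "reset");
    ClusterLength = Ok ? ClusterLength + 1 : 2;
  }
  return 0;
}

Unlike the removed heuristic, which estimated (NumLoads - 1) * bytes(FirstDst) and so assumed every load in the cluster matched FirstLdSt's size, the accumulator charges each load its own destination size.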