diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -467,57 +467,11 @@
   if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
     return false;
 
-  const MachineOperand *FirstDst = nullptr;
-  const MachineOperand *SecondDst = nullptr;
-
-  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
-      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
-      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
-    const unsigned MaxGlobalLoadCluster = 7;
-    if (NumLoads > MaxGlobalLoadCluster)
-      return false;
-
-    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
-    if (!FirstDst)
-      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
-    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
-    if (!SecondDst)
-      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
-  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
-    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
-    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
-  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
-    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
-    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
-  }
-
-  if (!FirstDst || !SecondDst)
-    return false;
-
-  // Try to limit clustering based on the total number of bytes loaded
-  // rather than the number of instructions. This is done to help reduce
-  // register pressure. The method used is somewhat inexact, though,
-  // because it assumes that all loads in the cluster will load the
-  // same number of bytes as FirstLdSt.
-
-  // The unit of this value is bytes.
-  // FIXME: This needs finer tuning.
-  unsigned LoadClusterThreshold = 16;
-
-  const MachineRegisterInfo &MRI =
-      FirstLdSt.getParent()->getParent()->getRegInfo();
-
-  const Register Reg = FirstDst->getReg();
-
-  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
-                                         ? MRI.getRegClass(Reg)
-                                         : RI.getPhysRegClass(Reg);
-
-  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
-  // of clusterNeighboringMemOps which was previosly passing cluster length
-  // less 1. LoadClusterThreshold should be tuned instead.
-  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
-         LoadClusterThreshold;
+  assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
+         "Invalid NumLoads/NumBytes values");
+  unsigned AverageBytesPerLoad = NumBytes / NumLoads;
+  unsigned MaxNumLoads = AverageBytesPerLoad <= 4 ? 5 : 4;
+  return NumLoads <= MaxNumLoads;
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
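
For reference, a minimal standalone sketch of the new heuristic introduced by this hunk; only the assert and threshold logic come from the patch, the free function shouldClusterLoads and the example figures are illustrative, not part of the change.

#include <cassert>

// Illustrative restatement: cluster as long as the number of loads stays
// within a cap derived from the average load width.
static bool shouldClusterLoads(unsigned NumLoads, unsigned NumBytes) {
  assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
         "Invalid NumLoads/NumBytes values");
  unsigned AverageBytesPerLoad = NumBytes / NumLoads;      // integer division
  unsigned MaxNumLoads = AverageBytesPerLoad <= 4 ? 5 : 4; // narrow loads may cluster one more
  return NumLoads <= MaxNumLoads;
}

// Example (hypothetical numbers):
//   4 loads, 16 bytes total  -> avg 4 bytes/load  -> cap 5 -> cluster
//   4 loads, 64 bytes total  -> avg 16 bytes/load -> cap 4 -> cluster
//   5 loads, 80 bytes total  -> avg 16 bytes/load -> cap 4 -> do not cluster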