diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1259,6 +1259,13 @@
     return false;
   }
 
+  /// Get the size (in bytes) of the destination register of a memory
+  /// instruction. The default implementation is not guaranteed to work for
+  /// all targets; a target may need to override it with its own logic.
+  virtual unsigned getDstMemOperandSize(const MachineInstr &MI) const {
+    return (!MI.memoperands_empty() ? MI.memoperands().front()->getSize() : 0);
+  }
+
   /// Return true if the instruction contains a base register and offset. If
   /// true, the function also sets the operand position in the instruction
   /// for the base register and offset.
@@ -1283,9 +1290,11 @@
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p NumLoads is the number of loads that will be in the cluster if this
   /// hook returns true.
+  /// \p NumBytes is the total number of bytes that will be loaded by the
+  /// clustered loads if this hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
-                                   unsigned NumLoads) const {
+                                   unsigned NumLoads, unsigned NumBytes) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1473,10 +1473,12 @@
     SUnit *SU;
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
+    unsigned Width;
 
     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
-              int64_t Offset)
-        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset) {}
+              int64_t Offset, unsigned Width)
+        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
+          Width(Width) {}
 
     static bool Compare(const MachineOperand *const &A,
                         const MachineOperand *const &B) {
@@ -1565,12 +1567,14 @@
     ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
   SmallVector<MemOpInfo, 32> MemOpRecords;
   for (SUnit *SU : MemOps) {
+    const MachineInstr &MI = *SU->getInstr();
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     bool OffsetIsScalable;
-    if (TII->getMemOperandsWithOffset(*SU->getInstr(), BaseOps, Offset,
-                                      OffsetIsScalable, TRI))
-      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset));
+    if (TII->getMemOperandsWithOffset(MI, BaseOps, Offset, OffsetIsScalable,
+                                      TRI))
+      MemOpRecords.push_back(
+          MemOpInfo(SU, BaseOps, Offset, TII->getDstMemOperandSize(MI)));
 #ifndef NDEBUG
     for (auto *Op : BaseOps)
       assert(Op);
@@ -1584,16 +1588,19 @@
   // At this point, `MemOpRecords` array must hold at least two mem ops. Try to
   // cluster mem ops collected within `MemOpRecords` array.
   unsigned ClusterLength = 1;
+  unsigned CurrentClusterBytes = MemOpRecords[0].Width;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     // Decision to cluster mem ops is taken based on target-dependent logic.
     auto MemOpa = MemOpRecords[Idx];
     auto MemOpb = MemOpRecords[Idx + 1];
 
     ++ClusterLength;
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps,
-                                  ClusterLength)) {
+    CurrentClusterBytes += MemOpb.Width;
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes)) {
       // Current mem ops pair could not be clustered; reset the cluster length
       // and go to the next pair.
       ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
       continue;
     }
@@ -1605,6 +1612,7 @@
     // FIXME: Is this check really required?
     if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
       ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
       continue;
     }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -140,7 +140,7 @@
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;
 
   void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, MCRegister DestReg,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2492,7 +2492,8 @@
 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(
     ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads) const {
+    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+    unsigned NumBytes) const {
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
   const MachineOperand &BaseOp1 = *BaseOps1.front();
   const MachineOperand &BaseOp2 = *BaseOps2.front();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -127,6 +127,11 @@
     return true;
   }
 
+  unsigned getDstMemOperandSize(const MachineInstr *MI,
+                                const SIInstrInfo *SII) const {
+    return (MI ? SII->getDstMemOperandSize(*MI) : 0);
+  }
+
   bool runOnMachineFunction(MachineFunction &MF) override {
     if (skipFunction(MF.getFunction()))
       return false;
@@ -156,6 +161,9 @@
         }
       }
 
+      unsigned WidthA = getDstMemOperandSize(CI.Last, SII);
+      unsigned WidthB = getDstMemOperandSize(&MI, SII);
+
       if (CI.Length == 64 ||
           (CI.Length && Type != HARDCLAUSE_INTERNAL &&
            (Type != CI.Type ||
@@ -164,7 +172,8 @@
             // scheduler it limits the size of the cluster to avoid increasing
             // register pressure too much, but this pass runs after register
             // allocation so there is no need for that kind of limit.
-            !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2)))) {
+            !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2,
+                                      WidthA + WidthB)))) {
        // Finish the current clause.
        Changed |= emitClause(CI, SII);
        CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -187,9 +187,14 @@
                                 int64_t &Offset, bool &OffsetIsScalable,
                                 const TargetRegisterInfo *TRI) const final;
 
+  unsigned getDstRegSizeInBytes(const MachineInstr &LdSt,
+                                const MachineOperand *Dst) const;
+
+  unsigned getDstMemOperandSize(const MachineInstr &LdSt) const final;
+
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;
 
   bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
                                int64_t Offset1, unsigned NumLoads) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -387,6 +387,32 @@
   return false;
 }
 
+unsigned SIInstrInfo::getDstRegSizeInBytes(const MachineInstr &LdSt,
+                                           const MachineOperand *Dst) const {
+  const MachineRegisterInfo &MRI = LdSt.getParent()->getParent()->getRegInfo();
+  const Register Reg = Dst->getReg();
+  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
+                                         ? MRI.getRegClass(Reg)
+                                         : RI.getPhysRegClass(Reg);
+  return (RI.getRegSizeInBits(*DstRC) / 8);
+}
+
+unsigned SIInstrInfo::getDstMemOperandSize(const MachineInstr &LdSt) const {
+  const MachineOperand *Dst = nullptr;
+
+  if (isMUBUF(LdSt) || isMTBUF(LdSt) || isFLAT(LdSt)) {
+    Dst = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+    if (!Dst)
+      Dst = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+  } else if (isSMRD(LdSt)) {
+    Dst = getNamedOperand(LdSt, AMDGPU::OpName::sdst);
+  } else if (isDS(LdSt)) {
+    Dst = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+  }
+
+  return (Dst ? getDstRegSizeInBytes(LdSt, Dst) : 0);
+}
+
 static bool
 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2) {
@@ -430,7 +456,8 @@
 
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
-                                      unsigned NumLoads) const {
+                                      unsigned NumLoads,
+                                      unsigned NumBytes) const {
   assert(!BaseOps1.empty() && !BaseOps2.empty());
   const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
   const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
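
For reference, a minimal sketch of how a downstream target might implement the widened hook, using the new `NumBytes` argument to bound the total bytes loaded by a cluster. `MyTargetInstrInfo`, the 4-load limit, and the 32-byte budget are hypothetical illustrations, not part of this patch; only the `shouldClusterMemOps` signature comes from the change above.

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Hypothetical target: cluster only mem ops that share a base register, and
// bound the cluster both by instruction count and by total bytes loaded.
class MyTargetInstrInfo : public TargetInstrInfo {
public:
  bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                           ArrayRef<const MachineOperand *> BaseOps2,
                           unsigned NumLoads,
                           unsigned NumBytes) const override {
    // NumLoads and NumBytes describe the prospective cluster *including* the
    // op carrying BaseOps2, matching how the scheduler mutation accumulates
    // ClusterLength and CurrentClusterBytes above.
    return BaseOps1.front()->getReg() == BaseOps2.front()->getReg() &&
           NumLoads <= 4 && NumBytes <= 32;
  }
};
```

Note that the two call sites differ: the scheduler mutation passes the running `CurrentClusterBytes` for the whole candidate cluster, while `SIInsertHardClauses` passes `WidthA + WidthB` for just the pair being tested.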