diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1251,11 +1251,10 @@
   /// It returns false if no base operands and offset was found.
   /// It is not guaranteed to always recognize base operands and offsets in all
   /// cases.
-  virtual bool
-  getMemOperandsWithOffset(const MachineInstr &MI,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const {
+  virtual bool getMemOperandsWithOffset(
+      const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const {
     return false;
   }
 
@@ -1283,9 +1282,11 @@
 
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p NumLoads is the number of loads that will be in the cluster if this
   /// hook returns true.
+  /// \p NumBytes is the number of bytes that will be loaded from all the
+  /// clustered loads if this hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
-                                   unsigned NumLoads) const {
+                                   unsigned NumLoads, unsigned NumBytes) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1473,10 +1473,12 @@
     SUnit *SU;
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
+    unsigned Width;
 
     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
-              int64_t Offset)
-        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset) {}
+              int64_t Offset, unsigned Width)
+        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
+          Width(Width) {}
 
     static bool Compare(const MachineOperand *const &A,
                         const MachineOperand *const &B) {
@@ -1565,12 +1567,14 @@
     ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
   SmallVector<MemOpInfo, 32> MemOpRecords;
   for (SUnit *SU : MemOps) {
+    const MachineInstr &MI = *SU->getInstr();
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     bool OffsetIsScalable;
-    if (TII->getMemOperandsWithOffset(*SU->getInstr(), BaseOps, Offset,
-                                      OffsetIsScalable, TRI))
-      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset));
+    unsigned Width;
+    if (TII->getMemOperandsWithOffset(MI, BaseOps, Offset, OffsetIsScalable,
+                                      Width, TRI))
+      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
 #ifndef NDEBUG
     for (auto *Op : BaseOps)
       assert(Op);
@@ -1584,16 +1588,19 @@
   // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
   // cluster mem ops collected within `MemOpRecords` array.
   unsigned ClusterLength = 1;
+  unsigned CurrentClusterBytes = MemOpRecords[0].Width;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     // Decision to cluster mem ops is taken based on target dependent logic
     auto MemOpa = MemOpRecords[Idx];
     auto MemOpb = MemOpRecords[Idx + 1];
     ++ClusterLength;
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps,
-                                  ClusterLength)) {
+    CurrentClusterBytes += MemOpb.Width;
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes)) {
       // Current mem ops pair could not be clustered, reset cluster length, and
       // go to next pair
       ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
       continue;
     }
 
@@ -1605,6 +1612,7 @@
     // FIXME: Is this check really required?
     if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
       ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
       continue;
     }
 
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1037,7 +1037,9 @@
     const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset,
     bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
   SmallVector<const MachineOperand *, 4> BaseOps;
-  if (!getMemOperandsWithOffset(MI, BaseOps, Offset, OffsetIsScalable, TRI) ||
+  unsigned Width;
+  if (!getMemOperandsWithOffset(MI, BaseOps, Offset, OffsetIsScalable, Width,
+                                TRI) ||
       BaseOps.size() != 1)
     return false;
   BaseOp = BaseOps.front();
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -115,8 +115,8 @@
 
   bool getMemOperandsWithOffset(
       const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
-      int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-      const override;
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
 
   /// If \p OffsetIsScalable is set to 'true', the offset is scaled by `vscale`.
   /// This is true for some SVE instructions like ldr/str that have a
@@ -140,7 +140,7 @@
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;
 
   void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, MCRegister DestReg,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2035,13 +2035,12 @@
 
 bool AArch64InstrInfo::getMemOperandsWithOffset(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-    const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   if (!LdSt.mayLoadOrStore())
     return false;
 
   const MachineOperand *BaseOp;
-  unsigned Width;
   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
                                     Width, TRI))
     return false;
@@ -2492,7 +2491,8 @@
 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(
     ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads) const {
+    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+    unsigned NumBytes) const {
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
   const MachineOperand &BaseOp1 = *BaseOps1.front();
   const MachineOperand &BaseOp2 = *BaseOps2.front();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -146,10 +146,11 @@
 
         int64_t Dummy1;
         bool Dummy2;
+        unsigned Dummy3;
         SmallVector<const MachineOperand *, 4> BaseOps;
         if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
           if (!SII->getMemOperandsWithOffset(MI, BaseOps, Dummy1, Dummy2,
-                                             TRI)) {
+                                             Dummy3, TRI)) {
             // We failed to get the base operands, so we'll never clause this
             // instruction with any other, so pretend it's illegal.
             Type = HARDCLAUSE_ILLEGAL;
@@ -164,7 +165,7 @@
               // scheduler it limits the size of the cluster to avoid increasing
               // register pressure too much, but this pass runs after register
               // allocation so there is no need for that kind of limit.
-              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2)))) {
+              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
           // Finish the current clause.
           Changed |= emitClause(CI, SII);
           CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -181,15 +181,21 @@
                                int64_t &Offset1,
                                int64_t &Offset2) const override;
 
+  unsigned getDstOpSizeInBytes(const MachineInstr &LdSt,
+                               const MachineOperand *Dst) const;
+
+  const MachineOperand *getDstMemOperand(const MachineInstr &LdSt) const;
+
   bool getMemOperandsWithOffset(const MachineInstr &LdSt,
                                 SmallVectorImpl<const MachineOperand *> &BaseOps,
                                 int64_t &Offset, bool &OffsetIsScalable,
+                                unsigned &Width,
                                 const TargetRegisterInfo *TRI) const final;
 
   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;
 
   bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
                                int64_t Offset1,
                                unsigned NumLoads) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -264,10 +264,40 @@
   }
 }
 
+unsigned SIInstrInfo::getDstOpSizeInBytes(const MachineInstr &LdSt,
+                                          const MachineOperand *Dst) const {
+  const MachineRegisterInfo &MRI = LdSt.getParent()->getParent()->getRegInfo();
+  const Register Reg = Dst->getReg();
+  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
+                                         ? MRI.getRegClass(Reg)
+                                         : RI.getPhysRegClass(Reg);
+  return (RI.getRegSizeInBits(*DstRC) / 8);
+}
+
+const MachineOperand *
+SIInstrInfo::getDstMemOperand(const MachineInstr &LdSt) const {
+  const MachineOperand *Dst = nullptr;
+
+  if (isMUBUF(LdSt) || isMTBUF(LdSt) || isFLAT(LdSt)) {
+    Dst = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+    if (!Dst)
+      Dst = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+  } else if (isSMRD(LdSt)) {
+    Dst = getNamedOperand(LdSt, AMDGPU::OpName::sdst);
+  } else if (isDS(LdSt)) {
+    Dst = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+  }
+
+  return Dst;
+}
+
 bool SIInstrInfo::getMemOperandsWithOffset(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-    const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
+  const MachineOperand *DstOp = getDstMemOperand(LdSt);
+  Width = DstOp ? getDstOpSizeInBytes(LdSt, DstOp) : 0;
+
   if (!LdSt.mayLoadOrStore())
     return false;
 
@@ -430,7 +460,8 @@
 
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
-                                      unsigned NumLoads) const {
+                                      unsigned NumLoads,
+                                      unsigned NumBytes) const {
   assert(!BaseOps1.empty() && !BaseOps2.empty());
   const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
   const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
@@ -2736,9 +2767,12 @@
                                                   const MachineInstr &MIb) const {
   SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
   int64_t Offset0, Offset1;
+  unsigned Dummy0, Dummy1;
   bool Offset0IsScalable, Offset1IsScalable;
-  if (!getMemOperandsWithOffset(MIa, BaseOps0, Offset0, Offset0IsScalable, &RI) ||
-      !getMemOperandsWithOffset(MIb, BaseOps1, Offset1, Offset1IsScalable, &RI))
+  if (!getMemOperandsWithOffset(MIa, BaseOps0, Offset0, Offset0IsScalable,
+                                Dummy0, &RI) ||
+      !getMemOperandsWithOffset(MIb, BaseOps1, Offset1, Offset1IsScalable,
+                                Dummy1, &RI))
     return false;
 
   if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -208,6 +208,7 @@
   getMemOperandsWithOffset(const MachineInstr &LdSt,
                            SmallVectorImpl<const MachineOperand *> &BaseOps,
                            int64_t &Offset, bool &OffsetIsScalable,
+                           unsigned &Width,
                            const TargetRegisterInfo *TRI) const override;
 
   /// Reverses the branch condition of the specified condition list,
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -2965,9 +2965,11 @@
 /// Get the base register and byte offset of a load/store instr.
 bool HexagonInstrInfo::getMemOperandsWithOffset(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   unsigned AccessSize = 0;
   OffsetIsScalable = false;
+  Width = !LdSt.memoperands_empty() ? LdSt.memoperands().front()->getSize() : 0;
   const MachineOperand *BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize);
   if (!BaseOp || !BaseOp->isReg())
     return false;
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -71,6 +71,7 @@
   getMemOperandsWithOffset(const MachineInstr &LdSt,
                            SmallVectorImpl<const MachineOperand *> &BaseOps,
                            int64_t &Offset, bool &OffsetIsScalable,
+                           unsigned &Width,
                            const TargetRegisterInfo *TRI) const override;
 
   bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -797,7 +797,8 @@
 
 bool LanaiInstrInfo::getMemOperandsWithOffset(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   switch (LdSt.getOpcode()) {
   default:
     return false;
@@ -811,7 +812,6 @@
   case Lanai::LDBs_RI:
   case Lanai::LDBz_RI:
     const MachineOperand *BaseOp;
-    unsigned Width;
     OffsetIsScalable = false;
     if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
       return false;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -321,6 +321,7 @@
   getMemOperandsWithOffset(const MachineInstr &LdSt,
                            SmallVectorImpl<const MachineOperand *> &BaseOps,
                            int64_t &Offset, bool &OffsetIsScalable,
+                           unsigned &Width,
                            const TargetRegisterInfo *TRI) const override;
 
   bool analyzeBranchPredicate(MachineBasicBlock &MBB,
                               TargetInstrInfo::MachineBranchPredicate &MBP,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3664,7 +3664,8 @@
 
 bool X86InstrInfo::getMemOperandsWithOffset(
    const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
  const MCInstrDesc &Desc = MemOp.getDesc();
   int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
   if (MemRefBegin < 0)
@@ -3696,6 +3697,8 @@
     return false;
 
   OffsetIsScalable = false;
+  Width =
+      !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
   BaseOps.push_back(BaseOp);
   return true;
 }