diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1254,11 +1254,10 @@
   /// It returns false if no base operands and offset was found.
   /// It is not guaranteed to always recognize base operands and offsets in all
   /// cases.
-  virtual bool
-  getMemOperandsWithOffset(const MachineInstr &MI,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const {
+  virtual bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const {
     return false;
   }

@@ -1286,9 +1285,11 @@
   /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
   /// \p NumLoads is the number of loads that will be in the cluster if this
   /// hook returns true.
+  /// \p NumBytes is the number of bytes that will be loaded from all the
+  /// clustered loads if this hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
-                                   unsigned NumLoads) const {
+                                   unsigned NumLoads, unsigned NumBytes) const {
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
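To illustrate the renamed hook's contract, here is a minimal sketch of an override for a hypothetical target; `MyTargetInstrInfo` and the operand slots are assumptions, not part of this patch, and real targets derive `Width` however their encodings allow (the in-tree overrides below do exactly that).

```cpp
// Minimal sketch, assuming a hypothetical target whose memory instructions
// keep the base register in operand 1 and an immediate offset in operand 2.
bool MyTargetInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    const TargetRegisterInfo *TRI) const {
  if (!MI.mayLoadOrStore() || MI.memoperands_empty())
    return false;
  BaseOps.push_back(&MI.getOperand(1));         // assumed base-register slot
  Offset = MI.getOperand(2).getImm();           // assumed immediate-offset slot
  OffsetIsScalable = false;                     // no vscale-scaled addressing
  Width = (*MI.memoperands_begin())->getSize(); // access size in bytes
  return true;
}
```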
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1473,10 +1473,12 @@
     SUnit *SU;
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
+    unsigned Width;

     MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
-              int64_t Offset)
-        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset) {}
+              int64_t Offset, unsigned Width)
+        : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
+          Width(Width) {}

     static bool Compare(const MachineOperand *const &A,
                         const MachineOperand *const &B) {
@@ -1565,12 +1567,14 @@
     ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
   SmallVector<MemOpInfo, 32> MemOpRecords;
   for (SUnit *SU : MemOps) {
+    const MachineInstr &MI = *SU->getInstr();
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     bool OffsetIsScalable;
-    if (TII->getMemOperandsWithOffset(*SU->getInstr(), BaseOps, Offset,
-                                      OffsetIsScalable, TRI))
-      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset));
+    unsigned Width;
+    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+                                           OffsetIsScalable, Width, TRI))
+      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
#ifndef NDEBUG
    for (auto *Op : BaseOps)
      assert(Op);
@@ -1584,16 +1588,19 @@
  // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
  // cluster mem ops collected within `MemOpRecords` array.
  unsigned ClusterLength = 1;
+  unsigned CurrentClusterBytes = MemOpRecords[0].Width;
  for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
    // Decision to cluster mem ops is taken based on target dependent logic
    auto MemOpa = MemOpRecords[Idx];
    auto MemOpb = MemOpRecords[Idx + 1];
    ++ClusterLength;
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps,
-                                  ClusterLength)) {
+    CurrentClusterBytes += MemOpb.Width;
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes)) {
      // Current mem ops pair could not be clustered, reset cluster length, and
      // go to next pair
      ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
      continue;
    }

@@ -1605,6 +1612,7 @@
    // FIXME: Is this check really required?
    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
      ClusterLength = 1;
+      CurrentClusterBytes = MemOpb.Width;
      continue;
    }
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1037,7 +1037,9 @@
    const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset,
    bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
  SmallVector<const MachineOperand *, 4> BaseOps;
-  if (!getMemOperandsWithOffset(MI, BaseOps, Offset, OffsetIsScalable, TRI) ||
+  unsigned Width;
+  if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
+                                     Width, TRI) ||
      BaseOps.size() != 1)
    return false;
  BaseOp = BaseOps.front();
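To make the new scheduler bookkeeping concrete: `CurrentClusterBytes` accumulates the widths of the ops in the running cluster and, like `ClusterLength`, resets whenever the target declines a pair or the cluster edge cannot be added. The toy program below traces that accumulation in isolation; the widths and the `ShouldCluster` policy are made-up stand-ins, not LLVM code.

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical access sizes of five same-base mem ops, in bytes.
  std::vector<unsigned> Widths = {4, 4, 4, 16, 16};
  // Stand-in for a target's shouldClusterMemOps policy.
  auto ShouldCluster = [](unsigned Length, unsigned Bytes) {
    return Length <= 4 && Bytes <= 16;
  };
  unsigned ClusterLength = 1;
  unsigned CurrentClusterBytes = Widths[0];
  for (unsigned Idx = 0, End = Widths.size(); Idx < End - 1; ++Idx) {
    ++ClusterLength;
    CurrentClusterBytes += Widths[Idx + 1];
    if (!ShouldCluster(ClusterLength, CurrentClusterBytes)) {
      // Pair rejected: restart the cluster at the second op of the pair.
      ClusterLength = 1;
      CurrentClusterBytes = Widths[Idx + 1];
      continue;
    }
    std::printf("pair %u clustered: length=%u, bytes=%u\n", Idx, ClusterLength,
                CurrentClusterBytes);
  }
  return 0;
}
```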
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -113,10 +113,10 @@
   /// Hint that pairing the given load or store is unprofitable.
   static void suppressLdStPair(MachineInstr &MI);

-  bool getMemOperandsWithOffset(
+  bool getMemOperandsWithOffsetWidth(
       const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
-      int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-      const override;
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;

   /// If \p OffsetIsScalable is set to 'true', the offset is scaled by `vscale`.
   /// This is true for some SVE instructions like ldr/str that have a
@@ -140,7 +140,7 @@

   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;

   void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, MCRegister DestReg,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2033,15 +2033,14 @@
   return true;
 }

-bool AArch64InstrInfo::getMemOperandsWithOffset(
+bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-    const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   if (!LdSt.mayLoadOrStore())
     return false;

   const MachineOperand *BaseOp;
-  unsigned Width;
   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
                                     Width, TRI))
     return false;
@@ -2513,7 +2512,8 @@
 /// Only called for LdSt for which getMemOperandWithOffset returns true.
 bool AArch64InstrInfo::shouldClusterMemOps(
     ArrayRef<const MachineOperand *> BaseOps1,
-    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads) const {
+    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+    unsigned NumBytes) const {
   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
   const MachineOperand &BaseOp1 = *BaseOps1.front();
   const MachineOperand &BaseOp2 = *BaseOps2.front();
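AArch64 only picks up the widened signature here; its clustering decision still keys off `NumLoads` and the offsets. A target that wanted to exploit the new information could bound clusters by total size, roughly as in the hypothetical fragment below (the class name and the 128-byte/4-load budgets are illustrative assumptions, not AArch64 heuristics).

```cpp
// Hypothetical override: refuse to grow a cluster past a byte budget.
bool MyTargetInstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1,
    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
    unsigned NumBytes) const {
  const unsigned MaxClusterBytes = 128; // assumed budget
  const unsigned MaxClusterLoads = 4;   // assumed length limit
  if (NumBytes > MaxClusterBytes)
    return false;
  return NumLoads <= MaxClusterLoads;
}
```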
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -146,10 +146,11 @@

         int64_t Dummy1;
         bool Dummy2;
+        unsigned Dummy3;
         SmallVector<const MachineOperand *, 4> BaseOps;
         if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
-          if (!SII->getMemOperandsWithOffset(MI, BaseOps, Dummy1, Dummy2,
-                                             TRI)) {
+          if (!SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Dummy1, Dummy2,
+                                                  Dummy3, TRI)) {
             // We failed to get the base operands, so we'll never clause this
             // instruction with any other, so pretend it's illegal.
             Type = HARDCLAUSE_ILLEGAL;
@@ -164,7 +165,7 @@
               // scheduler it limits the size of the cluster to avoid increasing
               // register pressure too much, but this pass runs after register
               // allocation so there is no need for that kind of limit.
-              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2)))) {
+              !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
           // Finish the current clause.
           Changed |= emitClause(CI, SII);
           CI = ClauseInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -181,15 +181,18 @@
                                int64_t &Offset1, int64_t &Offset2) const override;

-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const final;
+  unsigned getOperandSizeInBytes(const MachineInstr &LdSt,
+                                 const MachineOperand *MOp) const;
+
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const final;

   bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                            ArrayRef<const MachineOperand *> BaseOps2,
-                           unsigned NumLoads) const override;
+                           unsigned NumLoads, unsigned NumBytes) const override;

   bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
                                int64_t Offset1, unsigned NumLoads) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -264,16 +264,27 @@
   }
 }

-bool SIInstrInfo::getMemOperandsWithOffset(
+unsigned SIInstrInfo::getOperandSizeInBytes(const MachineInstr &LdSt,
+                                            const MachineOperand *MOp) const {
+  assert(MOp && "Unexpected null machine operand!");
+  const MachineRegisterInfo &MRI = LdSt.getParent()->getParent()->getRegInfo();
+  const Register Reg = MOp->getReg();
+  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
+                                         ? MRI.getRegClass(Reg)
+                                         : RI.getPhysRegClass(Reg);
+  return (RI.getRegSizeInBits(*DstRC) / 8);
+}
+
+bool SIInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI)
-    const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   if (!LdSt.mayLoadOrStore())
     return false;

   unsigned Opc = LdSt.getOpcode();
   OffsetIsScalable = false;
-  const MachineOperand *BaseOp, *OffsetOp;
+  const MachineOperand *BaseOp, *OffsetOp, *MOp;

   if (isDS(LdSt)) {
     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
@@ -287,6 +298,11 @@
       }
       BaseOps.push_back(BaseOp);
       Offset = OffsetOp->getImm();
+      // Get appropriate operand, and compute width accordingly.
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+      if (!MOp)
+        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data0);
+      Width = getOperandSizeInBytes(LdSt, MOp);
     } else {
       // The 2 offset instructions use offset0 and offset1 instead. We can treat
       // these as a load with a single offset if the 2 offsets are consecutive.
@@ -318,6 +334,16 @@
       BaseOps.push_back(BaseOp);
       Offset = EltSize * Offset0;
+      // Get appropriate operand(s), and compute width accordingly.
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+      if (!MOp) {
+        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data0);
+        Width = getOperandSizeInBytes(LdSt, MOp);
+        MOp = getNamedOperand(LdSt, AMDGPU::OpName::data1);
+        Width += getOperandSizeInBytes(LdSt, MOp);
+      } else {
+        Width = getOperandSizeInBytes(LdSt, MOp);
+      }
     }
     return true;
   }

@@ -342,6 +368,11 @@
     BaseOps.push_back(RSrc);
     BaseOps.push_back(SOffset);
     Offset = OffsetImm->getImm();
+    // Get appropriate operand, and compute width accordingly.
+    MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+    if (!MOp)
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+    Width = getOperandSizeInBytes(LdSt, MOp);
     return true;
   }

@@ -359,6 +390,11 @@
     Offset = OffsetImm->getImm();
     if (SOffset) // soffset can be an inline immediate.
       Offset += SOffset->getImm();
+    // Get appropriate operand, and compute width accordingly.
+    MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+    if (!MOp)
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+    Width = getOperandSizeInBytes(LdSt, MOp);
     return true;
   }

@@ -369,6 +405,9 @@
     BaseOps.push_back(BaseOp);
     OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
     Offset = OffsetOp ? OffsetOp->getImm() : 0;
+    // Get appropriate operand, and compute width accordingly.
+    MOp = getNamedOperand(LdSt, AMDGPU::OpName::sdst);
+    Width = getOperandSizeInBytes(LdSt, MOp);
     return true;
   }

@@ -381,6 +420,11 @@
     if (BaseOp)
       BaseOps.push_back(BaseOp);
     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
+    // Get appropriate operand, and compute width accordingly.
+    MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdst);
+    if (!MOp)
+      MOp = getNamedOperand(LdSt, AMDGPU::OpName::vdata);
+    Width = getOperandSizeInBytes(LdSt, MOp);
     return true;
   }

@@ -430,7 +474,8 @@

 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
-                                      unsigned NumLoads) const {
+                                      unsigned NumLoads,
+                                      unsigned NumBytes) const {
   assert(!BaseOps1.empty() && !BaseOps2.empty());
   const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
   const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
@@ -2730,9 +2775,12 @@
                                                   const MachineInstr &MIb) const {
   SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
   int64_t Offset0, Offset1;
+  unsigned Dummy0, Dummy1;
   bool Offset0IsScalable, Offset1IsScalable;
-  if (!getMemOperandsWithOffset(MIa, BaseOps0, Offset0, Offset0IsScalable, &RI) ||
-      !getMemOperandsWithOffset(MIb, BaseOps1, Offset1, Offset1IsScalable, &RI))
+  if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
+                                     Dummy0, &RI) ||
+      !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
+                                     Dummy1, &RI))
     return false;

   if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
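On AMDGPU the width is taken from the register class of the destination/data operand (`vdst`, `vdata`, `data0`/`data1`, or `sdst`) rather than from `memoperands()`. A hedged usage fragment, assuming `SII`, `MI`, and `TRI` are in scope in some AMDGPU pass: a DS_READ_B128, whose `vdst` belongs to a 128-bit register class, should come back with `Width == 16`.

```cpp
// Illustrative fragment (assumed surrounding pass, not part of the patch).
SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
bool OffsetIsScalable;
unsigned Width;
if (SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
                                       Width, TRI))
  LLVM_DEBUG(dbgs() << "mem access width: " << Width << " bytes\n");
```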
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -204,11 +204,11 @@
   bool expandPostRAPseudo(MachineInstr &MI) const override;

   /// Get the base register and byte offset of a load/store instr.
-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;

   /// Reverses the branch condition of the specified condition list,
   /// returning false on success and true if it cannot be reversed.
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -2963,12 +2963,12 @@
 }

 /// Get the base register and byte offset of a load/store instr.
-bool HexagonInstrInfo::getMemOperandsWithOffset(
+bool HexagonInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
-  unsigned AccessSize = 0;
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   OffsetIsScalable = false;
-  const MachineOperand *BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize);
+  const MachineOperand *BaseOp = getBaseAndOffset(LdSt, Offset, Width);
   if (!BaseOp || !BaseOp->isReg())
     return false;
   BaseOps.push_back(BaseOp);
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -67,11 +67,11 @@

   bool expandPostRAPseudo(MachineInstr &MI) const override;

-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;

   bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
                                     const MachineOperand *&BaseOp,
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -795,9 +795,10 @@
   return true;
 }

-bool LanaiInstrInfo::getMemOperandsWithOffset(
+bool LanaiInstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   switch (LdSt.getOpcode()) {
   default:
     return false;
@@ -811,7 +812,6 @@
   case Lanai::LDBs_RI:
   case Lanai::LDBz_RI:
     const MachineOperand *BaseOp;
-    unsigned Width;
     OffsetIsScalable = false;
     if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
       return false;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -317,11 +317,11 @@
                      SmallVectorImpl<MachineOperand> &Cond,
                      bool AllowModify) const override;

-  bool
-  getMemOperandsWithOffset(const MachineInstr &LdSt,
-                           SmallVectorImpl<const MachineOperand *> &BaseOps,
-                           int64_t &Offset, bool &OffsetIsScalable,
-                           const TargetRegisterInfo *TRI) const override;
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &LdSt,
+      SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+      bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;

   bool analyzeBranchPredicate(MachineBasicBlock &MBB,
                               TargetInstrInfo::MachineBranchPredicate &MBP,
                               bool AllowModify = false) const override;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3662,9 +3662,10 @@
   }
 }

-bool X86InstrInfo::getMemOperandsWithOffset(
+bool X86InstrInfo::getMemOperandsWithOffsetWidth(
     const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
-    int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
   const MCInstrDesc &Desc = MemOp.getDesc();
   int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
   if (MemRefBegin < 0)
@@ -3696,6 +3697,11 @@
     return false;

   OffsetIsScalable = false;
+  // FIXME: Relying on memoperands() may not be the right thing to do here.
+  // Check with X86 maintainers, and fix it accordingly. For now, it is ok,
+  // since there is no use of `Width` for the X86 back-end at the moment.
+  Width =
+      !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
   BaseOps.push_back(BaseOp);
   return true;
 }
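As the added FIXME notes, X86 falls back to `memoperands()` for the width and reports 0 when an instruction carries none; since nothing on X86 consumes `Width` yet, 0 effectively means "unknown". A small sketch of that fallback pattern in isolation (the helper name is an assumption, not part of the patch):

```cpp
// Sketch only: mirror the X86 fallback, treating 0 as "size unknown" for
// instructions whose memoperands have been dropped by earlier transforms.
static unsigned getWidthFromMemOperands(const MachineInstr &MI) {
  if (MI.memoperands_empty())
    return 0; // callers must treat 0 as "no size information"
  return MI.memoperands().front()->getSize();
}
```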