Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s @@ -39,6 +39,10 @@ # CHECK-NEXT: Number of physical registers: 72 # CHECK-NEXT: Total number of mappings created: 3 # CHECK-NEXT: Max number of mappings used: 3 +# CHECK-NEXT: Number of optimizable moves: 3 +# CHECK-NEXT: Number of moves eliminated: 3 (100.0%) +# CHECK-NEXT: Number of zero moves: 3 (100.0%) +# CHECK-NEXT: Max moves eliminated per cycle: 1 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s @@ -49,6 +49,10 @@ # CHECK-NEXT: Number of physical registers: 72 # CHECK-NEXT: Total number of mappings created: 0 # CHECK-NEXT: Max number of mappings used: 0 +# CHECK-NEXT: Number of optimizable moves: 21 +# CHECK-NEXT: Number of moves eliminated: 21 (100.0%) +# CHECK-NEXT: Number of zero moves: 21 (100.0%) +# CHECK-NEXT: Max moves eliminated per cycle: 2 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s @@ -44,6 +44,10 @@ # CHECK-NEXT: Number of physical registers: 72 # CHECK-NEXT: Total number of mappings created: 0 # CHECK-NEXT: Max number of mappings used: 0 +# CHECK-NEXT: Number of optimizable moves: 18 +# CHECK-NEXT: Number of moves eliminated: 18 (100.0%) +# CHECK-NEXT: Number of zero moves: 18 (100.0%) +# CHECK-NEXT: Max moves eliminated per cycle: 2 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s @@ -45,6 +45,10 @@ # CHECK-NEXT: Number of physical registers: 64 # CHECK-NEXT: Total number of mappings created: 0 # CHECK-NEXT: Max number of mappings used: 0 +# CHECK-NEXT: Number of optimizable moves: 12 +# CHECK-NEXT: Number of moves eliminated: 12 (100.0%) +# CHECK-NEXT: Number of zero moves: 12 (100.0%) +# CHECK-NEXT: Max moves eliminated per cycle: 2 # CHECK: Resources: # CHECK-NEXT: [0] - JALU0 Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s @@ -45,6 +45,10 @@ # CHECK-NEXT: Number of physical registers: 64 # CHECK-NEXT: Total number of mappings created: 0 # CHECK-NEXT: Max number of mappings used: 0 +# CHECK-NEXT: Number of optimizable moves: 12 +# CHECK-NEXT: Number of moves eliminated: 12 (100.0%) +# CHECK-NEXT: Number of zero moves: 12 (100.0%) +# CHECK-NEXT: Max moves eliminated per cycle: 2 # CHECK: Resources: # CHECK-NEXT: [0] - JALU0 Index: tools/llvm-mca/Views/RegisterFileStatistics.h =================================================================== --- tools/llvm-mca/Views/RegisterFileStatistics.h +++ tools/llvm-mca/Views/RegisterFileStatistics.h @@ -21,6 +21,10 @@ /// Number of physical registers: 72 /// Total number of mappings created: 0 /// Max number of mappings used: 0 +/// Number of optimizable moves: 200 +/// Number of moves eliminated: 200 (100.0%) +/// Number of zero moves: 200 (100.0%) +/// Max moves eliminated per cycle: 2 /// /// * Register File #2 -- IntegerPRF: /// Number of physical registers: 64 @@ -49,12 +53,25 @@ unsigned CurrentlyUsedMappings; }; + struct MoveEliminationInfo { + unsigned TotalMoveEliminationCandidates; + unsigned TotalMovesEliminated; + unsigned TotalMovesThatPropagateZero; + unsigned MaxMovesEliminatedPerCycle; + unsigned CurrentMovesEliminated; + }; + // There is one entry for each register file implemented by the processor. - llvm::SmallVector RegisterFiles; + llvm::SmallVector PRFUsage; + llvm::SmallVector MoveElimInfo; + + void updateRegisterFileUsage(ArrayRef UsedPhysRegs); + void updateMoveElimInfo(const Instruction &Inst); public: RegisterFileStatistics(const llvm::MCSubtargetInfo &sti); + void onCycleEnd() override; void onEvent(const HWInstructionEvent &Event) override; void printView(llvm::raw_ostream &OS) const override; }; Index: tools/llvm-mca/Views/RegisterFileStatistics.cpp =================================================================== --- tools/llvm-mca/Views/RegisterFileStatistics.cpp +++ tools/llvm-mca/Views/RegisterFileStatistics.cpp @@ -21,10 +21,12 @@ RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti) : STI(sti) { const MCSchedModel &SM = STI.getSchedModel(); - RegisterFileUsage Empty = {0, 0, 0}; + RegisterFileUsage RFUEmpty = {0, 0, 0}; + MoveEliminationInfo MEIEmpty = {0, 0, 0, 0, 0}; if (!SM.hasExtraProcessorInfo()) { // Assume a single register file. - RegisterFiles.emplace_back(Empty); + PRFUsage.emplace_back(RFUEmpty); + MoveElimInfo.emplace_back(MEIEmpty); return; } @@ -35,8 +37,41 @@ // be skipped. If there are no user defined register files, then reserve a // single entry for the default register file at index #0. unsigned NumRegFiles = std::max(PI.NumRegisterFiles, 1U); - RegisterFiles.resize(NumRegFiles); - std::fill(RegisterFiles.begin(), RegisterFiles.end(), Empty); + + PRFUsage.resize(NumRegFiles); + std::fill(PRFUsage.begin(), PRFUsage.end(), RFUEmpty); + + MoveElimInfo.resize(NumRegFiles); + std::fill(MoveElimInfo.begin(), MoveElimInfo.end(), MEIEmpty); +} + +void RegisterFileStatistics::updateRegisterFileUsage( + ArrayRef UsedPhysRegs) { + for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I) { + RegisterFileUsage &RFU = PRFUsage[I]; + unsigned NumUsedPhysRegs = UsedPhysRegs[I]; + RFU.CurrentlyUsedMappings += NumUsedPhysRegs; + RFU.TotalMappings += NumUsedPhysRegs; + RFU.MaxUsedMappings = + std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings); + } +} + +void RegisterFileStatistics::updateMoveElimInfo(const Instruction &Inst) { + if (Inst.isOptimizableMove()) { + assert(Inst.getDefs().size() == 1 && "Expected a single definition!"); + assert(Inst.getUses().size() == 1 && "Expected a single register use!"); + const WriteState &WS = Inst.getDefs()[0]; + const ReadState &RS = Inst.getUses()[0]; + + MoveEliminationInfo &Info = + MoveElimInfo[Inst.getDefs()[0].getRegisterFileID()]; + Info.TotalMoveEliminationCandidates++; + if (WS.isEliminated()) + Info.CurrentMovesEliminated++; + if (WS.isWriteZero() && RS.isReadZero()) + Info.TotalMovesThatPropagateZero++; + } } void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) { @@ -45,37 +80,40 @@ break; case HWInstructionEvent::Retired: { const auto &RE = static_cast(Event); - for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I) - RegisterFiles[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I]; + for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I) + PRFUsage[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I]; break; } case HWInstructionEvent::Dispatched: { const auto &DE = static_cast(Event); - for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I) { - RegisterFileUsage &RFU = RegisterFiles[I]; - unsigned NumUsedPhysRegs = DE.UsedPhysRegs[I]; - RFU.CurrentlyUsedMappings += NumUsedPhysRegs; - RFU.TotalMappings += NumUsedPhysRegs; - RFU.MaxUsedMappings = - std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings); - } + updateRegisterFileUsage(DE.UsedPhysRegs); + updateMoveElimInfo(*DE.IR.getInstruction()); } } } +void RegisterFileStatistics::onCycleEnd() { + for (MoveEliminationInfo &MEI : MoveElimInfo) { + unsigned &CurrentMax = MEI.MaxMovesEliminatedPerCycle; + CurrentMax = std::max(CurrentMax, MEI.CurrentMovesEliminated); + MEI.TotalMovesEliminated += MEI.CurrentMovesEliminated; + MEI.CurrentMovesEliminated = 0; + } +} + void RegisterFileStatistics::printView(raw_ostream &OS) const { std::string Buffer; raw_string_ostream TempStream(Buffer); TempStream << "\n\nRegister File statistics:"; - const RegisterFileUsage &GlobalUsage = RegisterFiles[0]; + const RegisterFileUsage &GlobalUsage = PRFUsage[0]; TempStream << "\nTotal number of mappings created: " << GlobalUsage.TotalMappings; TempStream << "\nMax number of mappings used: " << GlobalUsage.MaxUsedMappings << '\n'; - for (unsigned I = 1, E = RegisterFiles.size(); I < E; ++I) { - const RegisterFileUsage &RFU = RegisterFiles[I]; + for (unsigned I = 1, E = PRFUsage.size(); I < E; ++I) { + const RegisterFileUsage &RFU = PRFUsage[I]; // Obtain the register file descriptor from the scheduling model. assert(STI.getSchedModel().hasExtraProcessorInfo() && "Unable to find register file info!"); @@ -98,6 +136,27 @@ << RFU.TotalMappings; TempStream << "\n Max number of mappings used: " << RFU.MaxUsedMappings << '\n'; + const MoveEliminationInfo &MEI = MoveElimInfo[I]; + + if (MEI.TotalMoveEliminationCandidates) { + TempStream << " Number of optimizable moves: " + << MEI.TotalMoveEliminationCandidates; + double EliminatedMovProportion = (double)MEI.TotalMovesEliminated / + MEI.TotalMoveEliminationCandidates * + 100.0; + double ZeroMovProportion = (double)MEI.TotalMovesThatPropagateZero / + MEI.TotalMoveEliminationCandidates * 100.0; + TempStream << "\n Number of moves eliminated: " + << MEI.TotalMovesEliminated << " " + << format("(%.1f%%)", + floor((EliminatedMovProportion * 10) + 0.5) / 10); + TempStream << "\n Number of zero moves: " + << MEI.TotalMovesThatPropagateZero << " " + << format("(%.1f%%)", + floor((ZeroMovProportion * 10) + 0.5) / 10); + TempStream << "\n Max moves eliminated per cycle: " + << MEI.MaxMovesEliminatedPerCycle << '\n'; + } } TempStream.flush(); Index: tools/llvm-mca/include/HardwareUnits/RegisterFile.h =================================================================== --- tools/llvm-mca/include/HardwareUnits/RegisterFile.h +++ tools/llvm-mca/include/HardwareUnits/RegisterFile.h @@ -173,6 +173,10 @@ void freePhysRegs(const RegisterRenamingInfo &Entry, MutableArrayRef FreedPhysRegs); + // Collects writes that are in a RAW dependency with RS. + // This method is called from `addRegisterRead()`. + void collectWrites(const ReadState &RS, SmallVectorImpl &Writes) const; + // Create an instance of RegisterMappingTracker for every register file // specified by the processor model. // If no register file is specified, then this method creates a default @@ -189,6 +193,10 @@ // No physical regiser is allocated if this write is from a zero-idiom. void addRegisterWrite(WriteRef Write, MutableArrayRef UsedPhysRegs); + // Collect writes that are in a data dependency with RS, and update RS + // internal state. + void addRegisterRead(ReadState &RS, SmallVectorImpl &Writes); + // Removes write \param WS from the register mappings. // Physical registers may be released to reflect this update. // No registers are released if this write is from a zero-idiom. @@ -200,7 +208,7 @@ // If RS is a read from a zero register, and WS is eliminated, then // `WS.WritesZero` is also set, so that method addRegisterWrite() would not // reserve a physical register for it. - bool tryEliminateMove(WriteState &WS, const ReadState &RS); + bool tryEliminateMove(WriteState &WS, ReadState &RS); // Checks if there are enough physical registers in the register files. // Returns a "response mask" where each bit represents the response from a @@ -212,7 +220,8 @@ // Current implementation can simulate up to 32 register files (including the // special register file at index #0). unsigned isAvailable(ArrayRef Regs) const; - void collectWrites(SmallVectorImpl &Writes, unsigned RegID) const; + + // Returns the number of PRFs implemented by this processor. unsigned getNumRegisterFiles() const { return RegisterFiles.size(); } // Notify each PRF that a new cycle just started. Index: tools/llvm-mca/include/Instruction.h =================================================================== --- tools/llvm-mca/include/Instruction.h +++ tools/llvm-mca/include/Instruction.h @@ -101,6 +101,9 @@ // field RegisterID from WD. unsigned RegisterID; + // Physical register file that serves register RegisterID. + unsigned PRFID; + // True if this write implicitly clears the upper portion of RegisterID's // super-registers. bool ClearsSuperRegs; @@ -135,7 +138,7 @@ WriteState(const WriteDescriptor &Desc, unsigned RegID, bool clearsSuperRegs = false, bool writesZero = false) : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), - ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero), + PRFID(0), ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero), IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {} WriteState(const WriteState &Other) = default; @@ -144,6 +147,7 @@ int getCyclesLeft() const { return CyclesLeft; } unsigned getWriteResourceID() const { return WD->SClassOrWriteResourceID; } unsigned getRegisterID() const { return RegisterID; } + unsigned getRegisterFileID() const { return PRFID; } unsigned getLatency() const { return WD->Latency; } void addUser(ReadState *Use, int ReadAdvance); @@ -168,6 +172,8 @@ IsEliminated = true; } + void setPRF(unsigned PRF) { PRFID = PRF; } + // On every cycle, update CyclesLeft and notify dependent users. void cycleEvent(); void onInstructionIssued(); @@ -185,6 +191,8 @@ const ReadDescriptor *RD; // Physical register identified associated to this read. unsigned RegisterID; + // Physical register file that serves register RegisterID. + unsigned PRFID; // Number of writes that contribute to the definition of RegisterID. // In the absence of partial register updates, the number of DependentWrites // cannot be more than one. @@ -201,18 +209,21 @@ // This field is set to true only if there are no dependent writes, and // there are no `CyclesLeft' to wait. bool IsReady; + // True if this is a read from a known zero register. + bool IsZero; // True if this register read is from a dependency-breaking instruction. bool IndependentFromDef; public: ReadState(const ReadDescriptor &Desc, unsigned RegID) - : RD(&Desc), RegisterID(RegID), DependentWrites(0), + : RD(&Desc), RegisterID(RegID), PRFID(0), DependentWrites(0), CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true), - IndependentFromDef(false) {} + IsZero(false), IndependentFromDef(false) {} const ReadDescriptor &getDescriptor() const { return *RD; } unsigned getSchedClass() const { return RD->SchedClassID; } unsigned getRegisterID() const { return RegisterID; } + unsigned getRegisterFileID() const { return PRFID; } bool isReady() const { return IsReady; } bool isImplicitRead() const { return RD->isImplicitRead(); } @@ -226,6 +237,10 @@ DependentWrites = Writes; IsReady = !Writes; } + + bool isReadZero() const { return IsZero; } + void setReadZero() { IsZero = true; } + void setPRF(unsigned ID) { PRFID = ID; } }; /// A sequence of cycles. Index: tools/llvm-mca/include/Stages/DispatchStage.h =================================================================== --- tools/llvm-mca/include/Stages/DispatchStage.h +++ tools/llvm-mca/include/Stages/DispatchStage.h @@ -68,10 +68,6 @@ ArrayRef UsedPhysRegs, unsigned uOps) const; - void collectWrites(SmallVectorImpl &Vec, unsigned RegID) const { - return PRF.collectWrites(Vec, RegID); - } - public: DispatchStage(const MCSubtargetInfo &Subtarget, const MCRegisterInfo &MRI, unsigned MaxDispatchWidth, RetireControlUnit &R, Index: tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp =================================================================== --- tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp @@ -173,6 +173,7 @@ bool IsEliminated = WS.isEliminated(); bool ShouldAllocatePhysRegs = !IsWriteZero && !IsEliminated; const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; + WS.setPRF(RRI.IndexPlusCost.first); if (RRI.RenameAs && RRI.RenameAs != RegID) { RegID = RRI.RenameAs; @@ -217,9 +218,9 @@ RegisterMappings[*I].second.AliasRegID = 0U; } - // No physical registers are allocated for instructions that are optimized in - // hardware. For example, zero-latency data-dependency breaking instructions - // don't consume physical registers. + // No physical registers are allocated for instructions that are optimized + // in hardware. For example, zero-latency data-dependency breaking + // instructions don't consume physical registers. if (ShouldAllocatePhysRegs) allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs); } @@ -288,7 +289,7 @@ } } -bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) { +bool RegisterFile::tryEliminateMove(WriteState &WS, ReadState &RS) { const RegisterMapping &RMFrom = RegisterMappings[RS.getRegisterID()]; const RegisterMapping &RMTo = RegisterMappings[WS.getRegisterID()]; @@ -349,15 +350,18 @@ } RMT.NumMoveEliminated++; - if (IsZeroMove) + if (IsZeroMove) { WS.setWriteZero(); + RS.setReadZero(); + } WS.setEliminated(); return true; } -void RegisterFile::collectWrites(SmallVectorImpl &Writes, - unsigned RegID) const { +void RegisterFile::collectWrites(const ReadState &RS, + SmallVectorImpl &Writes) const { + unsigned RegID = RS.getRegisterID(); assert(RegID && RegID < RegisterMappings.size()); LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register " << MRI.getName(RegID) << '\n'); @@ -379,11 +383,13 @@ } // Remove duplicate entries and resize the input vector. - sort(Writes, [](const WriteRef &Lhs, const WriteRef &Rhs) { - return Lhs.getWriteState() < Rhs.getWriteState(); - }); - auto It = std::unique(Writes.begin(), Writes.end()); - Writes.resize(std::distance(Writes.begin(), It)); + if (Writes.size() > 1) { + sort(Writes, [](const WriteRef &Lhs, const WriteRef &Rhs) { + return Lhs.getWriteState() < Rhs.getWriteState(); + }); + auto It = std::unique(Writes.begin(), Writes.end()); + Writes.resize(std::distance(Writes.begin(), It)); + } LLVM_DEBUG({ for (const WriteRef &WR : Writes) { @@ -395,6 +401,20 @@ }); } +void RegisterFile::addRegisterRead(ReadState &RS, + SmallVectorImpl &DependentWrites) { + unsigned RegID = RS.getRegisterID(); + const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; + RS.setPRF(RRI.IndexPlusCost.first); + if (RS.isIndependentFromDef()) + return; + + collectWrites(RS, DependentWrites); + if (ZeroRegisters[RS.getRegisterID()]) + RS.setReadZero(); + RS.setDependentWrites(DependentWrites.size()); +} + unsigned RegisterFile::isAvailable(ArrayRef Regs) const { SmallVector NumPhysRegs(getNumRegisterFiles()); Index: tools/llvm-mca/lib/Stages/DispatchStage.cpp =================================================================== --- tools/llvm-mca/lib/Stages/DispatchStage.cpp +++ tools/llvm-mca/lib/Stages/DispatchStage.cpp @@ -67,8 +67,9 @@ const MCSubtargetInfo &STI) { SmallVector DependentWrites; - collectWrites(DependentWrites, RS.getRegisterID()); - RS.setDependentWrites(DependentWrites.size()); + // Collect all the dependent writes, and update RS internal state. + PRF.addRegisterRead(RS, DependentWrites); + // We know that this read depends on all the writes in DependentWrites. // For each write, check if we have ReadAdvance information, and use it // to figure out in how many cycles this read becomes available. @@ -116,10 +117,8 @@ // We also don't update data dependencies for instructions that have been // eliminated at register renaming stage. if (!IsEliminated) { - for (ReadState &RS : IS.getUses()) { - if (!RS.isIndependentFromDef()) - updateRAWDependencies(RS, STI); - } + for (ReadState &RS : IS.getUses()) + updateRAWDependencies(RS, STI); } // By default, a dependency-breaking zero-idiom is expected to be optimized @@ -127,8 +126,7 @@ // to the instruction. SmallVector RegisterFiles(PRF.getNumRegisterFiles()); for (WriteState &WS : IS.getDefs()) - PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), &WS), - RegisterFiles); + PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), &WS), RegisterFiles); // Reserve slots in the RCU, and notify the instruction that it has been // dispatched to the schedulers for execution.