Index: include/llvm/MC/MCInstrAnalysis.h =================================================================== --- include/llvm/MC/MCInstrAnalysis.h +++ include/llvm/MC/MCInstrAnalysis.h @@ -22,6 +22,9 @@ namespace llvm { +class BitVector; +class MCRegisterInfo; + class MCInstrAnalysis { protected: friend class Target; @@ -60,6 +63,30 @@ return Info->get(Inst.getOpcode()).isTerminator(); } + /// Returns true if at least one of the register writes performed by + /// \p Inst implicitly clears the upper portion of a super-register. + /// + /// Example: on X86-64, a write to EAX implicitly clears the upper half of + /// RAX. Also (still on x86) an XMM write performed by an AVX 128-bit + /// instruction implicitly clears the upper half of the aliasing YMM register. + /// + /// This method also updates a BitVector of register writes. There is one + /// bit for every explicit/implicit write performed by the instruction. If a + /// write implicitly clears its super-registers, then the corresponding bit is + /// set (otherwise it is cleared). + /// + /// The first bits in the vector are related to explicit writes. The remaining + /// bits are related to implicit writes. The sequence of writes follows the + /// machine operand sequence. For implicit writes, the sequence is defined by + /// the MCInstrDesc. + /// + /// The assumption is that the capacity of the BitVector is correctly set by + /// the caller. The default implementation conservatively assumes that none of + /// the writes clears the upper portion of a super-register. + virtual bool clearsSuperRegisters(const MCRegisterInfo &MRI, + const MCInst &Inst, + BitVector &Writes) const; + /// Given a branch instruction try to get the address the branch /// targets. Return true on success, and the address in Target. 
virtual bool Index: lib/MC/MCInstrAnalysis.cpp =================================================================== --- lib/MC/MCInstrAnalysis.cpp +++ lib/MC/MCInstrAnalysis.cpp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCInstrAnalysis.h" + +#include "llvm/ADT/BitVector.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" @@ -15,6 +17,13 @@ using namespace llvm; +bool MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, + const MCInst &Inst, + BitVector &Writes) const { + Writes.reset(); + return false; +} + bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const { if (Inst.getNumOperands() == 0 || Index: lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp =================================================================== --- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -15,6 +15,8 @@ #include "InstPrinter/X86ATTInstPrinter.h" #include "InstPrinter/X86IntelInstPrinter.h" #include "X86MCAsmInfo.h" +#include "X86BaseInfo.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/MC/MCInstrAnalysis.h" @@ -293,8 +295,56 @@ return llvm::createMCRelocationInfo(TheTriple, Ctx); } +namespace llvm { +namespace X86_MC { + +class X86MCInstrAnalysis : public MCInstrAnalysis { + X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete; + X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete; + virtual ~X86MCInstrAnalysis() = default; + +public: + X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} + + bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, + BitVector &Writes) const override { + const MCInstrDesc &Desc = Info->get(Inst.getOpcode()); + + // AVX instructions that write to XMM registers zero out the upper 128 bits + 
// of the underlying YMM register. + bool HasVEXOrEVEX = ((Desc.TSFlags & X86II::EncodingMask) == X86II::VEX || + (Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX); + + // On X86-64, a general purpose integer register is viewed as a 64-bit + // register internal to the processor. + // An update to the lower 32 bits of a 64 bit integer register is + // architecturally defined to zero extend the upper 32 bits. + const MCRegisterClass &GR32RC = MRI.getRegClass(X86::GR32RegClassID); + const MCRegisterClass &XMMRC = MRI.getRegClass(X86::VR128RegClassID); + + unsigned NumDefs = Desc.getNumDefs(); + for (unsigned I = 0, E = NumDefs; I < E; ++I) { + const MCOperand &Op = Inst.getOperand(I); + Writes[I] = GR32RC.contains(Op.getReg()) || + (HasVEXOrEVEX && XMMRC.contains(Op.getReg())); + } + + for (unsigned I = 0, E = Desc.getNumImplicitDefs(); I < E; ++I) { + const MCPhysReg Reg = Desc.getImplicitDefs()[I]; + Writes[I + NumDefs] = + GR32RC.contains(Reg) || (HasVEXOrEVEX && XMMRC.contains(Reg)); + } + + return Writes.any(); + } +}; + +} // end of namespace X86_MC + +} // end of namespace llvm + static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { - return new MCInstrAnalysis(Info); + return new X86_MC::X86MCInstrAnalysis(Info); } // Force static initialization. Index: test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s +++ test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s @@ -3,7 +3,7 @@ ## Sets register RAX. imulq $5, %rcx, %rax - + ## Kills the previous definition of RAX. ## The upper portion of RAX is cleared. 
lzcnt %ecx, %eax @@ -15,9 +15,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total Cycles: 704 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: IPC: 0.57 # CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Instruction Info: @@ -35,17 +35,17 @@ # CHECK-NEXT: 8 5 2.00 bsfq %rax, %rcx # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0123456 +# CHECK-NEXT: 01234567 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . . .. imulq $5, %rcx, %rax -# CHECK-NEXT: [0,1] .DeE----R . . . .. lzcntl %ecx, %eax -# CHECK-NEXT: [0,2] .D=====eER. . . .. andq %rcx, %rax -# CHECK-NEXT: [0,3] . D=====eeeeeER. . .. bsfq %rax, %rcx -# CHECK-NEXT: [1,0] . .D======eeeeeeER .. imulq $5, %rcx, %rax -# CHECK-NEXT: [1,1] . . D=====eE-----R .. lzcntl %ecx, %eax -# CHECK-NEXT: [1,2] . . D===========eER .. andq %rcx, %rax -# CHECK-NEXT: [1,3] . . D===========eeeeeER bsfq %rax, %rcx +# CHECK: [0,0] DeeeeeeER . . . imulq $5, %rcx, %rax +# CHECK-NEXT: [0,1] .DeE----R . . . lzcntl %ecx, %eax +# CHECK-NEXT: [0,2] .D=eE----R. . . andq %rcx, %rax +# CHECK-NEXT: [0,3] . D=eeeeeER . . bsfq %rax, %rcx +# CHECK-NEXT: [1,0] . .D==eeeeeeER. imulq $5, %rcx, %rax +# CHECK-NEXT: [1,1] . . D=eE-----R. lzcntl %ecx, %eax +# CHECK-NEXT: [1,2] . . D==eE-----R andq %rcx, %rax +# CHECK-NEXT: [1,3] . . D==eeeeeER bsfq %rax, %rcx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -54,7 +54,7 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.0 0.5 0.0 imulq $5, %rcx, %rax -# CHECK-NEXT: 1. 2 3.5 0.5 4.5 lzcntl %ecx, %eax -# CHECK-NEXT: 2. 2 9.0 0.0 0.0 andq %rcx, %rax -# CHECK-NEXT: 3. 2 9.0 0.0 0.0 bsfq %rax, %rcx +# CHECK-NEXT: 0. 2 2.0 0.5 0.0 imulq $5, %rcx, %rax +# CHECK-NEXT: 1. 2 1.5 0.5 4.5 lzcntl %ecx, %eax +# CHECK-NEXT: 2. 2 2.5 0.0 4.5 andq %rcx, %rax +# CHECK-NEXT: 3. 
2 2.5 0.0 0.0 bsfq %rax, %rcx Index: test/tools/llvm-mca/X86/BtVer2/clear-super-register-2.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/clear-super-register-2.s +++ test/tools/llvm-mca/X86/BtVer2/clear-super-register-2.s @@ -33,9 +33,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1800 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 3811 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 0.26 +# CHECK-NEXT: IPC: 0.47 # CHECK-NEXT: Block RThroughput: 38.0 # CHECK: Instruction Info: @@ -67,27 +67,31 @@ # CHECK-NEXT: 1 1 0.50 vandps %xmm4, %xmm1, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 01234 +# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 -# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . . vdivps %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: [0,1] .DeeeE----------------------------------R . . . . . . . vaddps %xmm0, %xmm1, %xmm3 -# CHECK-NEXT: [0,2] . D====================================eeeER . . . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,3] . D=====================================eeeER . . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,4] . D======================================eeeER . . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,5] . D=======================================eeeER. . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,6] . .D========================================eeeER . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,7] . . D=========================================eeeER . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,8] . . D==========================================eeeER . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,9] . . D===========================================eeeER . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,10] . . D============================================eeeER. . . . 
vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,11] . . .D=============================================eeeER . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,12] . . . D==============================================eeeER . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,13] . . . D===============================================eeeER . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,14] . . . D================================================eeeER . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,15] . . . D=================================================eeeER. . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,16] . . . .D==================================================eeeER . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,17] . . . . D====================================================eER . vandps %xmm4, %xmm1, %xmm0 +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . vdivps %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: [0,1] .DeeeE----------------------------------R . . . . . . . . vaddps %xmm0, %xmm1, %xmm3 +# CHECK-NEXT: [0,2] . D==eeeE--------------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,3] . D===eeeE------------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,4] . D====eeeE-----------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,5] . D=====eeeE---------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,6] . .D======eeeE--------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,7] . . D=======eeeE------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,8] . . D========eeeE-----------------------R. . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,9] . . D=========eeeE---------------------R. . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,10] . . D==========eeeE--------------------R . . . . . . . 
vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,11] . . .D===========eeeE------------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,12] . . . D============eeeE-----------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,13] . . . D=============eeeE---------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,14] . . . D==============eeeE--------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,15] . . . D===============eeeE------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,16] . . . .D================eeeE-----------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,17] . . . . D==================eE----------R . . . . . . . vandps %xmm4, %xmm1, %xmm0 +# CHECK-NEXT: [1,0] . . . . D====================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER. vdivps %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: [1,1] . . . . D=================eeeE-------------------------------------R. vaddps %xmm0, %xmm1, %xmm3 +# CHECK-NEXT: [1,2] . . . . D===================eeeE-----------------------------------R vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [1,3] . . . . .D====================eeeE---------------------------------R vaddps %ymm3, %ymm1, %ymm4 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -96,21 +100,21 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vdivps %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: 1. 1 1.0 1.0 34.0 vaddps %xmm0, %xmm1, %xmm3 -# CHECK-NEXT: 2. 1 37.0 0.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 3. 1 38.0 2.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 4. 1 39.0 4.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 5. 1 40.0 6.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 6. 1 41.0 8.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 7. 1 42.0 10.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 8. 1 43.0 12.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 9. 
1 44.0 14.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 10. 1 45.0 16.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 11. 1 46.0 18.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 12. 1 47.0 20.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 13. 1 48.0 22.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 14. 1 49.0 24.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 15. 1 50.0 26.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 16. 1 51.0 28.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 17. 1 53.0 0.0 0.0 vandps %xmm4, %xmm1, %xmm0 +# CHECK-NEXT: 0. 2 11.0 1.5 0.0 vdivps %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: 1. 2 9.5 0.5 35.5 vaddps %xmm0, %xmm1, %xmm3 +# CHECK-NEXT: 2. 2 11.5 0.0 33.5 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 3. 2 12.5 2.0 31.5 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 4. 1 5.0 4.0 29.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 5. 1 6.0 6.0 27.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 6. 1 7.0 7.0 26.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 7. 1 8.0 8.0 24.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 8. 1 9.0 9.0 23.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 9. 1 10.0 10.0 21.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 10. 1 11.0 11.0 20.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 11. 1 12.0 12.0 18.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 12. 1 13.0 13.0 17.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 13. 1 14.0 14.0 15.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 14. 1 15.0 15.0 14.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 15. 1 16.0 16.0 12.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 16. 1 17.0 17.0 11.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 17. 
1 19.0 0.0 10.0 vandps %xmm4, %xmm1, %xmm0 Index: tools/llvm-mca/InstrBuilder.h =================================================================== --- tools/llvm-mca/InstrBuilder.h +++ tools/llvm-mca/InstrBuilder.h @@ -17,7 +17,9 @@ #include "Instruction.h" #include "Support.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" namespace mca { @@ -37,6 +39,8 @@ class InstrBuilder { const llvm::MCSubtargetInfo &STI; const llvm::MCInstrInfo &MCII; + const llvm::MCRegisterInfo &MRI; + const llvm::MCInstrAnalysis &MCIA; llvm::SmallVector ProcResourceMasks; llvm::DenseMap> Descriptors; @@ -48,8 +52,10 @@ InstrBuilder &operator=(const InstrBuilder &) = delete; public: - InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii) - : STI(sti), MCII(mcii), + InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii, + const llvm::MCRegisterInfo &mri, + const llvm::MCInstrAnalysis &mcia) + : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) { computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks); } Index: tools/llvm-mca/InstrBuilder.cpp =================================================================== --- tools/llvm-mca/InstrBuilder.cpp +++ tools/llvm-mca/InstrBuilder.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "InstrBuilder.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Debug.h" @@ -158,23 +159,6 @@ const MCInstrDesc &MCDesc, const MCSchedClassDesc &SCDesc, const MCSubtargetInfo &STI) { - // Set if writes through this opcode may update super registers. - // TODO: on x86-64, a 4 byte write of a general purpose register always - // fully updates the super-register. 
- // More in general, (at least on x86) not all register writes perform - // a partial (super-)register update. - // For example, an AVX instruction that writes on a XMM register implicitly - // zeroes the upper half of every aliasing super-register. - // - // For now, we pessimistically assume that writes are all potentially - // partial register updates. This is a good default for most targets, execept - // for those like x86 which implement a special semantic for certain opcodes. - // At least on x86, this may lead to an inaccurate prediction of the - // instruction level parallelism. - bool FullyUpdatesSuperRegisters = false; - - // Now Populate Writes. - // This algorithm currently works under the strong (and potentially incorrect) // assumption that information related to register def/uses can be obtained // from MCInstrDesc. @@ -275,7 +259,6 @@ Write.Latency = ID.MaxLatency; Write.SClassOrWriteResourceID = 0; } - Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters; Write.IsOptionalDef = false; LLVM_DEBUG({ dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", Latency=" << Write.Latency @@ -488,16 +471,30 @@ NewIS->getUses().emplace_back(llvm::make_unique(RD, RegID)); } + // Use a bit-vector to track register writes that implicitly clear the upper + // portion of the underlying super-registers. + // There is one bit for every (explicit or implicit) register write. + BitVector BV(D.Writes.size()); + + // Now query the MCInstrAnalysis object to obtain information about which + // register writes implicitly clear the upper portion of a super-register. + MCIA.clearsSuperRegisters(MRI, MCI, BV); + // Initialize writes. + unsigned WriteIndex = 0; for (const WriteDescriptor &WD : D.Writes) { unsigned RegID = WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg(); // Check if this is a optional definition that references NoReg. 
- if (WD.IsOptionalDef && !RegID) + if (WD.IsOptionalDef && !RegID) { + ++WriteIndex; continue; + } assert(RegID && "Expected a valid register ID!"); - NewIS->getDefs().emplace_back(llvm::make_unique(WD, RegID)); + NewIS->getDefs().emplace_back( + llvm::make_unique(WD, RegID, BV[WriteIndex])); + ++WriteIndex; } return NewIS; Index: tools/llvm-mca/Instruction.h =================================================================== --- tools/llvm-mca/Instruction.h +++ tools/llvm-mca/Instruction.h @@ -70,11 +70,6 @@ // This field is set to a value different than zero only if this // is an implicit definition. unsigned RegisterID; - // True if this write generates a partial update of a super-registers. - // On X86, this flag is set by byte/word writes on GPR registers. Also, - // a write of an XMM register only partially updates the corresponding - // YMM super-register if the write is associated to a legacy SSE instruction. - bool FullyUpdatesSuperRegs; // Instruction itineraries would set this field to the SchedClass ID. // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry // element associated to this write. @@ -129,6 +124,10 @@ // field RegisterID from WD. unsigned RegisterID; + // True if this write implicitly clears the upper portion of RegisterID's + // super-registers. + bool ClearsSuperRegs; + // A list of dependent reads. Users is a set of dependent // reads. A dependent read is added to the set only if CyclesLeft // is "unknown". 
As soon as CyclesLeft is 'known', each user in the set @@ -138,8 +137,10 @@ std::set> Users; public: - WriteState(const WriteDescriptor &Desc, unsigned RegID) - : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID) {} + WriteState(const WriteDescriptor &Desc, unsigned RegID, + bool clearsSuperRegs = false) + : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), + ClearsSuperRegs(clearsSuperRegs) {} WriteState(const WriteState &Other) = delete; WriteState &operator=(const WriteState &Other) = delete; @@ -148,7 +149,7 @@ unsigned getRegisterID() const { return RegisterID; } void addUser(ReadState *Use, int ReadAdvance); - bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; } + bool clearsSuperRegisters() const { return ClearsSuperRegs; } // On every cycle, update CyclesLeft and notify dependent users. void cycleEvent(); Index: tools/llvm-mca/RegisterFile.cpp =================================================================== --- tools/llvm-mca/RegisterFile.cpp +++ tools/llvm-mca/RegisterFile.cpp @@ -138,7 +138,7 @@ allocatePhysRegs(Mapping.second, UsedPhysRegs); // If this is a partial update, then we are done. 
- if (!WS.fullyUpdatesSuperRegs()) + if (!WS.clearsSuperRegisters()) return; for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) @@ -149,7 +149,7 @@ MutableArrayRef FreedPhysRegs, bool ShouldFreePhysRegs) { unsigned RegID = WS.getRegisterID(); - bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs(); + bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters(); assert(RegID != 0 && "Invalidating an already invalid register?"); assert(WS.getCyclesLeft() != -512 && Index: tools/llvm-mca/llvm-mca.cpp =================================================================== --- tools/llvm-mca/llvm-mca.cpp +++ tools/llvm-mca/llvm-mca.cpp @@ -381,6 +381,9 @@ std::unique_ptr MCII(TheTarget->createMCInstrInfo()); + std::unique_ptr MCIA( + TheTarget->createMCInstrAnalysis(MCII.get())); + if (!MCPU.compare("native")) MCPU = llvm::sys::getHostCPUName(); @@ -450,7 +453,7 @@ Width = DispatchWidth; // Create an instruction builder. - mca::InstrBuilder IB(*STI, *MCII); + mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA); // Number each region in the sequence. unsigned RegionIdx = 0;