Index: include/llvm/MC/MCInstrAnalysis.h =================================================================== --- include/llvm/MC/MCInstrAnalysis.h +++ include/llvm/MC/MCInstrAnalysis.h @@ -22,6 +22,8 @@ namespace llvm { +class MCRegisterInfo; + class MCInstrAnalysis { protected: friend class Target; @@ -60,6 +62,31 @@ return Info->get(Inst.getOpcode()).isTerminator(); } + /// Returns true if at least one of the register writes performed by + /// \param Inst implicitly clears the upper portion of all super-registers. + /// + /// Example: on X86-64, a write to EAX implicitly clears the upper half of + /// RAX. Also (still on x86) an XMM write performed by an AVX 128-bit + /// instruction implicitly clears the upper portion of the corresponding + /// YMM register. + /// + /// This method also updates an APInt which is used as a mask of register + /// writes. There is one bit for every explicit/implicit write performed by + /// the instruction. If a write implicitly clears its super-registers, then + /// the corresponding bit is set (otherwise the corresponding bit is cleared). + /// + /// The first bits in the APInt are related to explicit writes. The remaining + /// bits are related to implicit writes. The sequence of writes follows the + /// machine operand sequence. For implicit writes, the sequence is defined by + /// the MCInstrDesc. + /// + /// The assumption is that the bit-width of the APInt is correctly set by + /// the caller. The default implementation conservatively assumes that none of + /// the writes clears the upper portion of a super-register. + virtual bool clearsSuperRegisters(const MCRegisterInfo &MRI, + const MCInst &Inst, + APInt &Writes) const; + /// Given a branch instruction try to get the address the branch /// targets. Return true on success, and the address in Target. 
virtual bool Index: lib/MC/MCInstrAnalysis.cpp =================================================================== --- lib/MC/MCInstrAnalysis.cpp +++ lib/MC/MCInstrAnalysis.cpp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCInstrAnalysis.h" + +#include "llvm/ADT/APInt.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" @@ -15,6 +17,13 @@ using namespace llvm; +bool MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, + const MCInst &Inst, + APInt &Writes) const { + Writes.clearAllBits(); + return false; +} + bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const { if (Inst.getNumOperands() == 0 || Index: lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp =================================================================== --- lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -14,7 +14,9 @@ #include "X86MCTargetDesc.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "InstPrinter/X86IntelInstPrinter.h" +#include "X86BaseInfo.h" #include "X86MCAsmInfo.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/MC/MCInstrAnalysis.h" @@ -293,8 +295,79 @@ return llvm::createMCRelocationInfo(TheTriple, Ctx); } +namespace llvm { +namespace X86_MC { + +class X86MCInstrAnalysis : public MCInstrAnalysis { + X86MCInstrAnalysis(const X86MCInstrAnalysis &) = delete; + X86MCInstrAnalysis &operator=(const X86MCInstrAnalysis &) = delete; + virtual ~X86MCInstrAnalysis() = default; + +public: + X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} + + bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, + APInt &Mask) const override; +}; + +bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, + const MCInst &Inst, + APInt &Mask) const 
{ + const MCInstrDesc &Desc = Info->get(Inst.getOpcode()); + unsigned NumDefs = Desc.getNumDefs(); + unsigned NumImplicitDefs = Desc.getNumImplicitDefs(); + assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs && + "Unexpected number of bits in the mask!"); + + bool HasVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::VEX; + bool HasEVEX = (Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX; + bool HasXOP = (Desc.TSFlags & X86II::EncodingMask) == X86II::XOP; + + const MCRegisterClass &GR32RC = MRI.getRegClass(X86::GR32RegClassID); + const MCRegisterClass &VR128XRC = MRI.getRegClass(X86::VR128XRegClassID); + const MCRegisterClass &VR256XRC = MRI.getRegClass(X86::VR256XRegClassID); + + auto ClearsSuperReg = [=](unsigned RegID) { + // On X86-64, a general purpose integer register is viewed as a 64-bit + // register internal to the processor. + // An update to the lower 32 bits of a 64 bit integer register is + // architecturally defined to zero extend the upper 32 bits. + if (GR32RC.contains(RegID)) + return true; + + // Early exit if this instruction has no vex/evex/xop prefix. + if (!HasEVEX && !HasVEX && !HasXOP) + return false; + + // All VEX and EVEX encoded instructions are defined to zero the high bits + // of the destination register up to VLMAX (i.e. the maximum vector register + // width pertaining to the instruction). + // We assume the same behavior for XOP instructions too. 
+ return VR128XRC.contains(RegID) || VR256XRC.contains(RegID); + }; + + Mask.clearAllBits(); + for (unsigned I = 0, E = NumDefs; I < E; ++I) { + const MCOperand &Op = Inst.getOperand(I); + if (ClearsSuperReg(Op.getReg())) + Mask.setBit(I); + } + + for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) { + const MCPhysReg Reg = Desc.getImplicitDefs()[I]; + if (ClearsSuperReg(Reg)) + Mask.setBit(NumDefs + I); + } + + return Mask.getBoolValue(); +} + +} // end of namespace X86_MC + +} // end of namespace llvm + static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { - return new MCInstrAnalysis(Info); + return new X86_MC::X86MCInstrAnalysis(Info); } // Force static initialization. Index: test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s +++ test/tools/llvm-mca/X86/BtVer2/clear-super-register-1.s @@ -3,7 +3,7 @@ ## Sets register RAX. imulq $5, %rcx, %rax - + ## Kills the previous definition of RAX. ## The upper portion of RAX is cleared. lzcnt %ecx, %eax @@ -15,9 +15,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 400 -# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total Cycles: 704 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: IPC: 0.57 # CHECK-NEXT: Block RThroughput: 6.0 # CHECK: Instruction Info: @@ -35,17 +35,17 @@ # CHECK-NEXT: 8 5 2.00 bsfq %rax, %rcx # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0123456 +# CHECK-NEXT: 01234567 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER . . . .. imulq $5, %rcx, %rax -# CHECK-NEXT: [0,1] .DeE----R . . . .. lzcntl %ecx, %eax -# CHECK-NEXT: [0,2] .D=====eER. . . .. andq %rcx, %rax -# CHECK-NEXT: [0,3] . D=====eeeeeER. . .. bsfq %rax, %rcx -# CHECK-NEXT: [1,0] . .D======eeeeeeER .. imulq $5, %rcx, %rax -# CHECK-NEXT: [1,1] . . D=====eE-----R .. lzcntl %ecx, %eax -# CHECK-NEXT: [1,2] . . 
D===========eER .. andq %rcx, %rax -# CHECK-NEXT: [1,3] . . D===========eeeeeER bsfq %rax, %rcx +# CHECK: [0,0] DeeeeeeER . . . imulq $5, %rcx, %rax +# CHECK-NEXT: [0,1] .DeE----R . . . lzcntl %ecx, %eax +# CHECK-NEXT: [0,2] .D=eE----R. . . andq %rcx, %rax +# CHECK-NEXT: [0,3] . D=eeeeeER . . bsfq %rax, %rcx +# CHECK-NEXT: [1,0] . .D==eeeeeeER. imulq $5, %rcx, %rax +# CHECK-NEXT: [1,1] . . D=eE-----R. lzcntl %ecx, %eax +# CHECK-NEXT: [1,2] . . D==eE-----R andq %rcx, %rax +# CHECK-NEXT: [1,3] . . D==eeeeeER bsfq %rax, %rcx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -54,7 +54,7 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.0 0.5 0.0 imulq $5, %rcx, %rax -# CHECK-NEXT: 1. 2 3.5 0.5 4.5 lzcntl %ecx, %eax -# CHECK-NEXT: 2. 2 9.0 0.0 0.0 andq %rcx, %rax -# CHECK-NEXT: 3. 2 9.0 0.0 0.0 bsfq %rax, %rcx +# CHECK-NEXT: 0. 2 2.0 0.5 0.0 imulq $5, %rcx, %rax +# CHECK-NEXT: 1. 2 1.5 0.5 4.5 lzcntl %ecx, %eax +# CHECK-NEXT: 2. 2 2.5 0.0 4.5 andq %rcx, %rax +# CHECK-NEXT: 3. 2 2.5 0.0 0.0 bsfq %rax, %rcx Index: test/tools/llvm-mca/X86/BtVer2/clear-super-register-2.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/clear-super-register-2.s +++ test/tools/llvm-mca/X86/BtVer2/clear-super-register-2.s @@ -33,9 +33,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 1800 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 3811 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 0.26 +# CHECK-NEXT: IPC: 0.47 # CHECK-NEXT: Block RThroughput: 38.0 # CHECK: Instruction Info: @@ -67,27 +67,31 @@ # CHECK-NEXT: 1 1 0.50 vandps %xmm4, %xmm1, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 0123456789 01234 +# CHECK-NEXT: 0123456789 0123456789 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123456789 -# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . 
. . . . . vdivps %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: [0,1] .DeeeE----------------------------------R . . . . . . . vaddps %xmm0, %xmm1, %xmm3 -# CHECK-NEXT: [0,2] . D====================================eeeER . . . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,3] . D=====================================eeeER . . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,4] . D======================================eeeER . . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,5] . D=======================================eeeER. . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,6] . .D========================================eeeER . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,7] . . D=========================================eeeER . . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,8] . . D==========================================eeeER . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,9] . . D===========================================eeeER . . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,10] . . D============================================eeeER. . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,11] . . .D=============================================eeeER . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,12] . . . D==============================================eeeER . . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,13] . . . D===============================================eeeER . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,14] . . . D================================================eeeER . . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,15] . . . D=================================================eeeER. . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,16] . . . .D==================================================eeeER . vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: [0,17] . . . . D====================================================eER . vandps %xmm4, %xmm1, %xmm0 +# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . 
vdivps %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: [0,1] .DeeeE----------------------------------R . . . . . . . . vaddps %xmm0, %xmm1, %xmm3 +# CHECK-NEXT: [0,2] . D==eeeE--------------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,3] . D===eeeE------------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,4] . D====eeeE-----------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,5] . D=====eeeE---------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,6] . .D======eeeE--------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,7] . . D=======eeeE------------------------R . . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,8] . . D========eeeE-----------------------R. . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,9] . . D=========eeeE---------------------R. . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,10] . . D==========eeeE--------------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,11] . . .D===========eeeE------------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,12] . . . D============eeeE-----------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,13] . . . D=============eeeE---------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,14] . . . D==============eeeE--------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,15] . . . D===============eeeE------------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,16] . . . .D================eeeE-----------R . . . . . . . vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [0,17] . . . . D==================eE----------R . . . . . . . vandps %xmm4, %xmm1, %xmm0 +# CHECK-NEXT: [1,0] . . . . D====================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER. vdivps %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: [1,1] . . . . 
D=================eeeE-------------------------------------R. vaddps %xmm0, %xmm1, %xmm3 +# CHECK-NEXT: [1,2] . . . . D===================eeeE-----------------------------------R vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: [1,3] . . . . .D====================eeeE---------------------------------R vaddps %ymm3, %ymm1, %ymm4 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -96,21 +100,21 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vdivps %ymm0, %ymm1, %ymm3 -# CHECK-NEXT: 1. 1 1.0 1.0 34.0 vaddps %xmm0, %xmm1, %xmm3 -# CHECK-NEXT: 2. 1 37.0 0.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 3. 1 38.0 2.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 4. 1 39.0 4.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 5. 1 40.0 6.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 6. 1 41.0 8.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 7. 1 42.0 10.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 8. 1 43.0 12.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 9. 1 44.0 14.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 10. 1 45.0 16.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 11. 1 46.0 18.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 12. 1 47.0 20.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 13. 1 48.0 22.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 14. 1 49.0 24.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 15. 1 50.0 26.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 16. 1 51.0 28.0 0.0 vaddps %ymm3, %ymm1, %ymm4 -# CHECK-NEXT: 17. 1 53.0 0.0 0.0 vandps %xmm4, %xmm1, %xmm0 +# CHECK-NEXT: 0. 2 11.0 1.5 0.0 vdivps %ymm0, %ymm1, %ymm3 +# CHECK-NEXT: 1. 2 9.5 0.5 35.5 vaddps %xmm0, %xmm1, %xmm3 +# CHECK-NEXT: 2. 2 11.5 0.0 33.5 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 3. 2 12.5 2.0 31.5 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 4. 1 5.0 4.0 29.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 5. 
1 6.0 6.0 27.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 6. 1 7.0 7.0 26.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 7. 1 8.0 8.0 24.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 8. 1 9.0 9.0 23.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 9. 1 10.0 10.0 21.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 10. 1 11.0 11.0 20.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 11. 1 12.0 12.0 18.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 12. 1 13.0 13.0 17.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 13. 1 14.0 14.0 15.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 14. 1 15.0 15.0 14.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 15. 1 16.0 16.0 12.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 16. 1 17.0 17.0 11.0 vaddps %ymm3, %ymm1, %ymm4 +# CHECK-NEXT: 17. 1 19.0 0.0 10.0 vandps %xmm4, %xmm1, %xmm0 Index: test/tools/llvm-mca/X86/Generic/avx512-super-registers-1.s =================================================================== --- test/tools/llvm-mca/X86/Generic/avx512-super-registers-1.s +++ test/tools/llvm-mca/X86/Generic/avx512-super-registers-1.s @@ -10,9 +10,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 2103 +# CHECK-NEXT: Total Cycles: 318 # CHECK-NEXT: Dispatch Width: 4 -# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: IPC: 1.89 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -55,21 +55,21 @@ # CHECK-NEXT: - - - 1.00 - - - - vaddps %xmm4, %xmm5, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 01234 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 -# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vaddps %xmm1, %xmm1, %xmm2 -# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %xmm4, %xmm5, %xmm6 -# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . 
vmulps %xmm6, %xmm3, %xmm4 -# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %xmm4, %xmm5, %xmm0 -# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vaddps %xmm1, %xmm1, %xmm2 -# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %xmm4, %xmm5, %xmm6 -# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %xmm6, %xmm3, %xmm4 -# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %xmm4, %xmm5, %xmm0 +# CHECK: [0,0] DeeeeeER . . . .. vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: [0,1] DeeeE--R . . . .. vaddps %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: [0,2] D===eeeeeER . . .. vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %xmm4, %xmm5, %xmm6 +# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %xmm6, %xmm3, %xmm4 +# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %xmm4, %xmm5, %xmm0 +# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: [1,1] .DeeeE--------------------R vaddps %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %xmm4, %xmm5, %xmm6 +# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %xmm6, %xmm3, %xmm4 +# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %xmm4, %xmm5, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -78,9 +78,9 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vaddps %xmm1, %xmm1, %xmm2 -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %xmm4, %xmm5, %xmm6 -# CHECK-NEXT: 4. 
2 23.0 0.0 0.0 vmulps %xmm6, %xmm3, %xmm4 -# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %xmm4, %xmm5, %xmm0 +# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vaddps %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm5, %xmm6 +# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm3, %xmm4 +# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm5, %xmm0 Index: test/tools/llvm-mca/X86/Generic/avx512-super-registers-2.s =================================================================== --- test/tools/llvm-mca/X86/Generic/avx512-super-registers-2.s +++ test/tools/llvm-mca/X86/Generic/avx512-super-registers-2.s @@ -10,9 +10,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 2103 +# CHECK-NEXT: Total Cycles: 318 # CHECK-NEXT: Dispatch Width: 4 -# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: IPC: 1.89 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -55,21 +55,21 @@ # CHECK-NEXT: - - - 1.00 - - - - vaddps %xmm4, %xmm5, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 01234 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 -# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vaddps %ymm1, %ymm1, %ymm2 -# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %zmm2, %zmm3, %zmm4 -# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %xmm4, %xmm5, %xmm6 -# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %xmm6, %xmm3, %xmm4 -# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %xmm4, %xmm5, %xmm0 -# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vaddps %ymm1, %ymm1, %ymm2 -# CHECK-NEXT: [1,2] . D========================eeeeeER . . . 
vmulps %zmm2, %zmm3, %zmm4 -# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %xmm4, %xmm5, %xmm6 -# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %xmm6, %xmm3, %xmm4 -# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %xmm4, %xmm5, %xmm0 +# CHECK: [0,0] DeeeeeER . . . .. vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: [0,1] DeeeE--R . . . .. vaddps %ymm1, %ymm1, %ymm2 +# CHECK-NEXT: [0,2] D===eeeeeER . . .. vmulps %zmm2, %zmm3, %zmm4 +# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %xmm4, %xmm5, %xmm6 +# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %xmm6, %xmm3, %xmm4 +# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %xmm4, %xmm5, %xmm0 +# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: [1,1] .DeeeE--------------------R vaddps %ymm1, %ymm1, %ymm2 +# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %zmm2, %zmm3, %zmm4 +# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %xmm4, %xmm5, %xmm6 +# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %xmm6, %xmm3, %xmm4 +# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %xmm4, %xmm5, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -78,9 +78,9 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vaddps %ymm1, %ymm1, %ymm2 -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %zmm2, %zmm3, %zmm4 -# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %xmm4, %xmm5, %xmm6 -# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %xmm6, %xmm3, %xmm4 -# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %xmm4, %xmm5, %xmm0 +# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vaddps %ymm1, %ymm1, %ymm2 +# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %zmm2, %zmm3, %zmm4 +# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm5, %xmm6 +# CHECK-NEXT: 4. 
2 11.0 0.0 3.5 vmulps %xmm6, %xmm3, %xmm4 +# CHECK-NEXT: 5. 2 16.0 0.0 2.0 vaddps %xmm4, %xmm5, %xmm0 Index: test/tools/llvm-mca/X86/Generic/avx512-super-registers-3.s =================================================================== --- test/tools/llvm-mca/X86/Generic/avx512-super-registers-3.s +++ test/tools/llvm-mca/X86/Generic/avx512-super-registers-3.s @@ -10,9 +10,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 2103 +# CHECK-NEXT: Total Cycles: 318 # CHECK-NEXT: Dispatch Width: 4 -# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: IPC: 1.89 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -55,21 +55,21 @@ # CHECK-NEXT: - - - 1.00 - - - - vaddps %xmm4, %xmm20, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 01234 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 -# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vaddps %xmm16, %xmm17, %xmm2 -# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %xmm4, %xmm18, %xmm6 -# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %xmm6, %xmm19, %xmm4 -# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %xmm4, %xmm20, %xmm0 -# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vaddps %xmm16, %xmm17, %xmm2 -# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %xmm4, %xmm18, %xmm6 -# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %xmm6, %xmm19, %xmm4 -# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %xmm4, %xmm20, %xmm0 +# CHECK: [0,0] DeeeeeER . . . .. 
vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: [0,1] DeeeE--R . . . .. vaddps %xmm16, %xmm17, %xmm2 +# CHECK-NEXT: [0,2] D===eeeeeER . . .. vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %xmm4, %xmm18, %xmm6 +# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %xmm6, %xmm19, %xmm4 +# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %xmm4, %xmm20, %xmm0 +# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: [1,1] .DeeeE--------------------R vaddps %xmm16, %xmm17, %xmm2 +# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %xmm4, %xmm18, %xmm6 +# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %xmm6, %xmm19, %xmm4 +# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %xmm4, %xmm20, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -78,9 +78,9 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2 -# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vaddps %xmm16, %xmm17, %xmm2 -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %xmm4, %xmm18, %xmm6 -# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %xmm6, %xmm19, %xmm4 -# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %xmm4, %xmm20, %xmm0 +# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %zmm0, %zmm1, %zmm2 +# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vaddps %xmm16, %xmm17, %xmm2 +# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %xmm4, %xmm18, %xmm6 +# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %xmm6, %xmm19, %xmm4 +# CHECK-NEXT: 5. 
2 16.0 0.0 2.0 vaddps %xmm4, %xmm20, %xmm0 Index: test/tools/llvm-mca/X86/Generic/xop-super-registers-1.s =================================================================== --- test/tools/llvm-mca/X86/Generic/xop-super-registers-1.s +++ test/tools/llvm-mca/X86/Generic/xop-super-registers-1.s @@ -10,9 +10,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 2103 +# CHECK-NEXT: Total Cycles: 318 # CHECK-NEXT: Dispatch Width: 4 -# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: IPC: 1.89 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -55,21 +55,21 @@ # CHECK-NEXT: - - - 1.00 - - - - vaddps %ymm4, %ymm5, %ymm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 01234 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 -# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [0,1] DeeeE--R . . . . . . . . vfrczpd %xmm1, %xmm2 -# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %ymm4, %ymm5, %ymm6 -# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %ymm6, %ymm3, %ymm4 -# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %ymm4, %ymm5, %ymm0 -# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [1,1] .DeeeE----------------------R . . . . vfrczpd %xmm1, %xmm2 -# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %ymm4, %ymm5, %ymm6 -# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %ymm6, %ymm3, %ymm4 -# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %ymm4, %ymm5, %ymm0 +# CHECK: [0,0] DeeeeeER . . . .. vmulps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [0,1] DeeeE--R . . . .. vfrczpd %xmm1, %xmm2 +# CHECK-NEXT: [0,2] D===eeeeeER . . .. 
vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [0,3] D========eeeER . . .. vaddps %ymm4, %ymm5, %ymm6 +# CHECK-NEXT: [0,4] .D==========eeeeeER . .. vmulps %ymm6, %ymm3, %ymm4 +# CHECK-NEXT: [0,5] .D===============eeeER .. vaddps %ymm4, %ymm5, %ymm0 +# CHECK-NEXT: [1,0] .D==================eeeeeER vmulps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [1,1] .DeeeE--------------------R vfrczpd %xmm1, %xmm2 +# CHECK-NEXT: [1,2] . D==eeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [1,3] . D=======eeeE------------R vaddps %ymm4, %ymm5, %ymm6 +# CHECK-NEXT: [1,4] . D==========eeeeeE-------R vmulps %ymm6, %ymm3, %ymm4 +# CHECK-NEXT: [1,5] . D===============eeeE----R vaddps %ymm4, %ymm5, %ymm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -78,9 +78,9 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vfrczpd %xmm1, %xmm2 -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %ymm4, %ymm5, %ymm6 -# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %ymm6, %ymm3, %ymm4 -# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %ymm4, %ymm5, %ymm0 +# CHECK-NEXT: 0. 2 10.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1. 2 1.0 1.0 11.0 vfrczpd %xmm1, %xmm2 +# CHECK-NEXT: 2. 2 3.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: 3. 2 8.5 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6 +# CHECK-NEXT: 4. 2 11.0 0.0 3.5 vmulps %ymm6, %ymm3, %ymm4 +# CHECK-NEXT: 5. 
2 16.0 0.0 2.0 vaddps %ymm4, %ymm5, %ymm0 Index: test/tools/llvm-mca/X86/Generic/xop-super-registers-2.s =================================================================== --- test/tools/llvm-mca/X86/Generic/xop-super-registers-2.s +++ test/tools/llvm-mca/X86/Generic/xop-super-registers-2.s @@ -10,9 +10,9 @@ # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 600 -# CHECK-NEXT: Total Cycles: 2103 +# CHECK-NEXT: Total Cycles: 316 # CHECK-NEXT: Dispatch Width: 4 -# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: IPC: 1.90 # CHECK-NEXT: Block RThroughput: 3.0 # CHECK: Instruction Info: @@ -55,21 +55,21 @@ # CHECK-NEXT: - - - 1.00 - - - - vaddps %ymm4, %ymm5, %ymm0 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 0123456789 -# CHECK-NEXT: Index 0123456789 0123456789 01234 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 -# CHECK: [0,0] DeeeeeER . . . . . . . . vmulps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [0,1] DeE----R . . . . . . . . vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 -# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [0,3] D==========eeeER . . . . . . vaddps %ymm4, %ymm5, %ymm6 -# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . vmulps %ymm6, %ymm3, %ymm4 -# CHECK-NEXT: [0,5] .D=================eeeER . . . . . vaddps %ymm4, %ymm5, %ymm0 -# CHECK-NEXT: [1,0] .D====================eeeeeER . . . . vmulps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [1,1] .DeE------------------------R . . . . vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 -# CHECK-NEXT: [1,2] . D========================eeeeeER . . . vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: [1,3] . D=============================eeeER . . vaddps %ymm4, %ymm5, %ymm6 -# CHECK-NEXT: [1,4] . D================================eeeeeER . vmulps %ymm6, %ymm3, %ymm4 -# CHECK-NEXT: [1,5] . D=====================================eeeER vaddps %ymm4, %ymm5, %ymm0 +# CHECK: [0,0] DeeeeeER . . . . vmulps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [0,1] DeE----R . . . . 
vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 +# CHECK-NEXT: [0,2] D=eeeeeER . . . . vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [0,3] D======eeeER . . . vaddps %ymm4, %ymm5, %ymm6 +# CHECK-NEXT: [0,4] .D========eeeeeER . . vmulps %ymm6, %ymm3, %ymm4 +# CHECK-NEXT: [0,5] .D=============eeeER. . vaddps %ymm4, %ymm5, %ymm0 +# CHECK-NEXT: [1,0] .D================eeeeeER vmulps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [1,1] .DeE--------------------R vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 +# CHECK-NEXT: [1,2] . DeeeeeE---------------R vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: [1,3] . D=====eeeE------------R vaddps %ymm4, %ymm5, %ymm6 +# CHECK-NEXT: [1,4] . D========eeeeeE-------R vmulps %ymm6, %ymm3, %ymm4 +# CHECK-NEXT: [1,5] . D=============eeeE----R vaddps %ymm4, %ymm5, %ymm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -78,9 +78,9 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 11.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 1. 2 1.0 1.0 14.0 vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 -# CHECK-NEXT: 2. 2 15.5 0.0 0.0 vmulps %ymm2, %ymm3, %ymm4 -# CHECK-NEXT: 3. 2 20.5 0.0 0.0 vaddps %ymm4, %ymm5, %ymm6 -# CHECK-NEXT: 4. 2 23.0 0.0 0.0 vmulps %ymm6, %ymm3, %ymm4 -# CHECK-NEXT: 5. 2 28.0 0.0 0.0 vaddps %ymm4, %ymm5, %ymm0 +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 vmulps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 1. 2 1.0 1.0 12.0 vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 +# CHECK-NEXT: 2. 2 1.5 0.0 7.5 vmulps %ymm2, %ymm3, %ymm4 +# CHECK-NEXT: 3. 2 6.5 0.0 6.0 vaddps %ymm4, %ymm5, %ymm6 +# CHECK-NEXT: 4. 2 9.0 0.0 3.5 vmulps %ymm6, %ymm3, %ymm4 +# CHECK-NEXT: 5. 
2 14.0 0.0 2.0 vaddps %ymm4, %ymm5, %ymm0 Index: tools/llvm-mca/InstrBuilder.h =================================================================== --- tools/llvm-mca/InstrBuilder.h +++ tools/llvm-mca/InstrBuilder.h @@ -17,7 +17,9 @@ #include "Instruction.h" #include "Support.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" namespace mca { @@ -37,6 +39,8 @@ class InstrBuilder { const llvm::MCSubtargetInfo &STI; const llvm::MCInstrInfo &MCII; + const llvm::MCRegisterInfo &MRI; + const llvm::MCInstrAnalysis &MCIA; llvm::SmallVector ProcResourceMasks; llvm::DenseMap> Descriptors; @@ -48,8 +52,10 @@ InstrBuilder &operator=(const InstrBuilder &) = delete; public: - InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii) - : STI(sti), MCII(mcii), + InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii, + const llvm::MCRegisterInfo &mri, + const llvm::MCInstrAnalysis &mcia) + : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) { computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks); } Index: tools/llvm-mca/InstrBuilder.cpp =================================================================== --- tools/llvm-mca/InstrBuilder.cpp +++ tools/llvm-mca/InstrBuilder.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "InstrBuilder.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Debug.h" @@ -158,23 +159,6 @@ const MCInstrDesc &MCDesc, const MCSchedClassDesc &SCDesc, const MCSubtargetInfo &STI) { - // Set if writes through this opcode may update super registers. - // TODO: on x86-64, a 4 byte write of a general purpose register always - // fully updates the super-register. 
- // More in general, (at least on x86) not all register writes perform - // a partial (super-)register update. - // For example, an AVX instruction that writes on a XMM register implicitly - // zeroes the upper half of every aliasing super-register. - // - // For now, we pessimistically assume that writes are all potentially - // partial register updates. This is a good default for most targets, execept - // for those like x86 which implement a special semantic for certain opcodes. - // At least on x86, this may lead to an inaccurate prediction of the - // instruction level parallelism. - bool FullyUpdatesSuperRegisters = false; - - // Now Populate Writes. - // This algorithm currently works under the strong (and potentially incorrect) // assumption that information related to register def/uses can be obtained // from MCInstrDesc. @@ -275,7 +259,6 @@ Write.Latency = ID.MaxLatency; Write.SClassOrWriteResourceID = 0; } - Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters; Write.IsOptionalDef = false; LLVM_DEBUG({ dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", Latency=" << Write.Latency @@ -488,16 +471,35 @@ NewIS->getUses().emplace_back(llvm::make_unique(RD, RegID)); } + // Early exit if there are no writes. + if (D.Writes.empty()) + return NewIS; + + // Track register writes that implicitly clear the upper portion of the + // underlying super-registers using an APInt. + APInt WriteMask(D.Writes.size(), 0); + + // Now query the MCInstrAnalysis object to obtain information about which + // register writes implicitly clear the upper portion of a super-register. + MCIA.clearsSuperRegisters(MRI, MCI, WriteMask); + // Initialize writes. + unsigned WriteIndex = 0; for (const WriteDescriptor &WD : D.Writes) { unsigned RegID = WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg(); // Check if this is a optional definition that references NoReg. 
- if (WD.IsOptionalDef && !RegID) + if (WD.IsOptionalDef && !RegID) { + ++WriteIndex; continue; + } assert(RegID && "Expected a valid register ID!"); - NewIS->getDefs().emplace_back(llvm::make_unique(WD, RegID)); + // Bit WriteIndex of WriteMask is set iff this write clears super-registers. + bool UpdatesSuperRegisters = WriteMask[WriteIndex]; + NewIS->getDefs().emplace_back( + llvm::make_unique(WD, RegID, UpdatesSuperRegisters)); + ++WriteIndex; } return NewIS; Index: tools/llvm-mca/Instruction.h =================================================================== --- tools/llvm-mca/Instruction.h +++ tools/llvm-mca/Instruction.h @@ -70,11 +70,6 @@ // This field is set to a value different than zero only if this // is an implicit definition. unsigned RegisterID; - // True if this write generates a partial update of a super-registers. - // On X86, this flag is set by byte/word writes on GPR registers. Also, - // a write of an XMM register only partially updates the corresponding - // YMM super-register if the write is associated to a legacy SSE instruction. - bool FullyUpdatesSuperRegs; // Instruction itineraries would set this field to the SchedClass ID. // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry // element associated to this write. @@ -129,6 +124,10 @@ // field RegisterID from WD. unsigned RegisterID; + // True if this write implicitly clears the upper portion of RegisterID's + // super-registers. + bool ClearsSuperRegs; + // A list of dependent reads. Users is a set of dependent // reads. A dependent read is added to the set only if CyclesLeft // is "unknown".
As soon as CyclesLeft is 'known', each user in the set @@ -138,8 +137,10 @@ std::set> Users; public: - WriteState(const WriteDescriptor &Desc, unsigned RegID) - : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID) {} + WriteState(const WriteDescriptor &Desc, unsigned RegID, + bool clearsSuperRegs = false) + : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), + ClearsSuperRegs(clearsSuperRegs) {} WriteState(const WriteState &Other) = delete; WriteState &operator=(const WriteState &Other) = delete; @@ -148,7 +149,7 @@ unsigned getRegisterID() const { return RegisterID; } void addUser(ReadState *Use, int ReadAdvance); - bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; } + bool clearsSuperRegisters() const { return ClearsSuperRegs; } // On every cycle, update CyclesLeft and notify dependent users. void cycleEvent(); Index: tools/llvm-mca/RegisterFile.cpp =================================================================== --- tools/llvm-mca/RegisterFile.cpp +++ tools/llvm-mca/RegisterFile.cpp @@ -138,7 +138,7 @@ allocatePhysRegs(Mapping.second, UsedPhysRegs); // If this is a partial update, then we are done. 
- if (!WS.fullyUpdatesSuperRegs()) + if (!WS.clearsSuperRegisters()) return; for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) @@ -149,7 +149,7 @@ MutableArrayRef FreedPhysRegs, bool ShouldFreePhysRegs) { unsigned RegID = WS.getRegisterID(); - bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs(); + bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters(); assert(RegID != 0 && "Invalidating an already invalid register?"); assert(WS.getCyclesLeft() != -512 && Index: tools/llvm-mca/llvm-mca.cpp =================================================================== --- tools/llvm-mca/llvm-mca.cpp +++ tools/llvm-mca/llvm-mca.cpp @@ -388,6 +388,9 @@ std::unique_ptr MCII(TheTarget->createMCInstrInfo()); + std::unique_ptr MCIA( + TheTarget->createMCInstrAnalysis(MCII.get())); + if (!MCPU.compare("native")) MCPU = llvm::sys::getHostCPUName(); @@ -457,7 +460,7 @@ Width = DispatchWidth; // Create an instruction builder. - mca::InstrBuilder IB(*STI, *MCII); + mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA); // Number each region in the sequence. unsigned RegionIdx = 0;