Index: llvm/trunk/include/llvm/MC/MCInstrAnalysis.h =================================================================== --- llvm/trunk/include/llvm/MC/MCInstrAnalysis.h +++ llvm/trunk/include/llvm/MC/MCInstrAnalysis.h @@ -87,6 +87,19 @@ const MCInst &Inst, APInt &Writes) const; + /// Returns true if \p Inst is a dependency-breaking instruction for the + /// given subtarget. + /// + /// The value computed by a dependency-breaking instruction is not dependent + /// on the inputs. An example of a dependency-breaking instruction on X86 is + /// `XOR %eax, %eax`. + /// TODO: In the future, we could implement an alternative approach where this + /// method returns `true` if the input instruction is not dependent on + /// some/all of its input operands. An APInt mask could then be used to + /// identify independent operands. + virtual bool isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const; + /// Given a branch instruction try to get the address the branch /// targets. Return true on success, and the address in Target. 
virtual bool Index: llvm/trunk/lib/MC/MCInstrAnalysis.cpp =================================================================== --- llvm/trunk/lib/MC/MCInstrAnalysis.cpp +++ llvm/trunk/lib/MC/MCInstrAnalysis.cpp @@ -24,6 +24,11 @@ return false; } +bool MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const { + return false; +} + bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const { if (Inst.getNumOperands() == 0 || Index: llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp =================================================================== --- llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ llvm/trunk/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -307,10 +307,84 @@ public: X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} + bool isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const override; bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, APInt &Mask) const override; }; +bool X86MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const { + if (STI.getCPU() == "btver2") { + // Reference: Agner Fog's microarchitecture.pdf - Section 20 "AMD Bobcat and + // Jaguar pipeline", subsection 8 "Dependency-breaking instructions". 
+ switch (Inst.getOpcode()) { + default: + return false; + case X86::SUB32rr: + case X86::SUB64rr: + case X86::SBB32rr: + case X86::SBB64rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::XORPSrr: + case X86::XORPDrr: + case X86::VXORPSrr: + case X86::VXORPDrr: + case X86::ANDNPSrr: + case X86::VANDNPSrr: + case X86::ANDNPDrr: + case X86::VANDNPDrr: + case X86::PXORrr: + case X86::VPXORrr: + case X86::PANDNrr: + case X86::VPANDNrr: + case X86::PSUBBrr: + case X86::PSUBWrr: + case X86::PSUBDrr: + case X86::PSUBQrr: + case X86::VPSUBBrr: + case X86::VPSUBWrr: + case X86::VPSUBDrr: + case X86::VPSUBQrr: + case X86::PCMPEQBrr: + case X86::PCMPEQWrr: + case X86::PCMPEQDrr: + case X86::PCMPEQQrr: + case X86::VPCMPEQBrr: + case X86::VPCMPEQWrr: + case X86::VPCMPEQDrr: + case X86::VPCMPEQQrr: + case X86::PCMPGTBrr: + case X86::PCMPGTWrr: + case X86::PCMPGTDrr: + case X86::PCMPGTQrr: + case X86::VPCMPGTBrr: + case X86::VPCMPGTWrr: + case X86::VPCMPGTDrr: + case X86::VPCMPGTQrr: + case X86::MMX_PXORirr: + case X86::MMX_PANDNirr: + case X86::MMX_PSUBBirr: + case X86::MMX_PSUBDirr: + case X86::MMX_PSUBQirr: + case X86::MMX_PSUBWirr: + case X86::MMX_PCMPGTBirr: + case X86::MMX_PCMPGTDirr: + case X86::MMX_PCMPGTWirr: + case X86::MMX_PCMPEQBirr: + case X86::MMX_PCMPEQDirr: + case X86::MMX_PCMPEQWirr: + return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg(); + case X86::CMP32rr: + case X86::CMP64rr: + return Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg(); + } + } + + return false; +} + bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, APInt &Mask) const { Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s @@ -11,9 +11,9 @@ # CHECK: Iterations: 1500 # CHECK-NEXT: Instructions: 
3000 -# CHECK-NEXT: Total Cycles: 3003 +# CHECK-NEXT: Total Cycles: 1504 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: IPC: 1.99 # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Instruction Info: @@ -54,14 +54,14 @@ # CHECK-NEXT: 1.00 - - - - - - - - - - - - - cmovael %ebx, %eax # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: Index 0123456 -# CHECK: [0,0] DeER . . cmpl %eax, %eax -# CHECK-NEXT: [0,1] D=eER. . cmovael %ebx, %eax -# CHECK-NEXT: [1,0] .D=eER . cmpl %eax, %eax -# CHECK-NEXT: [1,1] .D==eER . cmovael %ebx, %eax -# CHECK-NEXT: [2,0] . D==eER. cmpl %eax, %eax -# CHECK-NEXT: [2,1] . D===eER cmovael %ebx, %eax +# CHECK: [0,0] DeER .. cmpl %eax, %eax +# CHECK-NEXT: [0,1] D=eER.. cmovael %ebx, %eax +# CHECK-NEXT: [1,0] .DeER.. cmpl %eax, %eax +# CHECK-NEXT: [1,1] .D=eER. cmovael %ebx, %eax +# CHECK-NEXT: [2,0] . DeER. cmpl %eax, %eax +# CHECK-NEXT: [2,1] . D=eER cmovael %ebx, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -70,5 +70,5 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 2.0 0.3 0.0 cmpl %eax, %eax -# CHECK-NEXT: 1. 3 3.0 0.0 0.0 cmovael %ebx, %eax +# CHECK-NEXT: 0. 3 1.0 1.0 0.0 cmpl %eax, %eax +# CHECK-NEXT: 1. 
3 2.0 0.0 0.0 cmovael %ebx, %eax Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s @@ -14,9 +14,9 @@ # CHECK: Iterations: 1500 # CHECK-NEXT: Instructions: 6000 -# CHECK-NEXT: Total Cycles: 6003 +# CHECK-NEXT: Total Cycles: 3003 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: IPC: 2.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -61,21 +61,20 @@ # CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqq %xmm3, %xmm3, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 01234 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeER . . . vpcmpeqb %xmm0, %xmm0, %xmm1 -# CHECK-NEXT: [0,1] D=eER. . . vpcmpeqw %xmm1, %xmm1, %xmm2 -# CHECK-NEXT: [0,2] .D=eER . . vpcmpeqd %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: [0,3] .D==eER . . vpcmpeqq %xmm3, %xmm3, %xmm0 -# CHECK-NEXT: [1,0] . D==eER . . vpcmpeqb %xmm0, %xmm0, %xmm1 -# CHECK-NEXT: [1,1] . D===eER . . vpcmpeqw %xmm1, %xmm1, %xmm2 -# CHECK-NEXT: [1,2] . D===eER. . vpcmpeqd %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: [1,3] . D====eER . vpcmpeqq %xmm3, %xmm3, %xmm0 -# CHECK-NEXT: [2,0] . D====eER . vpcmpeqb %xmm0, %xmm0, %xmm1 -# CHECK-NEXT: [2,1] . D=====eER . vpcmpeqw %xmm1, %xmm1, %xmm2 -# CHECK-NEXT: [2,2] . D=====eER. vpcmpeqd %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: [2,3] . D======eER vpcmpeqq %xmm3, %xmm3, %xmm0 +# CHECK: [0,0] DeER . . vpcmpeqb %xmm0, %xmm0, %xmm1 +# CHECK-NEXT: [0,1] DeER . . vpcmpeqw %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: [0,2] .DeER. . vpcmpeqd %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [0,3] .DeER. . vpcmpeqq %xmm3, %xmm3, %xmm0 +# CHECK-NEXT: [1,0] . DeER . vpcmpeqb %xmm0, %xmm0, %xmm1 +# CHECK-NEXT: [1,1] . DeER . vpcmpeqw %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: [1,2] . DeER . 
vpcmpeqd %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [1,3] . DeER . vpcmpeqq %xmm3, %xmm3, %xmm0 +# CHECK-NEXT: [2,0] . DeER. vpcmpeqb %xmm0, %xmm0, %xmm1 +# CHECK-NEXT: [2,1] . DeER. vpcmpeqw %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: [2,2] . DeER vpcmpeqd %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [2,3] . DeER vpcmpeqq %xmm3, %xmm3, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -84,7 +83,7 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 3.0 0.3 0.0 vpcmpeqb %xmm0, %xmm0, %xmm1 -# CHECK-NEXT: 1. 3 4.0 0.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2 -# CHECK-NEXT: 2. 3 4.0 0.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3 -# CHECK-NEXT: 3. 3 5.0 0.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0 +# CHECK-NEXT: 0. 3 1.0 1.0 0.0 vpcmpeqb %xmm0, %xmm0, %xmm1 +# CHECK-NEXT: 1. 3 1.0 1.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: 2. 3 1.0 1.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: 3. 3 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0 Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s @@ -13,9 +13,9 @@ # CHECK: Iterations: 1500 # CHECK-NEXT: Instructions: 4500 -# CHECK-NEXT: Total Cycles: 6745 +# CHECK-NEXT: Total Cycles: 3007 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 0.67 +# CHECK-NEXT: IPC: 1.50 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -49,27 +49,27 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 2.01 1.99 - - - - - - 1.00 - - - - - +# CHECK-NEXT: 2.00 2.00 - - - - - - 1.00 - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - 1.00 - - - - - - 1.00 - 
- - - - imull %edx, %eax -# CHECK-NEXT: 0.99 0.01 - - - - - - - - - - - - addl %edx, %edx -# CHECK-NEXT: 1.01 0.99 - - - - - - - - - - - - sbbl %eax, %eax +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %edx, %edx +# CHECK-NEXT: 2.00 - - - - - - - - - - - - - sbbl %eax, %eax # CHECK: Timeline view: -# CHECK-NEXT: 012345 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeER . . imull %edx, %eax -# CHECK-NEXT: [0,1] .DeE-R . . addl %edx, %edx -# CHECK-NEXT: [0,2] .D==eER . . sbbl %eax, %eax -# CHECK-NEXT: [1,0] . D===eeeER . imull %edx, %eax -# CHECK-NEXT: [1,1] . DeE----R . addl %edx, %edx -# CHECK-NEXT: [1,2] . D=====eER . sbbl %eax, %eax -# CHECK-NEXT: [2,0] . D=====eeeER. imull %edx, %eax -# CHECK-NEXT: [2,1] . DeE------R. addl %edx, %edx -# CHECK-NEXT: [2,2] . D=======eER sbbl %eax, %eax +# CHECK: [0,0] DeeeER .. imull %edx, %eax +# CHECK-NEXT: [0,1] .DeE-R .. addl %edx, %edx +# CHECK-NEXT: [0,2] .D=eE-R .. sbbl %eax, %eax +# CHECK-NEXT: [1,0] . D==eeeER.. imull %edx, %eax +# CHECK-NEXT: [1,1] . DeE---R.. addl %edx, %edx +# CHECK-NEXT: [1,2] . D=eE---R. sbbl %eax, %eax +# CHECK-NEXT: [2,0] . D=eeeER. imull %edx, %eax +# CHECK-NEXT: [2,1] . D=eE--R addl %edx, %edx +# CHECK-NEXT: [2,2] . D==eE-R sbbl %eax, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -78,6 +78,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 3.7 0.7 0.0 imull %edx, %eax -# CHECK-NEXT: 1. 3 1.0 1.0 3.7 addl %edx, %edx -# CHECK-NEXT: 2. 3 5.7 0.0 0.0 sbbl %eax, %eax +# CHECK-NEXT: 0. 3 2.0 0.7 0.0 imull %edx, %eax +# CHECK-NEXT: 1. 3 1.3 1.3 2.0 addl %edx, %edx +# CHECK-NEXT: 2. 
3 2.3 0.0 1.7 sbbl %eax, %eax Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/one-idioms.s @@ -1,9 +1,11 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -register-file-stats -iterations=1 < %s | FileCheck %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -register-file-stats < %s | FileCheck %s # These are dependency-breaking one-idioms. # Much like zero-idioms, but they produce ones, and do consume resources. +# perf stats reports a throughput of 2.00 IPC. + pcmpeqb %mm2, %mm2 pcmpeqd %mm2, %mm2 pcmpeqw %mm2, %mm2 @@ -25,11 +27,11 @@ # FIXME: their handling is broken in llvm-mca. -# CHECK: Iterations: 1 -# CHECK-NEXT: Instructions: 15 -# CHECK-NEXT: Total Cycles: 12 +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 1500 +# CHECK-NEXT: Total Cycles: 753 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 1.25 +# CHECK-NEXT: IPC: 1.99 # CHECK-NEXT: Block RThroughput: 7.5 # CHECK: Instruction Info: @@ -58,13 +60,13 @@ # CHECK-NEXT: 1 1 0.50 vpcmpeqw %xmm3, %xmm3, %xmm5 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 15 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings created: 1500 +# CHECK-NEXT: Max number of mappings used: 6 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 -# CHECK-NEXT: Total number of mappings created: 15 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings created: 1500 +# CHECK-NEXT: Max number of mappings used: 6 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 @@ -89,45 +91,45 @@ # CHECK: Resource pressure per 
iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - - - 7.00 8.00 - - - - 7.00 8.00 - +# CHECK-NEXT: - - - - - 7.50 7.50 - - - - 7.50 7.50 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - pcmpeqb %mm2, %mm2 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqd %mm2, %mm2 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqw %mm2, %mm2 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - pcmpeqb %xmm2, %xmm2 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - pcmpeqd %xmm2, %xmm2 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqq %xmm2, %xmm2 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqw %xmm2, %xmm2 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqb %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqd %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqw %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqb %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqd %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqb %mm2, %mm2 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqd %mm2, %mm2 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqw %mm2, %mm2 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqb %xmm2, %xmm2 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqd %xmm2, %xmm2 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqq %xmm2, %xmm2 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqw %xmm2, %xmm2 +# CHECK-NEXT: - - - - - 0.50 0.50 - - 
- - 0.50 0.50 - vpcmpeqb %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqd %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqq %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqb %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqd %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqq %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqw %xmm3, %xmm3, %xmm5 # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . .. pcmpeqb %mm2, %mm2 -# CHECK-NEXT: [0,1] D=eER. .. pcmpeqd %mm2, %mm2 -# CHECK-NEXT: [0,2] .D=eER .. pcmpeqw %mm2, %mm2 -# CHECK-NEXT: [0,3] .DeE-R .. pcmpeqb %xmm2, %xmm2 -# CHECK-NEXT: [0,4] . DeE-R .. pcmpeqd %xmm2, %xmm2 -# CHECK-NEXT: [0,5] . D=eER .. pcmpeqq %xmm2, %xmm2 -# CHECK-NEXT: [0,6] . D=eER .. pcmpeqw %xmm2, %xmm2 -# CHECK-NEXT: [0,7] . DeE-R .. vpcmpeqb %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,8] . DeE-R .. vpcmpeqd %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,9] . D=eER .. vpcmpeqq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,10] . D=eER.. vpcmpeqw %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,11] . D==eER. vpcmpeqb %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,12] . .D=eER. vpcmpeqd %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,13] . .D==eER vpcmpeqq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,14] . . D=eER vpcmpeqw %xmm3, %xmm3, %xmm5 +# CHECK: [0,0] DeER . . pcmpeqb %mm2, %mm2 +# CHECK-NEXT: [0,1] DeER . . pcmpeqd %mm2, %mm2 +# CHECK-NEXT: [0,2] .DeER. . pcmpeqw %mm2, %mm2 +# CHECK-NEXT: [0,3] .DeER. . pcmpeqb %xmm2, %xmm2 +# CHECK-NEXT: [0,4] . DeER . pcmpeqd %xmm2, %xmm2 +# CHECK-NEXT: [0,5] . DeER . pcmpeqq %xmm2, %xmm2 +# CHECK-NEXT: [0,6] . DeER . pcmpeqw %xmm2, %xmm2 +# CHECK-NEXT: [0,7] . DeER . vpcmpeqb %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,8] . 
DeER . vpcmpeqd %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,9] . DeER . vpcmpeqq %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,10] . DeER . vpcmpeqw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,11] . DeER . vpcmpeqb %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,12] . .DeER. vpcmpeqd %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,13] . .DeER. vpcmpeqq %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,14] . . DeER vpcmpeqw %xmm3, %xmm3, %xmm5 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -137,17 +139,17 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 pcmpeqb %mm2, %mm2 -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 pcmpeqd %mm2, %mm2 -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 pcmpeqw %mm2, %mm2 -# CHECK-NEXT: 3. 1 1.0 1.0 1.0 pcmpeqb %xmm2, %xmm2 -# CHECK-NEXT: 4. 1 1.0 0.0 1.0 pcmpeqd %xmm2, %xmm2 -# CHECK-NEXT: 5. 1 2.0 0.0 0.0 pcmpeqq %xmm2, %xmm2 -# CHECK-NEXT: 6. 1 2.0 0.0 0.0 pcmpeqw %xmm2, %xmm2 -# CHECK-NEXT: 7. 1 1.0 1.0 1.0 vpcmpeqb %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 8. 1 1.0 0.0 1.0 vpcmpeqd %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 9. 1 2.0 0.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 10. 1 2.0 0.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 11. 1 3.0 0.0 0.0 vpcmpeqb %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 12. 1 2.0 0.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 13. 1 3.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 14. 1 2.0 1.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 1. 1 1.0 1.0 0.0 pcmpeqd %mm2, %mm2 +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 pcmpeqw %mm2, %mm2 +# CHECK-NEXT: 3. 1 1.0 1.0 0.0 pcmpeqb %xmm2, %xmm2 +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 pcmpeqd %xmm2, %xmm2 +# CHECK-NEXT: 5. 1 1.0 1.0 0.0 pcmpeqq %xmm2, %xmm2 +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 pcmpeqw %xmm2, %xmm2 +# CHECK-NEXT: 7. 1 1.0 1.0 0.0 vpcmpeqb %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 8. 1 1.0 1.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 9. 1 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 10. 1 1.0 1.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 11. 
1 1.0 1.0 0.0 vpcmpeqb %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 12. 1 1.0 1.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 13. 1 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 14. 1 1.0 1.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm5 Index: llvm/trunk/tools/llvm-mca/DispatchStage.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/DispatchStage.cpp +++ llvm/trunk/tools/llvm-mca/DispatchStage.cpp @@ -107,17 +107,21 @@ // instruction. A dependency-breaking instruction is a zero-latency // instruction that doesn't consume hardware resources. // An example of dependency-breaking instruction on X86 is a zero-idiom XOR. - if (!Desc.isZeroLatency()) - for (std::unique_ptr &RS : IS.getUses()) + bool IsDependencyBreaking = IS.isDependencyBreaking(); + for (std::unique_ptr &RS : IS.getUses()) + if (RS->isImplicitRead() || !IsDependencyBreaking) updateRAWDependencies(*RS, STI); // By default, a dependency-breaking zero-latency instruction is expected to // be optimized at register renaming stage. That means, no physical register // is allocated to the instruction. + bool ShouldAllocateRegisters = + !(Desc.isZeroLatency() && IsDependencyBreaking); SmallVector RegisterFiles(PRF.getNumRegisterFiles()); - for (std::unique_ptr &WS : IS.getDefs()) + for (std::unique_ptr &WS : IS.getDefs()) { PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles, - !Desc.isZeroLatency()); + ShouldAllocateRegisters); + } // Reserve slots in the RCU, and notify the instruction that it has been // dispatched to the schedulers for execution. Index: llvm/trunk/tools/llvm-mca/InstrBuilder.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/InstrBuilder.cpp +++ llvm/trunk/tools/llvm-mca/InstrBuilder.cpp @@ -443,6 +443,10 @@ // register writes implicitly clear the upper portion of a super-register. MCIA.clearsSuperRegisters(MRI, MCI, WriteMask); + // Check if this is a dependency breaking instruction. 
+ if (MCIA.isDependencyBreaking(STI, MCI)) + NewIS->setDependencyBreaking(); + // Initialize writes. unsigned WriteIndex = 0; for (const WriteDescriptor &WD : D.Writes) { Index: llvm/trunk/tools/llvm-mca/Instruction.h =================================================================== --- llvm/trunk/tools/llvm-mca/Instruction.h +++ llvm/trunk/tools/llvm-mca/Instruction.h @@ -170,8 +170,6 @@ bool IsReady; public: - bool isReady() const { return IsReady; } - ReadState(const ReadDescriptor &Desc, unsigned RegID) : RD(Desc), RegisterID(RegID), DependentWrites(0), CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true) {} @@ -182,6 +180,9 @@ unsigned getSchedClass() const { return RD.SchedClassID; } unsigned getRegisterID() const { return RegisterID; } + bool isReady() const { return IsReady; } + bool isImplicitRead() const { return RD.isImplicitRead(); } + void cycleEvent(); void writeStartEvent(unsigned Cycles); void setDependentWrites(unsigned Writes) { @@ -299,6 +300,8 @@ // Retire Unit token ID for this instruction. 
unsigned RCUTokenID; + bool IsDepBreaking; + using UniqueDef = std::unique_ptr; using UniqueUse = std::unique_ptr; using VecDefs = std::vector; @@ -314,7 +317,8 @@ public: Instruction(const InstrDesc &D) - : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES) {} + : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0), + IsDepBreaking(false) {} Instruction(const Instruction &Other) = delete; Instruction &operator=(const Instruction &Other) = delete; @@ -326,6 +330,9 @@ unsigned getRCUTokenID() const { return RCUTokenID; } int getCyclesLeft() const { return CyclesLeft; } + bool isDependencyBreaking() const { return IsDepBreaking; } + void setDependencyBreaking() { IsDepBreaking = true; } + unsigned getNumUsers() const { unsigned NumUsers = 0; for (const UniqueDef &Def : Defs) Index: llvm/trunk/tools/llvm-mca/RetireStage.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/RetireStage.cpp +++ llvm/trunk/tools/llvm-mca/RetireStage.cpp @@ -45,10 +45,12 @@ void RetireStage::notifyInstructionRetired(const InstRef &IR) { LLVM_DEBUG(dbgs() << "[E] Instruction Retired: #" << IR << '\n'); SmallVector FreedRegs(PRF.getNumRegisterFiles()); - const InstrDesc &Desc = IR.getInstruction()->getDesc(); + const Instruction &Inst = *IR.getInstruction(); + const InstrDesc &Desc = Inst.getDesc(); - for (const std::unique_ptr &WS : IR.getInstruction()->getDefs()) - PRF.removeRegisterWrite(*WS.get(), FreedRegs, !Desc.isZeroLatency()); + bool ShouldFreeRegs = !(Desc.isZeroLatency() && Inst.isDependencyBreaking()); + for (const std::unique_ptr &WS : Inst.getDefs()) + PRF.removeRegisterWrite(*WS.get(), FreedRegs, ShouldFreeRegs); notifyEvent(HWInstructionRetiredEvent(IR, FreedRegs)); }