Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s @@ -23,6 +23,22 @@ # CHECK-NEXT: - Register Dependencies [ 94.04% ] # CHECK-NEXT: - Memory Dependencies [ 0.00% ] +# CHECK: Critical sequence based on the simulation: + +# CHECK: Instruction Dependency Information +# CHECK-NEXT: +----< 3. addl %edx, %eax +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. addl %eax, %ebx ## REGISTER dependency: %eax +# CHECK-NEXT: +----> 1. addl %ebx, %ecx ## REGISTER dependency: %ebx +# CHECK-NEXT: +----> 2. addl %ecx, %edx ## REGISTER dependency: %ecx +# CHECK-NEXT: +----> 3. addl %edx, %eax ## REGISTER dependency: %edx +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. addl %eax, %ebx ## REGISTER dependency: %eax + # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps # CHECK-NEXT: [2]: Latency Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s @@ -22,6 +22,19 @@ # CHECK-NEXT: - Register Dependencies [ 0.00% ] # CHECK-NEXT: - Memory Dependencies [ 0.00% ] +# CHECK: Critical sequence based on the simulation: + +# CHECK: Instruction Dependency Information +# CHECK-NEXT: +----< 0. vhaddps %xmm0, %xmm0, %xmm1 +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. vhaddps %xmm0, %xmm0, %xmm1 ## RESOURCE interference: JFPA [ probability: 99% ] +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. vhaddps %xmm0, %xmm0, %xmm1 ## RESOURCE interference: JFPA [ probability: 99% ] + # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps # CHECK-NEXT: [2]: Latency Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s @@ -27,6 +27,26 @@ # CHECK-NEXT: - Register Dependencies [ 83.24% ] # CHECK-NEXT: - Memory Dependencies [ 99.89% ] +# CHECK: Critical sequence based on the simulation: + +# CHECK: Instruction Dependency Information +# CHECK-NEXT: +----< 7. vmovaps %xmm0, 48(%rdi) +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. vmovaps (%rsi), %xmm0 ## MEMORY dependency. +# CHECK-NEXT: +----> 1. vmovaps %xmm0, (%rdi) ## REGISTER dependency: %xmm0 +# CHECK-NEXT: +----> 2. vmovaps 16(%rsi), %xmm0 ## MEMORY dependency. +# CHECK-NEXT: +----> 3. vmovaps %xmm0, 16(%rdi) ## REGISTER dependency: %xmm0 +# CHECK-NEXT: +----> 4. vmovaps 32(%rsi), %xmm0 ## MEMORY dependency. +# CHECK-NEXT: +----> 5. vmovaps %xmm0, 32(%rdi) ## REGISTER dependency: %xmm0 +# CHECK-NEXT: +----> 6. vmovaps 48(%rsi), %xmm0 ## MEMORY dependency. +# CHECK-NEXT: +----> 7. vmovaps %xmm0, 48(%rdi) ## REGISTER dependency: %xmm0 +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. vmovaps (%rsi), %xmm0 ## MEMORY dependency. + # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps # CHECK-NEXT: [2]: Latency Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-4.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-4.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-4.s @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -bottleneck-analysis < %s | FileCheck %s + +vmulps %xmm0, %xmm1, %xmm2 +vhaddps %xmm2, %xmm2, %xmm3 +vhaddps %xmm3, %xmm3, %xmm4 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 300 +# CHECK-NEXT: Total Cycles: 211 +# CHECK-NEXT: Total uOps: 300 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.42 +# CHECK-NEXT: IPC: 1.42 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Cycles with backend pressure increase [ 40.76% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 39.34% ] +# CHECK-NEXT: - JFPA [ 39.34% ] +# CHECK-NEXT: - JFPU0 [ 39.34% ] +# CHECK-NEXT: Data Dependencies: [ 1.42% ] +# CHECK-NEXT: - Register Dependencies [ 1.42% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Critical sequence based on the simulation: + +# CHECK: Instruction Dependency Information +# CHECK-NEXT: +----< 2. vhaddps %xmm3, %xmm3, %xmm4 +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: | 0. vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +# CHECK-NEXT: +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1 4 1.00 vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: 1 4 1.00 vhaddps %xmm3, %xmm3, %xmm4 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 2.00 1.00 2.00 1.00 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm3, %xmm3, %xmm4 Index: llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s +++ llvm/trunk/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s @@ -0,0 +1,154 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mcpu=skylake -bottleneck-analysis < %s | FileCheck %s + +.LBB0_4: + vmovups (%rsi,%rax,2), %xmm0 + vpermilps $255, %xmm0, %xmm7 + vmulps -24(%rsp), %xmm7, %xmm8 + vpermilps $170, %xmm0, %xmm6 + vpermilps $85, %xmm0, %xmm5 + vbroadcastss %xmm0, %xmm0 + vfmadd231ps %xmm9, %xmm6, %xmm8 + vfmadd213ps %xmm8, %xmm10, %xmm5 + vfmadd213ps %xmm5, %xmm11, %xmm0 + vfmadd213ps %xmm0, %xmm12, %xmm4 + vfmadd213ps %xmm4, %xmm13, %xmm1 + vmovaps %xmm7, %xmm4 + vfmadd213ps %xmm1, %xmm14, %xmm2 + vmovaps %xmm6, %xmm1 + vfmadd213ps %xmm2, %xmm15, %xmm3 + vpermilps $170, %xmm3, %xmm0 + vmovups %xmm3, (%rdx,%rax) + vpermilps $255, %xmm3, %xmm2 + addq $16, %rax + decl %ecx + vmovaps %xmm0, %xmm3 + jne .LBB0_4 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 2200 +# CHECK-NEXT: Total Cycles: 1039 +# CHECK-NEXT: Total uOps: 2400 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 2.31 +# CHECK-NEXT: IPC: 2.12 +# CHECK-NEXT: Block RThroughput: 6.0 + +# CHECK: Cycles with backend pressure increase [ 92.69% ] +# CHECK-NEXT: Throughput Bottlenecks: +# CHECK-NEXT: Resource Pressure [ 46.78% ] +# CHECK-NEXT: - SKLPort0 [ 14.24% ] +# CHECK-NEXT: - SKLPort1 [ 14.24% ] +# CHECK-NEXT: - SKLPort5 [ 46.49% ] +# CHECK-NEXT: - SKLPort6 [ 8.66% ] +# CHECK-NEXT: Data Dependencies: [ 64.97% ] +# CHECK-NEXT: - Register Dependencies [ 64.97% ] +# CHECK-NEXT: - Memory Dependencies [ 0.00% ] + +# CHECK: Critical sequence based on the simulation: + +# CHECK: Instruction Dependency Information +# CHECK-NEXT: +----< 18. addq $16, %rax +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. vmovups (%rsi,%rax,2), %xmm0 ## REGISTER dependency: %rax +# CHECK-NEXT: | 1. vpermilps $255, %xmm0, %xmm7 +# CHECK-NEXT: | 2. vmulps -24(%rsp), %xmm7, %xmm8 +# CHECK-NEXT: +----> 3. vpermilps $170, %xmm0, %xmm6 ## REGISTER dependency: %xmm0 +# CHECK-NEXT: | 4. vpermilps $85, %xmm0, %xmm5 +# CHECK-NEXT: | 5. vbroadcastss %xmm0, %xmm0 +# CHECK-NEXT: +----> 6. vfmadd231ps %xmm9, %xmm6, %xmm8 ## REGISTER dependency: %xmm6 +# CHECK-NEXT: +----> 7. vfmadd213ps %xmm8, %xmm10, %xmm5 ## REGISTER dependency: %xmm8 +# CHECK-NEXT: +----> 8. vfmadd213ps %xmm5, %xmm11, %xmm0 ## REGISTER dependency: %xmm5 +# CHECK-NEXT: +----> 9. vfmadd213ps %xmm0, %xmm12, %xmm4 ## REGISTER dependency: %xmm0 +# CHECK-NEXT: +----> 10. vfmadd213ps %xmm4, %xmm13, %xmm1 ## REGISTER dependency: %xmm4 +# CHECK-NEXT: | 11. vmovaps %xmm7, %xmm4 +# CHECK-NEXT: +----> 12. vfmadd213ps %xmm1, %xmm14, %xmm2 ## REGISTER dependency: %xmm1 +# CHECK-NEXT: | 13. vmovaps %xmm6, %xmm1 +# CHECK-NEXT: | 14. vfmadd213ps %xmm2, %xmm15, %xmm3 +# CHECK-NEXT: | 15. vpermilps $170, %xmm3, %xmm0 +# CHECK-NEXT: | 16. vmovups %xmm3, (%rdx,%rax) +# CHECK-NEXT: | 17. vpermilps $255, %xmm3, %xmm2 +# CHECK-NEXT: | 18. addq $16, %rax +# CHECK-NEXT: | 19. decl %ecx +# CHECK-NEXT: | 20. vmovaps %xmm0, %xmm3 +# CHECK-NEXT: | 21. jne .LBB0_4 +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 2. vmulps -24(%rsp), %xmm7, %xmm8 ## RESOURCE interference: SKLPort1 [ probability: 45% ] + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 6 0.50 * vmovups (%rsi,%rax,2), %xmm0 +# CHECK-NEXT: 1 1 1.00 vpermilps $255, %xmm0, %xmm7 +# CHECK-NEXT: 2 10 0.50 * vmulps -24(%rsp), %xmm7, %xmm8 +# CHECK-NEXT: 1 1 1.00 vpermilps $170, %xmm0, %xmm6 +# CHECK-NEXT: 1 1 1.00 vpermilps $85, %xmm0, %xmm5 +# CHECK-NEXT: 1 1 1.00 vbroadcastss %xmm0, %xmm0 +# CHECK-NEXT: 1 4 0.50 vfmadd231ps %xmm9, %xmm6, %xmm8 +# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm8, %xmm10, %xmm5 +# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm5, %xmm11, %xmm0 +# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm0, %xmm12, %xmm4 +# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm4, %xmm13, %xmm1 +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm7, %xmm4 +# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm1, %xmm14, %xmm2 +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm6, %xmm1 +# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm2, %xmm15, %xmm3 +# CHECK-NEXT: 1 1 1.00 vpermilps $170, %xmm3, %xmm0 +# CHECK-NEXT: 2 1 1.00 * vmovups %xmm3, (%rdx,%rax) +# CHECK-NEXT: 1 1 1.00 vpermilps $255, %xmm3, %xmm2 +# CHECK-NEXT: 1 1 0.25 addq $16, %rax +# CHECK-NEXT: 1 1 0.25 decl %ecx +# CHECK-NEXT: 1 1 0.33 vmovaps %xmm0, %xmm3 +# CHECK-NEXT: 1 1 0.50 jne .LBB0_4 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SKLDivider +# CHECK-NEXT: [1] - SKLFPDivider +# CHECK-NEXT: [2] - SKLPort0 +# CHECK-NEXT: [3] - SKLPort1 +# CHECK-NEXT: [4] - SKLPort2 +# CHECK-NEXT: [5] - SKLPort3 +# CHECK-NEXT: [6] - SKLPort4 +# CHECK-NEXT: [7] - SKLPort5 +# CHECK-NEXT: [8] - SKLPort6 +# CHECK-NEXT: [9] - SKLPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 5.52 5.53 1.01 1.03 1.00 6.02 2.93 0.96 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - - - 0.04 0.96 - - - - vmovups (%rsi,%rax,2), %xmm0 +# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $255, %xmm0, %xmm7 +# CHECK-NEXT: - - 0.03 0.97 0.96 0.04 - - - - vmulps -24(%rsp), %xmm7, %xmm8 +# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $170, %xmm0, %xmm6 +# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $85, %xmm0, %xmm5 +# CHECK-NEXT: - - - - - - - 1.00 - - vbroadcastss %xmm0, %xmm0 +# CHECK-NEXT: - - 0.95 0.05 - - - - - - vfmadd231ps %xmm9, %xmm6, %xmm8 +# CHECK-NEXT: - - 0.50 0.50 - - - - - - vfmadd213ps %xmm8, %xmm10, %xmm5 +# CHECK-NEXT: - - 0.92 0.08 - - - - - - vfmadd213ps %xmm5, %xmm11, %xmm0 +# CHECK-NEXT: - - 0.95 0.05 - - - - - - vfmadd213ps %xmm0, %xmm12, %xmm4 +# CHECK-NEXT: - - 0.51 0.49 - - - - - - vfmadd213ps %xmm4, %xmm13, %xmm1 +# CHECK-NEXT: - - 0.52 0.48 - - - - - - vmovaps %xmm7, %xmm4 +# CHECK-NEXT: - - 0.49 0.51 - - - - - - vfmadd213ps %xmm1, %xmm14, %xmm2 +# CHECK-NEXT: - - 0.04 0.95 - - - 0.01 - - vmovaps %xmm6, %xmm1 +# CHECK-NEXT: - - 0.51 0.49 - - - - - - vfmadd213ps %xmm2, %xmm15, %xmm3 +# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $170, %xmm3, %xmm0 +# CHECK-NEXT: - - - - 0.01 0.03 1.00 - - 0.96 vmovups %xmm3, (%rdx,%rax) +# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $255, %xmm3, %xmm2 +# CHECK-NEXT: - - - - - - - - 1.00 - addq $16, %rax +# CHECK-NEXT: - - 0.04 0.01 - - - 0.01 0.94 - decl %ecx +# CHECK-NEXT: - - 0.05 0.95 - - - - - - vmovaps %xmm0, %xmm3 +# CHECK-NEXT: - - 0.01 - - - - - 0.99 - jne .LBB0_4 Index: llvm/trunk/test/tools/llvm-mca/X86/option-all-views-1.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/option-all-views-1.s +++ llvm/trunk/test/tools/llvm-mca/X86/option-all-views-1.s @@ -25,6 +25,19 @@ # FULLREPORT-NEXT: - Register Dependencies [ 76.70% ] # FULLREPORT-NEXT: - Memory Dependencies [ 0.00% ] +# FULLREPORT: Critical sequence based on the simulation: + +# FULLREPORT: Instruction Dependency Information +# FULLREPORT-NEXT: +----< 0. addl %eax, %eax +# FULLREPORT-NEXT: | +# FULLREPORT-NEXT: | < loop carried > +# FULLREPORT-NEXT: | +# FULLREPORT-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax +# FULLREPORT-NEXT: | +# FULLREPORT-NEXT: | < loop carried > +# FULLREPORT-NEXT: | +# FULLREPORT-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax + # DEFAULTREPORT: Instruction Info: # DEFAULTREPORT-NEXT: [1]: #uOps # DEFAULTREPORT-NEXT: [2]: Latency Index: llvm/trunk/test/tools/llvm-mca/X86/option-all-views-2.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/option-all-views-2.s +++ llvm/trunk/test/tools/llvm-mca/X86/option-all-views-2.s @@ -24,6 +24,19 @@ # ALL-NEXT: - Register Dependencies [ 76.70% ] # ALL-NEXT: - Memory Dependencies [ 0.00% ] +# ALL: Critical sequence based on the simulation: + +# ALL: Instruction Dependency Information +# ALL-NEXT: +----< 0. addl %eax, %eax +# ALL-NEXT: | +# ALL-NEXT: | < loop carried > +# ALL-NEXT: | +# ALL-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax +# ALL-NEXT: | +# ALL-NEXT: | < loop carried > +# ALL-NEXT: | +# ALL-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax + # ALL: Instruction Info: # ALL-NEXT: [1]: #uOps # ALL-NEXT: [2]: Latency Index: llvm/trunk/test/tools/llvm-mca/X86/option-no-stats-1.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/option-no-stats-1.s +++ llvm/trunk/test/tools/llvm-mca/X86/option-no-stats-1.s @@ -22,6 +22,19 @@ # CHECK-NEXT: - Register Dependencies [ 76.70% ] # CHECK-NEXT: - Memory Dependencies [ 0.00% ] +# CHECK: Critical sequence based on the simulation: + +# CHECK: Instruction Dependency Information +# CHECK-NEXT: +----< 0. addl %edi, %eax +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. addl %edi, %eax ## REGISTER dependency: %eax +# CHECK-NEXT: | +# CHECK-NEXT: | < loop carried > +# CHECK-NEXT: | +# CHECK-NEXT: +----> 0. addl %edi, %eax ## REGISTER dependency: %eax + # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps # CHECK-NEXT: [2]: Latency Index: llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h =================================================================== --- llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h +++ llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.h @@ -10,18 +10,71 @@ /// This file implements the bottleneck analysis view. /// /// This view internally observes backend pressure increase events in order to -/// identify potential sources of bottlenecks. +/// identify problematic data dependencies and processor resource interferences. /// -/// Example of bottleneck analysis report: +/// Example of bottleneck analysis report for a dot-product on X86 btver2: /// -/// Cycles with backend pressure increase [ 33.40% ] -/// Throughput Bottlenecks: -/// Resource Pressure [ 0.52% ] -/// - JLAGU [ 0.52% ] -/// Data Dependencies: [ 32.88% ] -/// - Register Dependencies [ 32.88% ] -/// - Memory Dependencies [ 0.00% ] +/// Cycles with backend pressure increase [ 40.76% ] +/// Throughput Bottlenecks: +/// Resource Pressure [ 39.34% ] +/// - JFPA [ 39.34% ] +/// - JFPU0 [ 39.34% ] +/// Data Dependencies: [ 1.42% ] +/// - Register Dependencies [ 1.42% ] +/// - Memory Dependencies [ 0.00% ] /// +/// According to the example, backend pressure increased during the 40.76% of +/// the simulated cycles. In particular, the major cause of backend pressure +/// increases was the contention on floating point adder JFPA accessible from +/// pipeline resource JFPU0. +/// +/// At the end of each cycle, if pressure on the simulated out-of-order buffers +/// has increased, a backend pressure event is reported. +/// In particular, this occurs when there is a delta between the number of uOps +/// dispatched and the number of uOps issued to the underlying pipelines. +/// +/// The bottleneck analysis view is also responsible for identifying and printing +/// the most "critical" sequence of dependent instructions according to the +/// simulated run. +/// +/// Below is the critical sequence computed for the dot-product example on +/// btver2: +/// +/// Instruction Dependency Information +/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4 +/// | +/// | < loop carried > +/// | +/// | 0. vmulps %xmm0, %xmm0, %xmm2 +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3 +/// | +/// | < loop carried > +/// | +/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ] +/// +/// +/// The algorithm that computes the critical sequence is very similar to a +/// critical path analysis. +/// +/// A dependency graph is used internally to track dependencies between nodes. +/// Nodes of the graph represent instructions from the input assembly sequence, +/// and edges of the graph represent data dependencies or processor resource +/// interferences. +/// +/// Edges are dynamically 'discovered' by observing instruction state transitions +/// and backend pressure increase events. Edges are internally ranked based on +/// their "criticality". A dependency is considered to be critical if it takes a +/// long time to execute, and if it contributes to backend pressure increases. +/// Criticality is internally measured in terms of cycles; it is computed for +/// every edge in the graph as a function of the edge latency and the number of +/// backend pressure increase cycles contributed by that edge. +/// +/// At the end of simulation, costs are propagated to nodes through the edges of +/// the graph, and the most expensive path connecting the root-set (a +/// set of nodes with no predecessors) to a leaf node is reported as critical +/// sequence. +// //===----------------------------------------------------------------------===// #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H @@ -108,6 +161,13 @@ return Info.ResourcePressureCycles; } + const char *resolveResourceName(uint64_t ResourceMask) const { + unsigned Index = getResourceStateIndex(ResourceMask); + unsigned ProcResID = ResIdx2ProcResID[Index]; + const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID); + return PRDesc.Name; + } + void onInstructionDispatched(unsigned IID); void onInstructionExecuted(unsigned IID); @@ -115,14 +175,13 @@ void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event); }; -// An edge of a dependency graph. -// Vertices of the graph are instructions identified by their ID. +// A dependency edge. struct DependencyEdge { enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE }; // Dependency edge descriptor. // - // It describe the dependency reason, as well as the edge cost in cycles. + // It specifies the dependency type, as well as the edge cost in cycles. struct Dependency { DependencyType Type; uint64_t ResourceOrRegID; @@ -130,14 +189,43 @@ }; Dependency Dep; - // Pair of vertices connected by this edge. unsigned FromIID; unsigned ToIID; + + // Used by the bottleneck analysis to compute the interference + // probability for processor resources. + unsigned Frequency; }; +// A dependency graph used by the bottleneck analysis to describe data +// dependencies and processor resource interferences between instructions. +// +// There is a node (an instance of struct DGNode) for every instruction in the +// input assembly sequence. Edges of the graph represent dependencies between +// instructions. +// +// Each edge of the graph is associated with a cost value which is used +// internally to rank dependency based on their impact on the runtime +// performance (see field DependencyEdge::Dependency::Cost). In general, the +// higher the cost of an edge, the higher the impact on performance. +// +// The cost of a dependency is a function of both the latency and the number of +// cycles where the dependency has been seen as critical (i.e. contributing to +// back-pressure increases). +// +// Loop carried dependencies are carefully expanded by the bottleneck analysis +// to guarantee that the graph stays acyclic. To this end, extra nodes are +// pre-allocated at construction time to describe instructions from "past and +// future" iterations. The graph is kept acyclic mainly because it simplifies the +// complexity of the algorithm that computes the critical sequence. class DependencyGraph { struct DGNode { unsigned NumPredecessors; + unsigned NumVisitedPredecessors; + uint64_t Cost; + unsigned Depth; + + DependencyEdge CriticalPredecessor; SmallVector OutgoingEdges; }; SmallVector Nodes; @@ -148,6 +236,9 @@ void addDependency(unsigned From, unsigned To, DependencyEdge::Dependency &&DE); + void initializeRootSet(SmallVectorImpl &RootSet) const; + void propagateThroughEdges(SmallVectorImpl &RootSet); + #ifndef NDEBUG void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE, MCInstPrinter &MCIP) const; @@ -170,6 +261,19 @@ addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost}); } + // Called by the bottleneck analysis at the end of simulation to propagate + // costs through the edges of the graph, and compute a critical path. + void finalizeGraph() { + SmallVector RootSet; + initializeRootSet(RootSet); + propagateThroughEdges(RootSet); + } + + // Returns a sequence of edges representing the critical sequence based on the + // simulated run. It assumes that the graph has already been finalized (i.e. + // method `finalizeGraph()` has already been called on this graph). + void getCriticalSequence(SmallVectorImpl &Seq) const; + #ifndef NDEBUG void dump(raw_ostream &OS, MCInstPrinter &MCIP) const; #endif @@ -178,6 +282,7 @@ /// A view that collects and prints a few performance numbers. class BottleneckAnalysis : public View { const MCSubtargetInfo &STI; + MCInstPrinter &MCIP; PressureTracker Tracker; DependencyGraph DG; @@ -212,6 +317,7 @@ // Prints a bottleneck message to OS. void printBottleneckHints(raw_ostream &OS) const; + void printCriticalSequence(raw_ostream &OS) const; public: BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP, Index: llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp +++ llvm/trunk/tools/llvm-mca/Views/BottleneckAnalysis.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MCA/Support.h" #include "llvm/Support/Format.h" +#include "llvm/Support/FormattedStream.h" namespace llvm { namespace mca { @@ -161,12 +162,219 @@ OS << " - MEMORY"; } else { assert(DE.Type == DependencyEdge::DT_RESOURCE && - "Unexpected unsupported dependency type!"); + "Unsupported dependency type!"); OS << " - RESOURCE MASK: " << DE.ResourceOrRegID; } OS << " - CYCLES: " << DE.Cost << '\n'; } +#endif // NDEBUG + +void DependencyGraph::initializeRootSet( + SmallVectorImpl &RootSet) const { + for (unsigned I = 0, E = Nodes.size(); I < E; ++I) { + const DGNode &N = Nodes[I]; + if (N.NumPredecessors == 0 && !N.OutgoingEdges.empty()) + RootSet.emplace_back(I); + } +} + +void DependencyGraph::propagateThroughEdges( + SmallVectorImpl &RootSet) { + SmallVector ToVisit; + + // A critical sequence is computed as the longest path from a node of the + // RootSet to a leaf node (i.e. a node with no successors). The RootSet is + // composed of nodes with at least one successor, and no predecessors. + // + // Each node of the graph starts with an initial default cost of zero. The + // cost of a node is a measure of criticality: the higher the cost, the bigger + // is the performance impact. + // + // This algorithm is very similar to a (reverse) Dijkstra. Every iteration of + // the inner loop selects (i.e. visits) a node N from a set of `unvisited + // nodes`, and then propagates the cost of N to all its neighbors. + // + // The `unvisited nodes` set initially contains all the nodes from the + // RootSet. A node N is added to the `unvisited nodes` if all its + // predecessors have been visited already. + // + // For simplicity, every node tracks the number of unvisited incoming edges in + // field `NumVisitedPredecessors`. When the value of that field drops to + // zero, then the corresponding node is added to a `ToVisit` set. + // + // At the end of every iteration of the outer loop, set `ToVisit` becomes our + // new `unvisited nodes` set. + // + // The algorithm terminates when the set of unvisited nodes (i.e. our RootSet) + // is empty. This algorithm works under the assumption that the graph is + // acyclic. + do { + for (unsigned IID : RootSet) { + const DGNode &N = Nodes[IID]; + for (const DependencyEdge &DepEdge : N.OutgoingEdges) { + unsigned ToIID = DepEdge.ToIID; + DGNode &To = Nodes[ToIID]; + uint64_t Cost = N.Cost + DepEdge.Dep.Cost; + // Check if this is the most expensive incoming edge seen so far. In + // case, update the total cost of the destination node (ToIID), as well + // its field `CriticalPredecessor`. + if (Cost > To.Cost) { + To.CriticalPredecessor = DepEdge; + To.Cost = Cost; + To.Depth = N.Depth + 1; + } + To.NumVisitedPredecessors++; + if (To.NumVisitedPredecessors == To.NumPredecessors) + ToVisit.emplace_back(ToIID); + } + } + + std::swap(RootSet, ToVisit); + ToVisit.clear(); + } while (!RootSet.empty()); +} + +void DependencyGraph::getCriticalSequence( + SmallVectorImpl &Seq) const { + // At this stage, nodes of the graph have been already visited, and costs have + // been propagated through the edges (see method `propagateThroughEdges()`). + + // Identify the node N with the highest cost in the graph. By construction, + // that node is the last instruction of our critical sequence. + // Field N.Depth would tell us the total length of the sequence. + // + // To obtain the sequence of critical edges, we simply follow the chain of critical + // predecessors starting from node N (field DGNode::CriticalPredecessor). + const auto It = std::max_element( + Nodes.begin(), Nodes.end(), + [](const DGNode &Lhs, const DGNode &Rhs) { return Lhs.Cost < Rhs.Cost; }); + unsigned IID = std::distance(Nodes.begin(), It); + Seq.resize(Nodes[IID].Depth); + for (unsigned I = Seq.size(), E = 0; I > E; --I) { + const DGNode &N = Nodes[IID]; + Seq[I - 1] = &N.CriticalPredecessor; + IID = N.CriticalPredecessor.FromIID; + } +} + +static void printInstruction(formatted_raw_ostream &FOS, + const MCSubtargetInfo &STI, MCInstPrinter &MCIP, + const MCInst &MCI, + bool UseDifferentColor = false) { + std::string Instruction; + raw_string_ostream InstrStream(Instruction); + + FOS.PadToColumn(14); + + MCIP.printInst(&MCI, InstrStream, "", STI); + InstrStream.flush(); + + if (UseDifferentColor) + FOS.changeColor(raw_ostream::CYAN, true, false); + FOS << StringRef(Instruction).ltrim(); + if (UseDifferentColor) + FOS.resetColor(); +} + +void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const { + SmallVector Seq; + DG.getCriticalSequence(Seq); + if (Seq.empty()) + return; + + OS << "\nCritical sequence based on the simulation:\n\n"; + + const DependencyEdge &FirstEdge = *Seq[0]; + unsigned FromIID = FirstEdge.FromIID % Source.size(); + unsigned ToIID = FirstEdge.ToIID % Source.size(); + bool IsLoopCarried = FromIID >= ToIID; + + formatted_raw_ostream FOS(OS); + FOS.PadToColumn(14); + FOS << "Instruction"; + FOS.PadToColumn(58); + FOS << "Dependency Information"; + + bool HasColors = FOS.has_colors(); + + unsigned CurrentIID = 0; + if (IsLoopCarried) { + FOS << "\n +----< " << FromIID << "."; + printInstruction(FOS, STI, MCIP, Source[FromIID], HasColors); + FOS << "\n |\n | < loop carried > \n |"; + } else { + while (CurrentIID < FromIID) { + FOS << "\n " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID]); + CurrentIID++; + } + FOS << "\n +----< " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors); + CurrentIID++; + } + + for (const DependencyEdge *&DE : Seq) { + ToIID = DE->ToIID % Source.size(); + unsigned LastIID = CurrentIID > ToIID ? Source.size() : ToIID; + + while (CurrentIID < LastIID) { + FOS << "\n | " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID]); + CurrentIID++; + } + + if (CurrentIID == ToIID) { + FOS << "\n +----> " << ToIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors); + } else { + FOS << "\n |\n | < loop carried > \n |" + << "\n +----> " << ToIID << "."; + printInstruction(FOS, STI, MCIP, Source[ToIID], HasColors); + } + FOS.PadToColumn(58); + + const DependencyEdge::Dependency &Dep = DE->Dep; + if (HasColors) + FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false); + + if (Dep.Type == DependencyEdge::DT_REGISTER) { + FOS << "## REGISTER dependency: "; + if (HasColors) + FOS.changeColor(raw_ostream::MAGENTA, true, false); + MCIP.printRegName(FOS, Dep.ResourceOrRegID); + } else if (Dep.Type == DependencyEdge::DT_MEMORY) { + FOS << "## MEMORY dependency."; + } else { + assert(Dep.Type == DependencyEdge::DT_RESOURCE && + "Unsupported dependency type!"); + FOS << "## RESOURCE interference: "; + if (HasColors) + FOS.changeColor(raw_ostream::MAGENTA, true, false); + FOS << Tracker.resolveResourceName(Dep.ResourceOrRegID); + if (HasColors) { + FOS.resetColor(); + FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false); + } + FOS << " [ probability: " << ((DE->Frequency * 100) / Iterations) + << "% ]"; + } + if (HasColors) + FOS.resetColor(); + ++CurrentIID; + } + + while (CurrentIID < Source.size()) { + FOS << "\n " << CurrentIID << "."; + printInstruction(FOS, STI, MCIP, Source[CurrentIID]); + CurrentIID++; + } + + FOS << '\n'; + FOS.flush(); +} + +#ifndef NDEBUG void DependencyGraph::dump(raw_ostream &OS, MCInstPrinter &MCIP) const { OS << "\nREG DEPS\n"; for (const DGNode &Node : Nodes) @@ -200,10 +408,11 @@ if (It != Vec.end()) { It->Dep.Cost += Dep.Cost; + It->Frequency++; return; } - DependencyEdge DE = {Dep, From, To}; + DependencyEdge DE = {Dep, From, To, 1}; Vec.emplace_back(DE); NodeTo.NumPredecessors++; } @@ -211,7 +420,7 @@ BottleneckAnalysis::BottleneckAnalysis(const MCSubtargetInfo &sti, MCInstPrinter &Printer, ArrayRef S, unsigned NumIter) - : STI(sti), Tracker(STI.getSchedModel()), DG(S.size() * 3), + : STI(sti), MCIP(Printer), Tracker(STI.getSchedModel()), DG(S.size() * 3), Source(S), Iterations(NumIter), TotalCycles(0), PressureIncreasedBecauseOfResources(false), PressureIncreasedBecauseOfRegisterDependencies(false), @@ -274,38 +483,38 @@ const Instruction &IS = *Event.IR.getInstruction(); unsigned To = IID % Source.size(); - unsigned Cycles = Tracker.getResourcePressureCycles(IID); - if (Cycles) { - uint64_t ResourceMask = IS.getCriticalResourceMask(); - SmallVector, 4> Users; - while (ResourceMask) { - uint64_t Current = ResourceMask & (-ResourceMask); - Tracker.getResourceUsers(Current, Users); - for (const std::pair &U : Users) { - unsigned Cost = std::min(U.second, Cycles); - addResourceDep(U.first % Source.size(), To, Current, Cost); - } - Users.clear(); - ResourceMask ^= Current; - } + unsigned Cycles = 2 * Tracker.getResourcePressureCycles(IID); + uint64_t ResourceMask = IS.getCriticalResourceMask(); + SmallVector, 4> Users; + while (ResourceMask) { + uint64_t Current = ResourceMask & (-ResourceMask); + Tracker.getResourceUsers(Current, Users); + for (const std::pair &U : Users) + addResourceDep(U.first % Source.size(), To, Current, U.second + Cycles); + Users.clear(); + ResourceMask ^= Current; } - Cycles = Tracker.getRegisterPressureCycles(IID); - if (Cycles) { - const CriticalDependency &RegDep = IS.getCriticalRegDep(); + const CriticalDependency &RegDep = IS.getCriticalRegDep(); + if (RegDep.Cycles) { + Cycles = RegDep.Cycles + 2 * Tracker.getRegisterPressureCycles(IID); unsigned From = RegDep.IID % Source.size(); addRegisterDep(From, To, RegDep.RegID, Cycles); } - Cycles = Tracker.getMemoryPressureCycles(IID); - if (Cycles) { - const CriticalDependency &MemDep = IS.getCriticalMemDep(); + const CriticalDependency &MemDep = IS.getCriticalMemDep(); + if (MemDep.Cycles) { + Cycles = MemDep.Cycles + 2 * Tracker.getMemoryPressureCycles(IID); unsigned From = MemDep.IID % Source.size(); addMemoryDep(From, To, Cycles); } Tracker.handleInstructionIssuedEvent( static_cast(Event)); + + // Check if this is the last simulated instruction. + if (IID == ((Iterations * Source.size()) - 1)) + DG.finalizeGraph(); } void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) { @@ -356,7 +565,7 @@ void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const { if (!SeenStallCycles || !BPI.PressureIncreaseCycles) { - OS << "\nNo resource or data dependency bottlenecks discovered.\n"; + OS << "\n\nNo resource or data dependency bottlenecks discovered.\n"; return; } @@ -370,7 +579,7 @@ double MemDepPressurePerCycle = (double)BPI.MemoryDependencyCycles * 100 / TotalCycles; - OS << "\nCycles with backend pressure increase [ " + OS << "\n\nCycles with backend pressure increase [ " << format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]"; OS << "\nThroughput Bottlenecks: " @@ -399,7 +608,7 @@ << "% ]"; OS << "\n - Memory Dependencies [ " << format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100) - << "% ]\n\n"; + << "% ]\n"; } void BottleneckAnalysis::printView(raw_ostream &OS) const { @@ -408,6 +617,7 @@ printBottleneckHints(TempStream); TempStream.flush(); OS << Buffer; + printCriticalSequence(OS); } } // namespace mca.