Index: llvm/trunk/docs/CommandGuide/llvm-mca.rst
===================================================================
--- llvm/trunk/docs/CommandGuide/llvm-mca.rst
+++ llvm/trunk/docs/CommandGuide/llvm-mca.rst
@@ -0,0 +1,133 @@
+llvm-mca - LLVM Machine Code Analyzer
+=====================================
+
+SYNOPSIS
+--------
+
+:program:`llvm-mca` [*options*] [input]
+
+DESCRIPTION
+-----------
+
+:program:`llvm-mca` is a performance analysis tool that uses information
+available in LLVM (e.g. scheduling models) to statically measure the
+performance of machine code on a specific CPU.
+
+Performance is measured in terms of throughput as well as processor resource
+consumption. The tool currently works for processors with an out-of-order
+backend, for which there is a scheduling model available in LLVM.
+
+The main goal of this tool is not just to predict the performance of the code
+when run on the target, but also to help with diagnosing potential performance
+issues.
+
+Given an assembly code sequence, llvm-mca estimates the IPC (Instructions Per
+Cycle), as well as hardware resource pressure. The analysis and reporting style
+were inspired by the IACA tool from Intel.
+
+OPTIONS
+-------
+
+If ``input`` is "``-``" or omitted, :program:`llvm-mca` reads from standard
+input. Otherwise, it will read from the specified filename.
+
+If the :option:`-o` option is omitted, then :program:`llvm-mca` sends its
+output to standard output if the input is from standard input. If the
+:option:`-o` option specifies "``-``", then the output will also be sent to
+standard output.
+
+
+.. option:: -help
+
+  Print a summary of command line options.
+
+.. option:: -mtriple=
+
+  Specify a target triple string.
+
+.. option:: -march=
+
+  Specify the architecture for which to analyze the code. It defaults to the
+  host default target.
+
+.. option:: -mcpu=
+
+  Specify the processor for which to run the analysis. It defaults to a
+  "generic" processor; the processor is not autodetected from the host.
+
+.. option:: -output-asm-variant=
+
+  Specify the output assembly variant for the report generated by the tool.
+  On x86, possible values are [0, 1]. A value of 0 selects the AT&T assembly
+  format, while a value of 1 selects the Intel assembly format for the code
+  printed out by the tool in the analysis report.
+
+.. option:: -dispatch=
+
+  Specify a different dispatch width for the processor. The dispatch width
+  defaults to the 'IssueWidth' specified by the processor scheduling model.
+  If width is zero, then the default dispatch width is used.
+
+.. option:: -max-retire-per-cycle=
+
+  Specify the retire throughput (i.e. how many instructions can be retired by
+  the retire control unit every cycle).
+
+.. option:: -register-file-size=
+
+  Specify the size of the register file. When specified, this flag limits how
+  many temporary registers are available for register renaming purposes. A
+  value of zero for this flag means "unlimited number of temporary registers",
+  which is the default.
+
+.. option:: -iterations=
+
+  Specify the number of iterations to run. If this flag is set to 0, then the
+  tool sets the number of iterations to a default value (i.e. 70).
+
+.. option:: -noalias=
+
+  If set, the tool assumes that loads and stores don't alias. This is the
+  default behavior.
+
+.. option:: -lqueue=
+
+  Specify the size of the load queue in the load/store unit emulated by the
+  tool. By default, the tool assumes an unbounded number of entries in the
+  load queue. A value of zero for this flag is ignored, and the default load
+  queue size is used instead.
+
+.. option:: -squeue=
+
+  Specify the size of the store queue in the load/store unit emulated by the
+  tool. By default, the tool assumes an unbounded number of entries in the
+  store queue. A value of zero for this flag is ignored, and the default store
+  queue size is used instead.
+
+.. option:: -verbose
+
+  Enable verbose output. In particular, this flag enables a number of extra
+  statistics and performance counters for the dispatch logic, the reorder
+  buffer, the retire control unit and the register file.
+
+.. option:: -timeline
+
+  Enable the timeline view.
+
+.. option:: -timeline-max-iterations=
+
+  Limit the number of iterations to print in the timeline view. By default,
+  the timeline view prints information for up to 10 iterations.
+
+.. option:: -timeline-max-cycles=
+
+  Limit the number of cycles in the timeline view. By default, the number of
+  cycles is set to 80.
+
+
+EXIT STATUS
+-----------
+
+:program:`llvm-mca` returns 0 on success. Otherwise, an error message is
+printed to standard error, and the tool returns 1.
Index: llvm/trunk/test/tools/llvm-mca/ARM/lit.local.cfg
===================================================================
--- llvm/trunk/test/tools/llvm-mca/ARM/lit.local.cfg
+++ llvm/trunk/test/tools/llvm-mca/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'ARM' in config.root.targets:
+    config.unsupported = True
Index: llvm/trunk/test/tools/llvm-mca/ARM/simple-test-cortex-a9.s
===================================================================
--- llvm/trunk/test/tools/llvm-mca/ARM/simple-test-cortex-a9.s
+++ llvm/trunk/test/tools/llvm-mca/ARM/simple-test-cortex-a9.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mca -mtriple=arm-eabi -mcpu=cortex-a9 -iterations=100 < %s | FileCheck %s
+
+vadd.f32 s0, s2, s2
+
+# CHECK: Iterations:     100
+# CHECK-NEXT: Instructions:   100
+# CHECK-NEXT: Total Cycles:   105
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC:            0.95
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - A9UnitAGU
+# CHECK-NEXT: [1.0] - A9UnitALU
+# CHECK-NEXT: [1.1] - A9UnitALU
+# CHECK-NEXT: [2] - A9UnitB
+# CHECK-NEXT: [3] - A9UnitFP
+# CHECK-NEXT: [4] - A9UnitLS
+# CHECK-NEXT: [5] - A9UnitMul
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]
+# CHECK-NEXT: 1.00   -      -      -      1.00   -      -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]    Instructions:
+# CHECK-NEXT: 1.00   -      -      -      1.00   -      -      vadd.f32 s0, s2, s2
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects
+
+# CHECK: [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT: 1      4      1.00                         vadd.f32 s0, s2, s2
Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dot-product.s
===================================================================
--- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dot-product.s
+++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/dot-product.s
@@ -0,0 +1,78 @@
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+vmulps %xmm0, %xmm1, %xmm2
+vhaddps %xmm2, %xmm2, %xmm3
+vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: Iterations:     300
+# CHECK-NEXT: Instructions:   900
+# CHECK-NEXT: Total Cycles:   610
+# CHECK-NEXT: Dispatch Width: 2
+# CHECK-NEXT: IPC:            1.48
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+#
CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - - - 2.00 1.00 - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - 1.00 - - - - - - - vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - - vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: - - - - - 1.00 - - - - - - - - vhaddps %xmm3, %xmm3, %xmm4 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1 3 1.00 vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: 1 3 1.00 vhaddps %xmm3, %xmm3, %xmm4 + + +# CHECK: Timeline view: +# CHECK-NEXT: 012345 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . . vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,1] D==eeeER . . vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [0,2] .D====eeeER . vhaddps %xmm3, %xmm3, %xmm4 + +# CHECK: [1,0] .DeeE-----R . vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] . D=eeeE--R . vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [1,2] . D====eeeER . vhaddps %xmm3, %xmm3, %xmm4 + +# CHECK: [2,0] . DeeE----R . vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [2,1] . D====eeeER . vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: [2,2] . D======eeeER vhaddps %xmm3, %xmm3, %xmm4 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 1.0 1.0 3.0 vmulps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 1. 3 3.3 0.7 0.7 vhaddps %xmm2, %xmm2, %xmm3 +# CHECK-NEXT: 2. 
3 5.7 0.0 0.0 vhaddps %xmm3, %xmm3, %xmm4 Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/load-store-alias.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/load-store-alias.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/load-store-alias.s @@ -0,0 +1,97 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s + +vmovaps (%rsi), %xmm0 +vmovaps %xmm0, (%rdi) +vmovaps 16(%rsi), %xmm0 +vmovaps %xmm0, 16(%rdi) +vmovaps 32(%rsi), %xmm0 +vmovaps %xmm0, 32(%rdi) +vmovaps 48(%rsi), %xmm0 +vmovaps %xmm0, 48(%rdi) + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 800 +# CHECK-NEXT: Total Cycles: 2403 +# CHECK-NEXT: Dispatch Width: 2 +# CHECK-NEXT: IPC: 0.33 + + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - - - - - 4.00 - 4.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps (%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, (%rdi) +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 48(%rdi) + + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi) + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. vmovaps (%rsi), %xmm0 +# CHECK-NEXT: [0,1] D=====eER . . . .. vmovaps %xmm0, (%rdi) +# CHECK-NEXT: [0,2] .D=====eeeeeER . . .. vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: [0,3] .D==========eER. . .. vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: [0,4] . D==========eeeeeER. .. vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: [0,5] . D===============eER .. vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: [0,6] . D===============eeeeeER. vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: [0,7] . 
D====================eER vmovaps %xmm0, 48(%rdi) + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 3. 1 11.0 0.0 0.0 vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 4. 1 11.0 0.0 0.0 vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi) Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/memcpy-like-test.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/memcpy-like-test.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/memcpy-like-test.s @@ -0,0 +1,100 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s + +vmovaps (%rsi), %xmm0 +vmovaps %xmm0, (%rdi) +vmovaps 16(%rsi), %xmm0 +vmovaps %xmm0, 16(%rdi) +vmovaps 32(%rsi), %xmm0 +vmovaps %xmm0, 32(%rdi) +vmovaps 48(%rsi), %xmm0 +vmovaps %xmm0, 48(%rdi) + + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 800 +# CHECK-NEXT: Total Cycles: 408 +# CHECK-NEXT: Dispatch Width: 2 +# CHECK-NEXT: IPC: 1.96 + + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - - - - - 4.00 - 4.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps (%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, (%rdi) +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 48(%rdi) + + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi) + + +# CHECK: Timeline 
view: +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeeER .. vmovaps (%rsi), %xmm0 +# CHECK-NEXT: [0,1] D=====eER .. vmovaps %xmm0, (%rdi) +# CHECK-NEXT: [0,2] .DeeeeeER .. vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: [0,3] .D=====eER.. vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: [0,4] . DeeeeeER.. vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: [0,5] . D=====eER. vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: [0,6] . DeeeeeER. vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: [0,7] . D=====eER vmovaps %xmm0, 48(%rdi) + + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0 +# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi) +# CHECK-NEXT: 2. 1 1.0 1.0 0.0 vmovaps 16(%rsi), %xmm0 +# CHECK-NEXT: 3. 1 6.0 0.0 0.0 vmovaps %xmm0, 16(%rdi) +# CHECK-NEXT: 4. 1 1.0 1.0 0.0 vmovaps 32(%rsi), %xmm0 +# CHECK-NEXT: 5. 1 6.0 0.0 0.0 vmovaps %xmm0, 32(%rdi) +# CHECK-NEXT: 6. 1 1.0 1.0 0.0 vmovaps 48(%rsi), %xmm0 +# CHECK-NEXT: 7. 1 6.0 0.0 0.0 vmovaps %xmm0, 48(%rdi) Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/simple-test.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/simple-test.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/simple-test.s @@ -0,0 +1,45 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 < %s | FileCheck %s + +add %edi, %eax + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 100 +# CHECK-NEXT: Total Cycles: 103 +# CHECK-NEXT: Dispatch Width: 2 +# CHECK-NEXT: IPC: 0.97 + +# CHECK-LABEL: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addl %edi, %eax + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.50 addl %edi, %eax Index: llvm/trunk/test/tools/llvm-mca/X86/cpus.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/cpus.s +++ llvm/trunk/test/tools/llvm-mca/X86/cpus.s @@ -0,0 +1,27 @@ +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s +# RUN: llvm-mca %s 
-mtriple=x86_64-unknown-unknown -mcpu=ivybridge < %s | FileCheck --check-prefix=ALL --check-prefix=IVYBRIDGE %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=haswell < %s | FileCheck --check-prefix=ALL --check-prefix=HASWELL %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=broadwell < %s | FileCheck --check-prefix=ALL --check-prefix=BROADWELL %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=knl < %s | FileCheck --check-prefix=ALL --check-prefix=KNL %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake < %s | FileCheck --check-prefix=ALL --check-prefix=SKX %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 < %s | FileCheck --check-prefix=ALL --check-prefix=SKX-AVX512 %s +# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=slm < %s | FileCheck --check-prefix=ALL --check-prefix=SLM %s + +add %edi, %eax + +# ALL: Iterations: 70 +# ALL-NEXT: Instructions: 70 + +# BTVER2: Dispatch Width: 2 +# ZNVER1: Dispatch Width: 4 +# SANDYBRIDGE: Dispatch Width: 4 +# IVYBRIDGE: Dispatch Width: 4 +# HASWELL: Dispatch Width: 4 +# BROADWELL: Dispatch Width: 4 +# KNL: Dispatch Width: 4 +# SKX: Dispatch Width: 6 +# SKX-AVX512: Dispatch Width: 6 +# SLM: Dispatch Width: 2 + Index: llvm/trunk/test/tools/llvm-mca/X86/default-iterations.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/default-iterations.s +++ llvm/trunk/test/tools/llvm-mca/X86/default-iterations.s @@ -0,0 +1,11 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s + +add %eax, %eax + +# DEFAULT: Iterations: 70 +# DEFAULT-NEXT: Instructions: 70 + +# CUSTOM: Iterations: 1 +# CUSTOM-NEXT: Instructions: 1 Index: llvm/trunk/test/tools/llvm-mca/X86/dispatch_width.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/dispatch_width.s +++ llvm/trunk/test/tools/llvm-mca/X86/dispatch_width.s @@ -0,0 +1,8 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s + +add %eax, %eax + +# DEFAULT: Dispatch Width: 2 +# CUSTOM: Dispatch Width: 1 Index: llvm/trunk/test/tools/llvm-mca/X86/in-order-cpu.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/in-order-cpu.s +++ llvm/trunk/test/tools/llvm-mca/X86/in-order-cpu.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o %t1 2>&1 | FileCheck %s + +# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu. 
Index: llvm/trunk/test/tools/llvm-mca/X86/invalid-assembly-sequence.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/invalid-assembly-sequence.s +++ llvm/trunk/test/tools/llvm-mca/X86/invalid-assembly-sequence.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s + +invalid_instruction_mnemonic Index: llvm/trunk/test/tools/llvm-mca/X86/invalid-cpu.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/invalid-cpu.s +++ llvm/trunk/test/tools/llvm-mca/X86/invalid-cpu.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=foo -o %t1 2>&1 | FileCheck %s + +# CHECK: 'foo' is not a recognized processor for this target (ignoring processor) Index: llvm/trunk/test/tools/llvm-mca/X86/invalid-empty-file.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/invalid-empty-file.s +++ llvm/trunk/test/tools/llvm-mca/X86/invalid-empty-file.s @@ -0,0 +1,3 @@ +# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -o %t1 2>&1 | FileCheck %s + +# CHECK: error: no assembly instructions found. Index: llvm/trunk/test/tools/llvm-mca/X86/lit.local.cfg =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/lit.local.cfg +++ llvm/trunk/test/tools/llvm-mca/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + Index: llvm/trunk/test/tools/llvm-mca/X86/no-sched-model.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/no-sched-model.s +++ llvm/trunk/test/tools/llvm-mca/X86/no-sched-model.s @@ -0,0 +1,4 @@ +# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown < %s 2>&1 | FileCheck %s +# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=generic < %s 2>&1 | FileCheck %s + +# CHECK: error: unable to find instruction-level scheduling information for target triple 'x86_64-unknown-unknown' and cpu 'generic'. 
Index: llvm/trunk/test/tools/llvm-mca/invalid_input_file_name.test
===================================================================
--- llvm/trunk/test/tools/llvm-mca/invalid_input_file_name.test
+++ llvm/trunk/test/tools/llvm-mca/invalid_input_file_name.test
@@ -0,0 +1,3 @@
+# RUN: not llvm-mca %t.blah -o %t2 2>&1 | FileCheck --check-prefix=ENOENT %s
+
+# ENOENT: {{.*}}.blah: {{[Nn]}}o such file or directory
Index: llvm/trunk/test/tools/llvm-mca/lit.local.cfg
===================================================================
--- llvm/trunk/test/tools/llvm-mca/lit.local.cfg
+++ llvm/trunk/test/tools/llvm-mca/lit.local.cfg
@@ -0,0 +1,4 @@
+# Requires a non-empty default triple for these tests
+if 'default_triple' not in config.available_features:
+    config.unsupported = True
+
Index: llvm/trunk/tools/LLVMBuild.txt
===================================================================
--- llvm/trunk/tools/LLVMBuild.txt
+++ llvm/trunk/tools/LLVMBuild.txt
@@ -37,6 +37,7 @@
 llvm-link
 llvm-lto
 llvm-mc
+llvm-mca
 llvm-mcmarkup
 llvm-modextract
 llvm-mt
Index: llvm/trunk/tools/llvm-mca/Backend.h
===================================================================
--- llvm/trunk/tools/llvm-mca/Backend.h
+++ llvm/trunk/tools/llvm-mca/Backend.h
@@ -0,0 +1,141 @@
+//===--------------------- Backend.h ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements an OoO backend for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKEND_H
+#define LLVM_TOOLS_LLVM_MCA_BACKEND_H
+
+#include "Dispatch.h"
+#include "InstrBuilder.h"
+#include "Scheduler.h"
+#include "SourceMgr.h"
+
+namespace mca {
+
+struct HWEventListener;
+
+/// \brief An out-of-order backend for a specific subtarget.
+///
+/// It emulates an out-of-order execution of instructions. Instructions are
+/// fetched from a MCInst sequence managed by an object of class SourceMgr.
+/// Instructions are first dispatched to the schedulers and then executed.
+/// This class tracks the lifetime of an instruction from the moment where
+/// it gets dispatched to the schedulers, to the moment where it finishes
+/// executing and register writes are architecturally committed.
+/// In particular, it monitors changes in the state of every instruction
+/// in flight.
+/// Instructions are executed in a loop of iterations. The number of iterations
+/// is defined by the SourceMgr object.
+/// The Backend entry point is the method 'run()', which executes cycles in a
+/// loop until there are no new instructions to dispatch and every dispatched
+/// instruction has been retired.
+/// Internally, the Backend collects statistical information in the form of
+/// histograms. For example, it tracks how the dispatch group size changes
+/// over time.
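+///
+/// A minimal usage sketch (an editorial illustration, not part of the
+/// original patch; llvm-mca.cpp is the authoritative driver). It assumes
+/// that STI, MCII and MRI describe the target, that 'Source' is a SourceMgr
+/// already populated with the MCInst sequence to simulate, and that
+/// 'Listener' is some HWEventListener such as a view:
+///
+/// \code
+///   mca::Backend B(STI, MCII, MRI, std::move(Source));
+///   B.addEventListener(&Listener); // Optional; e.g. a statistics collector.
+///   B.run();                       // Simulate until everything retires.
+///   unsigned TotalCycles = B.getNumCycles();
+/// \endcode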
+class Backend {
+  const llvm::MCSubtargetInfo &STI;
+
+  std::unique_ptr<InstrBuilder> IB;
+  std::unique_ptr<Scheduler> HWS;
+  std::unique_ptr<DispatchUnit> DU;
+  std::unique_ptr<SourceMgr> SM;
+  unsigned Cycles;
+
+  llvm::DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;
+  std::set<HWEventListener *> Listeners;
+
+  void runCycle(unsigned Cycle);
+
+public:
+  Backend(const llvm::MCSubtargetInfo &Subtarget, const llvm::MCInstrInfo &MCII,
+          const llvm::MCRegisterInfo &MRI, std::unique_ptr<SourceMgr> Source,
+          unsigned DispatchWidth = 0, unsigned RegisterFileSize = 0,
+          unsigned MaxRetirePerCycle = 0, unsigned LoadQueueSize = 0,
+          unsigned StoreQueueSize = 0, bool AssumeNoAlias = false)
+      : STI(Subtarget),
+        HWS(llvm::make_unique<Scheduler>(this, Subtarget.getSchedModel(),
+                                         LoadQueueSize, StoreQueueSize,
+                                         AssumeNoAlias)),
+        DU(llvm::make_unique<DispatchUnit>(
+            this, MRI, Subtarget.getSchedModel().MicroOpBufferSize,
+            RegisterFileSize, MaxRetirePerCycle, DispatchWidth, HWS.get())),
+        SM(std::move(Source)), Cycles(0) {
+    IB = llvm::make_unique<InstrBuilder>(MCII, getProcResourceMasks());
+  }
+
+  void run() {
+    while (SM->hasNext() || !DU->isRCUEmpty())
+      runCycle(Cycles++);
+  }
+
+  unsigned getNumIterations() const { return SM->getNumIterations(); }
+  unsigned getNumInstructions() const { return SM->size(); }
+  unsigned getNumCycles() const { return Cycles; }
+  unsigned getTotalRegisterMappingsCreated() const {
+    return DU->getTotalRegisterMappingsCreated();
+  }
+  unsigned getMaxUsedRegisterMappings() const {
+    return DU->getMaxUsedRegisterMappings();
+  }
+  unsigned getDispatchWidth() const { return DU->getDispatchWidth(); }
+
+  const llvm::MCSubtargetInfo &getSTI() const { return STI; }
+  const llvm::MCSchedModel &getSchedModel() const {
+    return STI.getSchedModel();
+  }
+  const llvm::ArrayRef<uint64_t> getProcResourceMasks() const {
+    return HWS->getProcResourceMasks();
+  }
+
+  double getRThroughput(const InstrDesc &ID) const {
+    return HWS->getRThroughput(ID);
+  }
+  void getBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+    return HWS->getBuffersUsage(Usage);
+  }
+
+  unsigned getNumRATStalls() const { return DU->getNumRATStalls(); }
+  unsigned getNumRCUStalls() const { return DU->getNumRCUStalls(); }
+  unsigned getNumSQStalls() const { return DU->getNumSQStalls(); }
+  unsigned getNumLDQStalls() const { return DU->getNumLDQStalls(); }
+  unsigned getNumSTQStalls() const { return DU->getNumSTQStalls(); }
+  unsigned getNumDispatchGroupStalls() const {
+    return DU->getNumDispatchGroupStalls();
+  }
+
+  const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
+    return SM->getMCInstFromIndex(Index);
+  }
+
+  const InstrDesc &getInstrDesc(const llvm::MCInst &Inst) const {
+    return IB->getOrCreateInstrDesc(STI, Inst);
+  }
+
+  const SourceMgr &getSourceMgr() const { return *SM; }
+
+  void addEventListener(HWEventListener *Listener);
+  void notifyCycleBegin(unsigned Cycle);
+  void notifyInstructionDispatched(unsigned Index);
+  void notifyInstructionReady(unsigned Index);
+  void notifyInstructionIssued(
+      unsigned Index,
+      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used);
+  void notifyInstructionExecuted(unsigned Index);
+  void notifyResourceAvailable(const ResourceRef &RR);
+  void notifyInstructionRetired(unsigned Index);
+  void notifyCycleEnd(unsigned Cycle);
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/Backend.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/Backend.cpp
+++ llvm/trunk/tools/llvm-mca/Backend.cpp
@@ -0,0 +1,132 @@
+//===--------------------- Backend.cpp --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Implementation of class Backend which emulates a hardware OoO backend.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Backend.h"
+#include "HWEventListener.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+
+namespace mca {
+
+#define DEBUG_TYPE "llvm-mca"
+
+using namespace llvm;
+
+void Backend::addEventListener(HWEventListener *Listener) {
+  if (Listener)
+    Listeners.insert(Listener);
+}
+
+void Backend::runCycle(unsigned Cycle) {
+  notifyCycleBegin(Cycle);
+
+  if (!SM->hasNext()) {
+    notifyCycleEnd(Cycle);
+    return;
+  }
+
+  InstRef IR = SM->peekNext();
+  const InstrDesc *Desc = &IB->getOrCreateInstrDesc(STI, *IR.second);
+  while (DU->isAvailable(Desc->NumMicroOps) && DU->canDispatch(*Desc)) {
+    Instruction *NewIS = IB->createInstruction(STI, *DU, IR.first, *IR.second);
+    Instructions[IR.first] = std::unique_ptr<Instruction>(NewIS);
+    NewIS->setRCUTokenID(DU->dispatch(IR.first, NewIS));
+
+    // If this is a zero latency instruction, then we don't need to wait for
+    // it to execute. Mark it as executed straight away.
+    if (NewIS->isZeroLatency())
+      notifyInstructionExecuted(IR.first);
+
+    // Check if we have dispatched all the instructions.
+    SM->updateNext();
+    if (!SM->hasNext())
+      break;
+
+    // Prepare for the next round.
+    IR = SM->peekNext();
+    Desc = &IB->getOrCreateInstrDesc(STI, *IR.second);
+  }
+
+  notifyCycleEnd(Cycle);
+}
+
+void Backend::notifyCycleBegin(unsigned Cycle) {
+  DEBUG(dbgs() << "[E] Cycle begin: " << Cycle << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->onCycleBegin(Cycle);
+
+  DU->cycleEvent(Cycle);
+  HWS->cycleEvent(Cycle);
+}
+
+void Backend::notifyInstructionDispatched(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Dispatched: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->onInstructionDispatched(Index);
+}
+
+void Backend::notifyInstructionReady(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Ready: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->onInstructionReady(Index);
+}
+
+void Backend::notifyInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  DEBUG(
+    dbgs() << "[E] Instruction Issued: " << Index << '\n';
+    for (const std::pair<ResourceRef, unsigned> &Resource : Used) {
+      dbgs() << "[E] Resource Used: [" << Resource.first.first << '.'
+             << Resource.first.second << "]\n";
+      dbgs() << "           cycles: " << Resource.second << '\n';
+    }
+  );
+
+  for (HWEventListener *Listener : Listeners)
+    Listener->onInstructionIssued(Index, Used);
+}
+
+void Backend::notifyInstructionExecuted(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Executed: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->onInstructionExecuted(Index);
+
+  const Instruction &IS = *Instructions[Index];
+  DU->onInstructionExecuted(IS.getRCUTokenID());
+}
+
+void Backend::notifyInstructionRetired(unsigned Index) {
+  DEBUG(dbgs() << "[E] Instruction Retired: " << Index << '\n');
+  for (HWEventListener *Listener : Listeners)
+    Listener->onInstructionRetired(Index);
+
+  const Instruction &IS = *Instructions[Index];
+  DU->invalidateRegisterMappings(IS);
+  Instructions.erase(Index);
+}
+
+void Backend::notifyResourceAvailable(const ResourceRef &RR) {
+  DEBUG(dbgs() << "[E] Resource Available: [" << RR.first << '.' << RR.second
+               << "]\n");
+  for (HWEventListener *Listener : Listeners)
+    Listener->onResourceAvailable(RR);
+}
+
+void Backend::notifyCycleEnd(unsigned Cycle) {
+  DEBUG(dbgs() << "[E] Cycle end: " << Cycle << "\n\n");
+  for (HWEventListener *Listener : Listeners)
+    Listener->onCycleEnd(Cycle);
+}
+
+} // namespace mca.
Index: llvm/trunk/tools/llvm-mca/BackendPrinter.h
===================================================================
--- llvm/trunk/tools/llvm-mca/BackendPrinter.h
+++ llvm/trunk/tools/llvm-mca/BackendPrinter.h
@@ -0,0 +1,102 @@
+//===--------------------- BackendPrinter.h ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements class BackendPrinter.
+/// BackendPrinter is able to collect statistics related to the code executed
+/// by the Backend class. Information is then printed out with the help of
+/// an MCInstPrinter (to pretty print MCInst objects) and other helper classes.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H
+#define LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H
+
+#include "Backend.h"
+#include "BackendStatistics.h"
+#include "ResourcePressureView.h"
+#include "TimelineView.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+class ResourcePressureView;
+class TimelineView;
+
+/// \brief A printer class that knows how to collect statistics on the
+/// code analyzed by the llvm-mca tool.
+///
+/// This class knows how to print out the analysis information collected
+/// during the execution of the code. Internally, it delegates to other
+/// classes the task of printing out timeline information as well as
+/// resource pressure.
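+///
+/// An editorial usage sketch (illustrative only; llvm-mca.cpp is the real
+/// driver). It assumes 'B' is a fully constructed mca::Backend that has
+/// already run, and 'IP' is a target MCInstPrinter:
+///
+/// \code
+///   mca::BackendPrinter Printer(B, OutputFilename, std::move(IP), Verbose);
+///   Printer.addResourcePressureView();
+///   Printer.addTimelineView(/*MaxIterations=*/3, /*MaxCycles=*/80);
+///   Printer.printReport();
+/// \endcode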
+class BackendPrinter {
+  Backend &B;
+  bool EnableVerboseOutput;
+
+  std::unique_ptr<llvm::MCInstPrinter> MCIP;
+  std::unique_ptr<llvm::ToolOutputFile> File;
+
+  std::unique_ptr<ResourcePressureView> RPV;
+  std::unique_ptr<TimelineView> TV;
+  std::unique_ptr<BackendStatistics> BS;
+
+  using Histogram = std::map<unsigned, unsigned>;
+  void printDUStatistics(const Histogram &Stats, unsigned Cycles) const;
+  void printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
+                           unsigned SQStalls, unsigned LDQStalls,
+                           unsigned STQStalls, unsigned DGStalls) const;
+  void printRATStatistics(unsigned Mappings, unsigned MaxUsedMappings) const;
+  void printRCUStatistics(const Histogram &Histogram, unsigned Cycles) const;
+  void printIssuePerCycle(const Histogram &IssuePerCycle,
+                          unsigned TotalCycles) const;
+  void printSchedulerUsage(const llvm::MCSchedModel &SM,
+                           const llvm::ArrayRef<BufferUsageEntry> &Usage) const;
+  void printGeneralStatistics(unsigned Iterations, unsigned Cycles,
+                              unsigned Instructions,
+                              unsigned DispatchWidth) const;
+  void printInstructionInfo() const;
+
+  std::unique_ptr<llvm::ToolOutputFile> getOutputStream(std::string OutputFile);
+  void initialize(std::string OutputFileName);
+
+public:
+  BackendPrinter(Backend &backend, std::string OutputFileName,
+                 std::unique_ptr<llvm::MCInstPrinter> IP, bool EnableVerbose)
+      : B(backend), EnableVerboseOutput(EnableVerbose), MCIP(std::move(IP)) {
+    initialize(OutputFileName);
+  }
+
+  ~BackendPrinter() {
+    if (File)
+      File->keep();
+  }
+
+  bool isFileValid() const { return File.get(); }
+  llvm::raw_ostream &getOStream() const {
+    assert(isFileValid());
+    return File->os();
+  }
+
+  llvm::MCInstPrinter &getMCInstPrinter() const { return *MCIP; }
+
+  void addResourcePressureView();
+  void addTimelineView(unsigned MaxIterations = 3, unsigned MaxCycles = 80);
+
+  void printReport() const;
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/BackendPrinter.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/BackendPrinter.cpp
+++ llvm/trunk/tools/llvm-mca/BackendPrinter.cpp
@@ -0,0 +1,209 @@
+//===--------------------- BackendPrinter.cpp -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the BackendPrinter interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "BackendPrinter.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+namespace mca {
+
+using namespace llvm;
+
+std::unique_ptr<ToolOutputFile>
+BackendPrinter::getOutputStream(std::string OutputFile) {
+  if (OutputFile == "")
+    OutputFile = "-";
+  std::error_code EC;
+  auto Out = llvm::make_unique<ToolOutputFile>(OutputFile, EC, sys::fs::F_None);
+  if (!EC)
+    return Out;
+  errs() << EC.message() << '\n';
+  return nullptr;
+}
+
+void BackendPrinter::printGeneralStatistics(unsigned Iterations,
+                                            unsigned Cycles,
+                                            unsigned Instructions,
+                                            unsigned DispatchWidth) const {
+  unsigned TotalInstructions = Instructions * Iterations;
+  double IPC = (double)TotalInstructions / Cycles;
+
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "Iterations:     " << Iterations;
+  TempStream << "\nInstructions:   " << TotalInstructions;
+  TempStream << "\nTotal Cycles:   " << Cycles;
+  TempStream << "\nDispatch Width: " << DispatchWidth;
+  TempStream << "\nIPC:            " << format("%.2f", IPC) << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printRATStatistics(unsigned TotalMappings,
+                                        unsigned MaxUsedMappings) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nRegister Alias Table:";
+  TempStream << "\nTotal number of mappings created: " << TotalMappings;
+  TempStream << "\nMax number of mappings used:      " << MaxUsedMappings
+             << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
+                                         unsigned SCHEDQStalls,
+                                         unsigned LDQStalls, unsigned STQStalls,
+                                         unsigned DGStalls) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nDynamic Dispatch Stall Cycles:\n";
+  TempStream << "RAT     - Register unavailable:                      "
+             << RATStalls;
+  TempStream << "\nRCU     - Retire tokens unavailable:                 "
+             << RCUStalls;
+  TempStream << "\nSCHEDQ  - Scheduler full:                            "
+             << SCHEDQStalls;
+  TempStream << "\nLQ      - Load queue full:                           "
+             << LDQStalls;
+  TempStream << "\nSQ      - Store queue full:                          "
+             << STQStalls;
+  TempStream << "\nGROUP   - Static restrictions on the dispatch group: "
+             << DGStalls;
+  TempStream << '\n';
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printSchedulerUsage(
+    const MCSchedModel &SM, const ArrayRef<BufferUsageEntry> &Usage) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nScheduler's queue usage:\n";
+  const ArrayRef<uint64_t> ResourceMasks = B.getProcResourceMasks();
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+    if (!ProcResource.BufferSize)
+      continue;
+
+    for (const BufferUsageEntry &Entry : Usage)
+      if (ResourceMasks[I] == Entry.first)
+        TempStream << ProcResource.Name << ", " << Entry.second << '/'
+                   << ProcResource.BufferSize << '\n';
+  }
+
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printInstructionInfo() const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  TempStream << "\n\nInstruction Info:\n";
+  TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n"
+             << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects\n\n";
+
+  TempStream << "[1]    [2]    [3]    [4]    [5]    [6]\tInstructions:\n";
+  for (unsigned I = 0, E = B.getNumInstructions(); I < E; ++I) {
+    const MCInst &Inst = B.getMCInstFromIndex(I);
+    const InstrDesc &ID = B.getInstrDesc(Inst);
+    unsigned NumMicroOpcodes = ID.NumMicroOps;
+    unsigned Latency = ID.MaxLatency;
+    double RThroughput = B.getRThroughput(ID);
+    TempStream << ' ' << NumMicroOpcodes << "    ";
+    if (NumMicroOpcodes < 10)
+      TempStream << "  ";
+    else if (NumMicroOpcodes < 100)
+      TempStream << ' ';
+    TempStream << Latency << "   ";
+    if (Latency < 10.0)
+      TempStream << "  ";
+    else if (Latency < 100.0)
+      TempStream << ' ';
+    if (RThroughput) {
+      TempStream << format("%.2f", RThroughput) << ' ';
+      if (RThroughput < 10.0)
+        TempStream << "  ";
+      else if (RThroughput < 100.0)
+        TempStream << ' ';
+    } else {
+      TempStream << " -     ";
+    }
+    TempStream << (ID.MayLoad ? " *     " : "       ");
+    TempStream << (ID.MayStore ? " *     " : "       ");
+    TempStream << (ID.HasSideEffects ? " * " : "   ");
+    MCIP->printInst(&Inst, TempStream, "", B.getSTI());
+    TempStream << '\n';
+  }
+
+  TempStream.flush();
+  File->os() << Buffer;
+}
+
+void BackendPrinter::printReport() const {
+  assert(isFileValid());
+  unsigned Cycles = B.getNumCycles();
+  printGeneralStatistics(B.getNumIterations(), Cycles, B.getNumInstructions(),
+                         B.getDispatchWidth());
+  if (EnableVerboseOutput) {
+    printDispatchStalls(B.getNumRATStalls(), B.getNumRCUStalls(),
+                        B.getNumSQStalls(), B.getNumLDQStalls(),
+                        B.getNumSTQStalls(), B.getNumDispatchGroupStalls());
+    printRATStatistics(B.getTotalRegisterMappingsCreated(),
+                       B.getMaxUsedRegisterMappings());
+    BS->printHistograms(File->os());
+
+    std::vector<BufferUsageEntry> Usage;
+    B.getBuffersUsage(Usage);
+    printSchedulerUsage(B.getSchedModel(), Usage);
+  }
+
+  if (RPV) {
+    RPV->printResourcePressure(getOStream(), Cycles);
+    printInstructionInfo();
+  }
+
+  if (TV) {
+    TV->printTimeline(getOStream());
+    TV->printAverageWaitTimes(getOStream());
+  }
+}
+
+void BackendPrinter::addResourcePressureView() {
+  if (!RPV) {
+    RPV = llvm::make_unique<ResourcePressureView>(
+        B.getSTI(), *MCIP, B.getSourceMgr(), B.getProcResourceMasks());
+    B.addEventListener(RPV.get());
+  }
+}
+
+void BackendPrinter::addTimelineView(unsigned MaxIterations,
+                                     unsigned MaxCycles) {
+  if (!TV) {
+    TV = llvm::make_unique<TimelineView>(B.getSTI(), *MCIP, B.getSourceMgr(),
+                                         MaxIterations, MaxCycles);
+    B.addEventListener(TV.get());
+  }
+}
+
+void BackendPrinter::initialize(std::string OutputFileName) {
+  File = getOutputStream(OutputFileName);
+  MCIP->setPrintImmHex(false);
+  if (EnableVerboseOutput) {
+    BS = llvm::make_unique<BackendStatistics>();
+    B.addEventListener(BS.get());
+  }
+}
+
+} // namespace mca.
Index: llvm/trunk/tools/llvm-mca/BackendStatistics.h
===================================================================
--- llvm/trunk/tools/llvm-mca/BackendStatistics.h
+++ llvm/trunk/tools/llvm-mca/BackendStatistics.h
@@ -0,0 +1,95 @@
+//===--------------------- BackendStatistics.h ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements a printer class for printing generic Backend
+/// statistics related to the dispatch logic, scheduler and retire unit.
+///
+/// Example:
+/// ========
+///
+/// Dispatch Logic - number of cycles where we saw N instructions dispatched:
+/// [# dispatched], [# cycles]
+///  0,              15  (11.5%)
+///  5,              4  (3.1%)
+///
+/// Schedulers - number of cycles where we saw N instructions issued:
+/// [# issued], [# cycles]
+///  0,          7  (5.4%)
+///  1,          4  (3.1%)
+///  2,          8  (6.2%)
+///
+/// Retire Control Unit - number of cycles where we saw N instructions retired:
+/// [# retired], [# cycles]
+///  0,           9  (6.9%)
+///  1,           6  (4.6%)
+///  2,           1  (0.8%)
+///  4,           3  (2.3%)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H
+#define LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H
+
+#include "HWEventListener.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace mca {
+
+class BackendStatistics : public HWEventListener {
+  using Histogram = std::map<unsigned, unsigned>;
+  Histogram DispatchGroupSizePerCycle;
+  Histogram RetiredPerCycle;
+  Histogram IssuedPerCycle;
+
+  unsigned NumDispatched;
+  unsigned NumIssued;
+  unsigned NumRetired;
+  unsigned NumCycles;
+
+  void updateHistograms() {
+    DispatchGroupSizePerCycle[NumDispatched]++;
+    IssuedPerCycle[NumIssued]++;
+    RetiredPerCycle[NumRetired]++;
+    NumDispatched = 0;
+    NumIssued = 0;
+    NumRetired = 0;
+  }
+
+  void printRetireUnitStatistics(llvm::raw_ostream &OS) const;
+  void printDispatchUnitStatistics(llvm::raw_ostream &OS) const;
+  void printSchedulerStatistics(llvm::raw_ostream &OS) const;
+
+public:
+  BackendStatistics()
+      : NumDispatched(0), NumIssued(0), NumRetired(0), NumCycles(0) {}
+
+  void onInstructionDispatched(unsigned Index) override { NumDispatched++; }
+  void
+  onInstructionIssued(unsigned Index,
+                      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>>
+                          & /* unused */) override {
+    NumIssued++;
+  }
+  void onInstructionRetired(unsigned Index) override { NumRetired++; }
+
+  void onCycleBegin(unsigned Cycle) override { NumCycles++; }
+
+  void onCycleEnd(unsigned Cycle) override { updateHistograms(); }
+
+  void printHistograms(llvm::raw_ostream &OS) {
+    printDispatchUnitStatistics(OS);
+    printSchedulerStatistics(OS);
+    printRetireUnitStatistics(OS);
+  }
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/BackendStatistics.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/BackendStatistics.cpp
+++ llvm/trunk/tools/llvm-mca/BackendStatistics.cpp
@@ -0,0 +1,79 @@
+//===--------------------- BackendStatistics.cpp ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Functionalities used by the BackendPrinter to print out histograms
+/// related to number of {dispatch/issue/retire} per number of cycles.
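+///
+/// An editorial sketch (not part of the original patch): the statistics are
+/// collected purely through the HWEventListener callbacks, so hooking them
+/// into a Backend 'B' and printing them amounts to:
+///
+/// \code
+///   auto BS = llvm::make_unique<mca::BackendStatistics>();
+///   B.addEventListener(BS.get()); // Counts dispatch/issue/retire per cycle.
+///   B.run();
+///   BS->printHistograms(llvm::outs());
+/// \endcode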
+///
+//===----------------------------------------------------------------------===//
+
+#include "BackendStatistics.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void BackendStatistics::printRetireUnitStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nRetire Control Unit - "
+             << "number of cycles where we saw N instructions retired:\n";
+  TempStream << "[# retired], [# cycles]\n";
+
+  for (const std::pair<unsigned, unsigned> &Entry : RetiredPerCycle) {
+    TempStream << " " << Entry.first;
+    if (Entry.first < 10)
+      TempStream << ",           ";
+    else
+      TempStream << ",          ";
+    TempStream << Entry.second << "  ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+void BackendStatistics::printDispatchUnitStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nDispatch Logic - "
+             << "number of cycles where we saw N instructions dispatched:\n";
+  TempStream << "[# dispatched], [# cycles]\n";
+  for (const std::pair<unsigned, unsigned> &Entry : DispatchGroupSizePerCycle) {
+    TempStream << " " << Entry.first << ",              " << Entry.second
+               << "  ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+void BackendStatistics::printSchedulerStatistics(llvm::raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+  TempStream << "\n\nSchedulers - number of cycles where we saw N instructions "
+                "issued:\n";
+  TempStream << "[# issued], [# cycles]\n";
+  for (const std::pair<unsigned, unsigned> &Entry : IssuedPerCycle) {
+    TempStream << " " << Entry.first << ",          " << Entry.second << "  ("
+               << format("%.1f", ((double)Entry.second / NumCycles) * 100)
+               << "%)\n";
+  }
+
+  TempStream.flush();
+  OS << Buffer;
+}
+
+} // namespace mca
+
Index: llvm/trunk/tools/llvm-mca/CMakeLists.txt
===================================================================
--- llvm/trunk/tools/llvm-mca/CMakeLists.txt
+++ llvm/trunk/tools/llvm-mca/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(LLVM_LINK_COMPONENTS
+  AllTargetsAsmPrinters
+  AllTargetsAsmParsers
+  AllTargetsDescs
+  AllTargetsDisassemblers
+  AllTargetsInfos
+  MC
+  MCParser
+  Support
+  )
+
+add_llvm_tool(llvm-mca
+  Backend.cpp
+  BackendPrinter.cpp
+  BackendStatistics.cpp
+  Dispatch.cpp
+  HWEventListener.cpp
+  InstrBuilder.cpp
+  Instruction.cpp
+  LSUnit.cpp
+  llvm-mca.cpp
+  ResourcePressureView.cpp
+  Scheduler.cpp
+  TimelineView.cpp
+  )
Index: llvm/trunk/tools/llvm-mca/Dispatch.h
===================================================================
--- llvm/trunk/tools/llvm-mca/Dispatch.h
+++ llvm/trunk/tools/llvm-mca/Dispatch.h
@@ -0,0 +1,319 @@
+//===----------------------- Dispatch.h -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements classes that are used to model register files,
+/// reorder buffers and the hardware dispatch logic.
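+///
+/// For orientation only, an editorial sketch mirroring how Backend::runCycle
+/// (in Backend.cpp) drives a DispatchUnit 'DU' for an instruction 'NewInst'
+/// with descriptor 'Desc' and source index 'IID':
+///
+/// \code
+///   if (DU->isAvailable(Desc.NumMicroOps) && DU->canDispatch(Desc)) {
+///     // Reserves RCU/RAT/scheduler resources and returns an RCU token.
+///     unsigned RCUTokenID = DU->dispatch(IID, NewInst);
+///     NewInst->setRCUTokenID(RCUTokenID);
+///   }
+/// \endcode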
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_H +#define LLVM_TOOLS_LLVM_MCA_DISPATCH_H + +#include "Instruction.h" +#include "llvm/MC/MCRegisterInfo.h" +#include + +namespace mca { + +class WriteState; +class DispatchUnit; +class Scheduler; +class Backend; + +/// \brief Keeps track of register definitions. +/// +/// This class tracks register definitions, and performs register renaming +/// to break anti dependencies. +/// By default, there is no limit in the number of register aliases which +/// can be created for the purpose of register renaming. However, users can +/// specify at object construction time a limit in the number of temporary +/// registers which can be used by the register renaming logic. +class RegisterFile { + const llvm::MCRegisterInfo &MRI; + // Currently used mappings and maximum used mappings. + // These are to generate statistics only. + unsigned NumUsedMappings; + unsigned MaxUsedMappings; + // Total number of mappings created over time. + unsigned TotalMappingsCreated; + + // The maximum number of register aliases which can be used by the + // register renamer. Defaut value for this field is zero. + // A value of zero for this field means that there is no limit in the + // amount of register mappings which can be created. That is equivalent + // to having a theoretically infinite number of temporary registers. + unsigned TotalMappings; + + // This map contains an entry for every physical register. + // A register index is used as a key value to access a WriteState. + // This is how we track RAW dependencies for dispatched + // instructions. For every register, we track the last seen write only. + // This assumes that all writes fully update both super and sub registers. + // We need a flag in MCInstrDesc to check if a write also updates super + // registers. We can then have a extra tablegen flag to set for instructions. + // This is a separate patch on its own. + std::vector RegisterMappings; + // Assumptions are: + // a) a false dependencies is always removed by the register renamer. + // b) the register renamer can create an "infinite" number of mappings. + // Since we track the number of mappings created, in future we may + // introduce constraints on the number of mappings that can be created. + // For example, the maximum number of registers that are available for + // register renaming purposes may default to the size of the register file. + + // In future, we can extend this design to allow multiple register files, and + // apply different restrictions on the register mappings and the number of + // temporary registers used by mappings. + +public: + RegisterFile(const llvm::MCRegisterInfo &mri, unsigned Mappings = 0) + : MRI(mri), NumUsedMappings(0), MaxUsedMappings(0), + TotalMappingsCreated(0), TotalMappings(Mappings), + RegisterMappings(MRI.getNumRegs(), nullptr) {} + + // Creates a new register mapping for RegID. + // This reserves a temporary register in the register file. + void addRegisterMapping(WriteState &WS); + + // Invalidates register mappings associated to the input WriteState object. + // This releases temporary registers in the register file. 
+ void invalidateRegisterMapping(const WriteState &WS); + + bool isAvailable(unsigned NumRegWrites); + void collectWrites(llvm::SmallVectorImpl &Writes, + unsigned RegID) const; + void updateOnRead(ReadState &RS, unsigned RegID); + unsigned getMaxUsedRegisterMappings() const { return MaxUsedMappings; } + unsigned getTotalRegisterMappingsCreated() const { + return TotalMappingsCreated; + } + +#ifndef NDEBUG + void dump() const; +#endif +}; + +/// \brief tracks which instructions are in-flight (i.e. dispatched but not +/// retired) in the OoO backend. +/// +/// This class checks on every cycle if/which instructions can be retired. +/// Instructions are retired in program order. +/// In the event of instruction retired, the DispatchUnit object that owns +/// this RetireControlUnit gets notified. +/// On instruction retired, register updates are all architecturally +/// committed, and any temporary registers originally allocated for the +/// retired instruction are freed. +struct RetireControlUnit { + // A "token" (object of class RUToken) is created by the retire unit for every + // instruction dispatched to the schedulers. Flag 'Executed' is used to + // quickly check if an instruction has reached the write-back stage. A token + // also carries information related to the number of entries consumed by the + // instruction in the reorder buffer. The idea is that those entries will + // become available again once the instruction is retired. On every cycle, + // the RCU (Retire Control Unit) scans every token starting to search for + // instructions that are ready to retire. retired. Instructions are retired + // in program order. Only 'Executed' instructions are eligible for retire. + // Note that the size of the reorder buffer is defined by the scheduling model + // via field 'NumMicroOpBufferSize'. + struct RUToken { + unsigned Index; // Instruction index. + unsigned NumSlots; // Slots reserved to this instruction. + bool Executed; // True if the instruction is past the WB stage. + }; + +private: + unsigned NextAvailableSlotIdx; + unsigned CurrentInstructionSlotIdx; + unsigned AvailableSlots; + unsigned MaxRetirePerCycle; // 0 means no limit. + std::vector Queue; + DispatchUnit *Owner; + +public: + RetireControlUnit(unsigned NumSlots, unsigned RPC, DispatchUnit *DU) + : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0), + AvailableSlots(NumSlots), MaxRetirePerCycle(RPC), Owner(DU) { + assert(NumSlots && "Expected at least one slot!"); + Queue.resize(NumSlots); + } + + bool isFull() const { return !AvailableSlots; } + bool isEmpty() const { return AvailableSlots == Queue.size(); } + bool isAvailable(unsigned Quantity = 1) const { + // Some instructions may declare a number of uOps which exceedes the size + // of the reorder buffer. To avoid problems, cap the amount of slots to + // the size of the reorder buffer. + Quantity = std::min(Quantity, static_cast(Queue.size())); + return AvailableSlots >= Quantity; + } + + // Reserves a number of slots, and returns a new token. + unsigned reserveSlot(unsigned Index, unsigned NumMicroOps); + + /// Retires instructions in program order. + void cycleEvent(); + + void onInstructionExecuted(unsigned TokenID); + +#ifndef NDEBUG + void dump() const; +#endif +}; + +// \brief Implements the hardware dispatch logic. +// +// This class is responsible for the dispatch stage, in which instructions are +// dispatched in groups to the Scheduler. An instruction can be dispatched if +// functional units are available. 
+// To be more specific, an instruction can be dispatched to the Scheduler if:
+//  1) There are enough entries in the reorder buffer (implemented by class
+//     RetireControlUnit) to accommodate all opcodes.
+//  2) There are enough temporaries to rename output register operands.
+//  3) There are enough entries available in the used buffered resource(s).
+//
+// The number of micro opcodes that can be dispatched in one cycle is limited
+// by the value of field 'DispatchWidth'. A "dynamic dispatch stall" occurs
+// when processor resources are not available (i.e. at least one of the
+// above-mentioned checks fails). Dispatch stall events are counted during the
+// entire execution of the code, and displayed by the performance report when
+// flag '-verbose' is specified.
+//
+// If the number of micro opcodes of an instruction is bigger than
+// DispatchWidth, then it can only be dispatched at the beginning of one cycle.
+// The DispatchUnit will still have to wait for a number of cycles (depending
+// on the DispatchWidth and the number of micro opcodes) before it can serve
+// other instructions.
+class DispatchUnit {
+  unsigned DispatchWidth;
+  unsigned AvailableEntries;
+  unsigned CarryOver;
+  Scheduler *SC;
+
+  std::unique_ptr<RegisterFile> RAT;
+  std::unique_ptr<RetireControlUnit> RCU;
+  Backend *Owner;
+
+  /// Dispatch stall event identifiers.
+  ///
+  /// The naming convention is:
+  /// * Event names start with the "DS_" prefix.
+  /// * For dynamic dispatch stalls, the "DS_" prefix is followed by the
+  ///   acronym of the unavailable resource/functional unit (example: RAT).
+  /// * The last substring is the event reason (example: REG_UNAVAILABLE means
+  ///   that register renaming couldn't find enough spare registers in the
+  ///   register file).
+  ///
+  /// List of acronyms used for processor resources:
+  /// RAT - Register Alias Table (used by the register renaming logic)
+  /// RCU - Retire Control Unit
+  /// SQ  - Scheduler's Queue
+  /// LDQ - Load Queue
+  /// STQ - Store Queue
+  enum {
+    DS_RAT_REG_UNAVAILABLE,
+    DS_RCU_TOKEN_UNAVAILABLE,
+    DS_SQ_TOKEN_UNAVAILABLE,
+    DS_LDQ_TOKEN_UNAVAILABLE,
+    DS_STQ_TOKEN_UNAVAILABLE,
+    DS_DISPATCH_GROUP_RESTRICTION,
+    DS_LAST
+  };
+
+  // The DispatchUnit tracks dispatch stall events caused by the
+  // unavailability of hardware resources. Events are classified based on the
+  // stall kind, so there is a counter for every source of dispatch stall.
+  // Counters are stored in the vector `DispatchStalls`, which always has
+  // size DS_LAST.
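+  //
+  // For example (hypothetical counts), a report of
+  //   DS_RAT_REG_UNAVAILABLE = 4, DS_RCU_TOKEN_UNAVAILABLE = 0, ...
+  // would mean that dispatch stalled four times because register renaming
+  // ran out of temporary registers.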
+  std::vector<unsigned> DispatchStalls;
+
+  bool checkRAT(const InstrDesc &Desc);
+  bool checkRCU(const InstrDesc &Desc);
+  bool checkScheduler(const InstrDesc &Desc);
+
+  void notifyInstructionDispatched(unsigned IID);
+
+public:
+  DispatchUnit(Backend *B, const llvm::MCRegisterInfo &MRI,
+               unsigned MicroOpBufferSize, unsigned RegisterFileSize,
+               unsigned MaxRetirePerCycle, unsigned MaxDispatchWidth,
+               Scheduler *Sched)
+      : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
+        CarryOver(0U), SC(Sched),
+        RAT(llvm::make_unique<RegisterFile>(MRI, RegisterFileSize)),
+        RCU(llvm::make_unique<RetireControlUnit>(MicroOpBufferSize,
+                                                 MaxRetirePerCycle, this)),
+        Owner(B), DispatchStalls(DS_LAST, 0) {}
+
+  unsigned getDispatchWidth() const { return DispatchWidth; }
+
+  bool isAvailable(unsigned NumEntries) const {
+    return NumEntries <= AvailableEntries || AvailableEntries == DispatchWidth;
+  }
+
+  bool isRCUEmpty() const { return RCU->isEmpty(); }
+
+  bool canDispatch(const InstrDesc &Desc) {
+    assert(isAvailable(Desc.NumMicroOps));
+    return checkRCU(Desc) && checkRAT(Desc) && checkScheduler(Desc);
+  }
+
+  unsigned dispatch(unsigned IID, Instruction *NewInst);
+
+  void collectWrites(llvm::SmallVectorImpl<WriteState *> &Vec,
+                     unsigned RegID) const {
+    return RAT->collectWrites(Vec, RegID);
+  }
+  unsigned getNumRATStalls() const {
+    return DispatchStalls[DS_RAT_REG_UNAVAILABLE];
+  }
+  unsigned getNumRCUStalls() const {
+    return DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumSQStalls() const {
+    return DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumLDQStalls() const {
+    return DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumSTQStalls() const {
+    return DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
+  }
+  unsigned getNumDispatchGroupStalls() const {
+    return DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION];
+  }
+  unsigned getMaxUsedRegisterMappings() const {
+    return RAT->getMaxUsedRegisterMappings();
+  }
+  unsigned getTotalRegisterMappingsCreated() const {
+    return RAT->getTotalRegisterMappingsCreated();
+  }
+  void addNewRegisterMapping(WriteState &WS) { RAT->addRegisterMapping(WS); }
+
+  void cycleEvent(unsigned Cycle) {
+    RCU->cycleEvent();
+    AvailableEntries =
+        CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
+    CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U;
+  }
+
+  void notifyInstructionRetired(unsigned Index);
+
+  void onInstructionExecuted(unsigned TokenID) {
+    RCU->onInstructionExecuted(TokenID);
+  }
+
+  void invalidateRegisterMappings(const Instruction &Inst);
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/Dispatch.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/Dispatch.cpp
+++ llvm/trunk/tools/llvm-mca/Dispatch.cpp
@@ -0,0 +1,268 @@
+//===--------------------- Dispatch.cpp -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods declared by classes RegisterFile,
+/// DispatchUnit, and RetireControlUnit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Dispatch.h"
+#include "Backend.h"
+#include "Scheduler.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+void RegisterFile::addRegisterMapping(WriteState &WS) {
+  unsigned RegID = WS.getRegisterID();
+  assert(RegID && "Adding an invalid register definition?");
+
+  RegisterMappings[RegID] = &WS;
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I] = &WS;
+  if (MaxUsedMappings == NumUsedMappings)
+    MaxUsedMappings++;
+  NumUsedMappings++;
+  TotalMappingsCreated++;
+  // If this is a partial update, then we are done.
+  if (!WS.fullyUpdatesSuperRegs())
+    return;
+
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    RegisterMappings[*I] = &WS;
+}
+
+void RegisterFile::invalidateRegisterMapping(const WriteState &WS) {
+  unsigned RegID = WS.getRegisterID();
+  bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs();
+
+  assert(RegID != 0 && "Invalidating an already invalid register?");
+  assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
+         "Invalidating a write of unknown cycles!");
+  assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
+  if (!RegisterMappings[RegID])
+    return;
+
+  assert(NumUsedMappings);
+  NumUsedMappings--;
+
+  if (RegisterMappings[RegID] == &WS)
+    RegisterMappings[RegID] = nullptr;
+
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    if (RegisterMappings[*I] == &WS)
+      RegisterMappings[*I] = nullptr;
+
+  if (!ShouldInvalidateSuperRegs)
+    return;
+
+  for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
+    if (RegisterMappings[*I] == &WS)
+      RegisterMappings[*I] = nullptr;
+}
+
+// Updates the number of used mappings when an instruction is retired.
+// This method delegates to the register file the task of invalidating
+// register mappings that were created for instruction IS.
+void DispatchUnit::invalidateRegisterMappings(const Instruction &IS) {
+  for (const std::unique_ptr<WriteState> &WS : IS.getDefs()) {
+    DEBUG(dbgs() << "[RAT] Invalidating mapping for: ");
+    DEBUG(WS->dump());
+    RAT->invalidateRegisterMapping(*WS.get());
+  }
+}
+
+void RegisterFile::collectWrites(SmallVectorImpl<WriteState *> &Writes,
+                                 unsigned RegID) const {
+  assert(RegID && RegID < RegisterMappings.size());
+  WriteState *WS = RegisterMappings[RegID];
+  if (WS) {
+    DEBUG(dbgs() << "Found a dependent use of RegID=" << RegID << '\n');
+    Writes.push_back(WS);
+  }
+
+  // Handle potential partial register updates.
+  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+    WS = RegisterMappings[*I];
+    if (WS && std::find(Writes.begin(), Writes.end(), WS) == Writes.end()) {
+      DEBUG(dbgs() << "Found a dependent use of subReg " << *I << " (part of "
+                   << RegID << ")\n");
+      Writes.push_back(WS);
+    }
+  }
+}
+
+bool RegisterFile::isAvailable(unsigned NumRegWrites) {
+  if (!TotalMappings)
+    return true;
+  if (NumRegWrites > TotalMappings) {
+    // The user specified too small a number of registers.
+    // Artificially set the number of temporaries to NumRegWrites.
+    errs() << "warning: not enough temporaries in the register file. "
" + << "The register file size has been automatically increased to " + << NumRegWrites << '\n'; + TotalMappings = NumRegWrites; + } + + return NumRegWrites + NumUsedMappings <= TotalMappings; +} + +#ifndef NDEBUG +void RegisterFile::dump() const { + for (unsigned I = 0, E = MRI.getNumRegs(); I < E; ++I) + if (RegisterMappings[I]) { + dbgs() << MRI.getName(I) << ", " << I << ", "; + RegisterMappings[I]->dump(); + } + + dbgs() << "TotalMappingsCreated: " << TotalMappingsCreated + << ", MaxUsedMappings: " << MaxUsedMappings + << ", NumUsedMappings: " << NumUsedMappings << '\n'; +} +#endif + +// Reserves a number of slots, and returns a new token. +unsigned RetireControlUnit::reserveSlot(unsigned Index, unsigned NumMicroOps) { + assert(isAvailable(NumMicroOps)); + unsigned NormalizedQuantity = + std::min(NumMicroOps, static_cast(Queue.size())); + // Zero latency instructions may have zero mOps. Artificially bump this + // value to 1. Although zero latency instructions don't consume scheduler + // resources, they still consume one slot in the retire queue. + NormalizedQuantity = std::max(NormalizedQuantity, 1U); + unsigned TokenID = NextAvailableSlotIdx; + Queue[NextAvailableSlotIdx] = {Index, NormalizedQuantity, false}; + NextAvailableSlotIdx += NormalizedQuantity; + NextAvailableSlotIdx %= Queue.size(); + AvailableSlots -= NormalizedQuantity; + return TokenID; +} + +void DispatchUnit::notifyInstructionDispatched(unsigned Index) { + Owner->notifyInstructionDispatched(Index); +} + +void DispatchUnit::notifyInstructionRetired(unsigned Index) { + Owner->notifyInstructionRetired(Index); +} + +void RetireControlUnit::cycleEvent() { + if (isEmpty()) + return; + + unsigned NumRetired = 0; + while (!isEmpty()) { + if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle) + break; + RUToken &Current = Queue[CurrentInstructionSlotIdx]; + assert(Current.NumSlots && "Reserved zero slots?"); + if (!Current.Executed) + break; + Owner->notifyInstructionRetired(Current.Index); + CurrentInstructionSlotIdx += Current.NumSlots; + CurrentInstructionSlotIdx %= Queue.size(); + AvailableSlots += Current.NumSlots; + NumRetired++; + } +} + +void RetireControlUnit::onInstructionExecuted(unsigned TokenID) { + assert(Queue.size() > TokenID); + assert(Queue[TokenID].Executed == false && Queue[TokenID].Index != ~0U); + Queue[TokenID].Executed = true; +} + +#ifndef NDEBUG +void RetireControlUnit::dump() const { + dbgs() << "Retire Unit: { Total Slots=" << Queue.size() + << ", Available Slots=" << AvailableSlots << " }\n"; +} +#endif + +bool DispatchUnit::checkRAT(const InstrDesc &Desc) { + unsigned NumWrites = Desc.Writes.size(); + if (RAT->isAvailable(NumWrites)) + return true; + DispatchStalls[DS_RAT_REG_UNAVAILABLE]++; + return false; +} + +bool DispatchUnit::checkRCU(const InstrDesc &Desc) { + unsigned NumMicroOps = Desc.NumMicroOps; + if (RCU->isAvailable(NumMicroOps)) + return true; + DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE]++; + return false; +} + +bool DispatchUnit::checkScheduler(const InstrDesc &Desc) { + // If this is a zero-latency instruction, then it bypasses + // the scheduler. 
+  switch (SC->canBeDispatched(Desc)) {
+  case Scheduler::HWS_AVAILABLE:
+    return true;
+  case Scheduler::HWS_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_LD_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_ST_QUEUE_UNAVAILABLE:
+    DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE]++;
+    break;
+  case Scheduler::HWS_DISPATCH_GROUP_RESTRICTION:
+    DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION]++;
+  }
+
+  return false;
+}
+
+unsigned DispatchUnit::dispatch(unsigned IID, Instruction *NewInst) {
+  assert(!CarryOver && "Cannot dispatch another instruction!");
+  unsigned NumMicroOps = NewInst->getDesc().NumMicroOps;
+  if (NumMicroOps > DispatchWidth) {
+    assert(AvailableEntries == DispatchWidth);
+    AvailableEntries = 0;
+    CarryOver = NumMicroOps - DispatchWidth;
+  } else {
+    assert(AvailableEntries >= NumMicroOps);
+    AvailableEntries -= NumMicroOps;
+  }
+
+  // Reserve slots in the RCU.
+  unsigned RCUTokenID = RCU->reserveSlot(IID, NumMicroOps);
+  Owner->notifyInstructionDispatched(IID);
+
+  SC->scheduleInstruction(IID, NewInst);
+  return RCUTokenID;
+}
+
+#ifndef NDEBUG
+void DispatchUnit::dump() const {
+  RAT->dump();
+  RCU->dump();
+
+  unsigned DSRAT = DispatchStalls[DS_RAT_REG_UNAVAILABLE];
+  unsigned DSRCU = DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
+  unsigned DSSCHEDQ = DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
+  unsigned DSLQ = DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
+  unsigned DSSQ = DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
+
+  dbgs() << "STALLS --- RAT: " << DSRAT << ", RCU: " << DSRCU
+         << ", SCHED_QUEUE: " << DSSCHEDQ << ", LOAD_QUEUE: " << DSLQ
+         << ", STORE_QUEUE: " << DSSQ << '\n';
+}
+#endif
+
+} // namespace mca
Index: llvm/trunk/tools/llvm-mca/HWEventListener.h
===================================================================
--- llvm/trunk/tools/llvm-mca/HWEventListener.h
+++ llvm/trunk/tools/llvm-mca/HWEventListener.h
@@ -0,0 +1,50 @@
+
+//===----------------------- HWEventListener.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the main interface for hardware event listeners.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include <utility>
+
+namespace mca {
+
+struct HWEventListener {
+  // Events generated by the Retire Control Unit.
+  virtual void onInstructionRetired(unsigned Index) {}
+
+  // Events generated by the Scheduler.
+  using ResourceRef = std::pair<uint64_t, uint64_t>;
+  virtual void
+  onInstructionIssued(unsigned Index,
+                      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>>
+                          &Used) {}
+  virtual void onInstructionExecuted(unsigned Index) {}
+  virtual void onInstructionReady(unsigned Index) {}
+  virtual void onResourceAvailable(const ResourceRef &RRef) {}
+
+  // Events generated by the Dispatch logic.
+  virtual void onInstructionDispatched(unsigned Index) {}
+
+  // Generic events generated by the Backend.
+ virtual void onCycleBegin(unsigned Cycle) {} + virtual void onCycleEnd(unsigned Cycle) {} + + virtual ~HWEventListener() = default; + virtual void anchor(); +}; + +} // namespace mca + +#endif Index: llvm/trunk/tools/llvm-mca/HWEventListener.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/HWEventListener.cpp +++ llvm/trunk/tools/llvm-mca/HWEventListener.cpp @@ -0,0 +1,22 @@ +//===----------------------- HWEventListener.cpp ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines a vtable anchor for struct HWEventListener. +/// +//===----------------------------------------------------------------------===// + +#include "HWEventListener.h" + +namespace mca { + +// Anchor the vtable here. +void HWEventListener::anchor() {} + +} // namespace mca Index: llvm/trunk/tools/llvm-mca/InstrBuilder.h =================================================================== --- llvm/trunk/tools/llvm-mca/InstrBuilder.h +++ llvm/trunk/tools/llvm-mca/InstrBuilder.h @@ -0,0 +1,62 @@ +//===--------------------- InstrBuilder.h -----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A builder class for instructions that are statically analyzed by llvm-mca. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H +#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H + +#include "Dispatch.h" +#include "Instruction.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace mca { + +class DispatchUnit; + +/// \brief A builder class that knows how to construct Instruction objects. +/// +/// Every llvm-mca Instruction is described by an object of class InstrDesc. +/// An InstrDesc describes which registers are read/written by the instruction, +/// as well as the instruction latency and hardware resources consumed. +/// +/// This class is used by the tool to construct Instructions and instruction +/// descriptors (i.e. InstrDesc objects). +/// Information from the machine scheduling model is used to identify processor +/// resources that are consumed by an instruction. 
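+///
+/// A minimal usage sketch (hypothetical driver code; MCII, STI, DU and the
+/// resource masks are assumed to be initialized elsewhere from the target):
+///
+/// \code
+///   InstrBuilder IB(*MCII, ProcResourceMasks);
+///   unsigned Idx = 0;
+///   for (const MCInst &MCI : SourceInstructions)
+///     Instruction *IS = IB.createInstruction(*STI, DU, Idx++, MCI);
+/// \endcode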
+class InstrBuilder {
+  const llvm::MCInstrInfo &MCII;
+  const llvm::ArrayRef<uint64_t> ProcResourceMasks;
+
+  llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
+  llvm::DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;
+
+  void createInstrDescImpl(const llvm::MCSubtargetInfo &STI,
+                           const llvm::MCInst &MCI);
+
+public:
+  InstrBuilder(const llvm::MCInstrInfo &mcii,
+               const llvm::ArrayRef<uint64_t> Masks)
+      : MCII(mcii), ProcResourceMasks(Masks) {}
+
+  const InstrDesc &getOrCreateInstrDesc(const llvm::MCSubtargetInfo &STI,
+                                        const llvm::MCInst &MCI);
+
+  Instruction *createInstruction(const llvm::MCSubtargetInfo &STI,
+                                 DispatchUnit &DU, unsigned Idx,
+                                 const llvm::MCInst &MCI);
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/InstrBuilder.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/InstrBuilder.cpp
+++ llvm/trunk/tools/llvm-mca/InstrBuilder.cpp
@@ -0,0 +1,525 @@
+//===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements the InstrBuilder interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstrBuilder.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+using namespace llvm;
+
+static void
+initializeUsedResources(InstrDesc &ID, const MCSchedClassDesc &SCDesc,
+                        const MCSubtargetInfo &STI,
+                        const ArrayRef<uint64_t> ProcResourceMasks) {
+  const MCSchedModel &SM = STI.getSchedModel();
+
+  // Populate resources consumed.
+  using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
+  std::vector<ResourcePlusCycles> Worklist;
+  for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) {
+    const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I;
+    const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
+    uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
+    if (PR.BufferSize != -1)
+      ID.Buffers.push_back(Mask);
+    CycleSegment RCy(0, PRE->Cycles, false);
+    Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
+  }
+
+  // Sort elements by mask popcount, so that we prioritize resource units over
+  // resource groups, and smaller groups over larger groups.
+  std::sort(Worklist.begin(), Worklist.end(),
+            [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) {
+              unsigned popcntA = countPopulation(A.first);
+              unsigned popcntB = countPopulation(B.first);
+              if (popcntA < popcntB)
+                return true;
+              if (popcntA > popcntB)
+                return false;
+              return A.first < B.first;
+            });
+
+  uint64_t UsedResourceUnits = 0;
+
+  // Remove cycles contributed by smaller resources.
+  for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
+    ResourcePlusCycles &A = Worklist[I];
+    if (!A.second.size()) {
+      A.second.NumUnits = 0;
+      A.second.setReserved();
+      ID.Resources.emplace_back(A);
+      continue;
+    }
+
+    ID.Resources.emplace_back(A);
+    uint64_t NormalizedMask = A.first;
+    if (countPopulation(A.first) == 1) {
+      UsedResourceUnits |= A.first;
+    } else {
+      // Remove the leading 1 from the resource group mask.
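+      // For example (hypothetical masks): a two-unit group mask 0b0110
+      // becomes 0b0010 once its most significant bit is cleared, which lets
+      // the containment check below match it against other resource masks.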
+      NormalizedMask ^= PowerOf2Floor(NormalizedMask);
+    }
+
+    for (unsigned J = I + 1; J < E; ++J) {
+      ResourcePlusCycles &B = Worklist[J];
+      if ((NormalizedMask & B.first) == NormalizedMask) {
+        B.second.CS.Subtract(A.second.size());
+        if (countPopulation(B.first) > 1)
+          B.second.NumUnits++;
+      }
+    }
+  }
+
+  // A SchedWrite may specify a number of cycles in which a resource group
+  // is reserved. For example (on target x86; cpu Haswell):
+  //
+  //  SchedWriteRes<[HWPort0, HWPort1, HWPort01]> {
+  //    let ResourceCycles = [2, 2, 3];
+  //  }
+  //
+  // This means:
+  // Resource units HWPort0 and HWPort1 are both used for 2cy.
+  // Resource group HWPort01 is the union of HWPort0 and HWPort1.
+  // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01
+  // will not be usable for 2 entire cycles from instruction issue.
+  //
+  // On top of those 2cy, SchedWriteRes explicitly specifies an extra latency
+  // of 3 cycles for HWPort01. This tool assumes that the 3cy latency is an
+  // extra delay on top of the 2 cycles latency.
+  // During those extra cycles, HWPort01 is not usable by other instructions.
+  for (ResourcePlusCycles &RPC : ID.Resources) {
+    if (countPopulation(RPC.first) > 1 && !RPC.second.isReserved()) {
+      // Remove the leading 1 from the resource group mask.
+      uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
+      if ((Mask & UsedResourceUnits) == Mask)
+        RPC.second.setReserved();
+    }
+  }
+
+  DEBUG(
+    for (const std::pair<uint64_t, ResourceUsage> &R : ID.Resources)
+      dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
+    for (const uint64_t R : ID.Buffers)
+      dbgs() << "\t\tBuffer Mask=" << R << '\n';
+  );
+}
+
+static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
+                              const MCSchedClassDesc &SCDesc,
+                              const MCSubtargetInfo &STI) {
+  unsigned MaxLatency = 0;
+  unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
+  for (unsigned I = 0, E = NumWriteLatencyEntries; I < E; ++I) {
+    int Cycles = STI.getWriteLatencyEntry(&SCDesc, I)->Cycles;
+    // Check if this is an unknown latency. Conservatively (pessimistically)
+    // assume a latency of 100cy if the entry is unknown.
+    if (Cycles == -1)
+      Cycles = 100;
+    MaxLatency = std::max(MaxLatency, static_cast<unsigned>(Cycles));
+  }
+
+  if (MCDesc.isCall()) {
+    // We cannot estimate how long this call will take.
+    // Artificially set an arbitrarily high latency (100cy).
+    MaxLatency = std::max(100U, MaxLatency);
+  }
+
+  // If the latency is still unknown, then conservatively set it equal to the
+  // maximum number of cycles for a resource consumed by this instruction.
+  if (!MaxLatency && ID.Resources.size()) {
+    // Check if this instruction consumes any processor resources.
+    // If so, then compute the max of the cycles spent for each resource, and
+    // use it as the MaxLatency.
+    for (const std::pair<uint64_t, ResourceUsage> &Resource : ID.Resources)
+      MaxLatency = std::max(MaxLatency, Resource.second.size());
+  }
+
+  if (SCDesc.isVariant() && MaxLatency == 0) {
+    errs() << "note: unknown latency for a variant opcode. Conservatively"
+           << " assume a default latency of 1cy.\n";
+    MaxLatency = 1;
+  }
+
+  ID.MaxLatency = MaxLatency;
+}
+
+static void populateWrites(InstrDesc &ID, const MCInst &MCI,
+                           const MCInstrDesc &MCDesc,
+                           const MCSchedClassDesc &SCDesc,
+                           const MCSubtargetInfo &STI) {
+  computeMaxLatency(ID, MCDesc, SCDesc, STI);
+
+  // Set if writes through this opcode may update super registers.
+  // TODO: on x86-64, a 4 byte write of a general purpose register always
+  // fully updates the super-register.
+  // More generally (at least on x86), not all register writes perform
+  // a partial (super-)register update.
+  // For example, an AVX instruction that writes to an XMM register implicitly
+  // zeroes the upper half of every aliasing super-register.
+  //
+  // For now, we pessimistically assume that writes are all potentially
+  // partial register updates. This is a good default for most targets, except
+  // for those like x86 which implement special semantics for certain opcodes.
+  // At least on x86, this may lead to an inaccurate prediction of the
+  // instruction level parallelism.
+  bool FullyUpdatesSuperRegisters = false;
+
+  // Now populate Writes.
+
+  // This algorithm currently works under the strong (and potentially
+  // incorrect) assumption that information related to register def/uses can
+  // be obtained from MCInstrDesc.
+  //
+  // However, class MCInstrDesc is used to describe MachineInstr objects and
+  // not MCInst objects. To be more specific, MCInstrDesc objects are opcode
+  // descriptors that are automatically generated via tablegen, based on the
+  // instruction set information available from the target .td files. That
+  // means the number of (explicit) definitions according to MCInstrDesc
+  // always matches the cardinality of the `(outs)` set in tablegen.
+  //
+  // By construction, definitions must appear first in the operand sequence of
+  // a MachineInstr. Also, the (outs) sequence is preserved (example: the first
+  // element in the outs set is the first operand in the corresponding
+  // MachineInstr). That's the reason why MCInstrDesc only needs to declare the
+  // total number of register definitions, and not where those definitions are
+  // in the machine operand sequence.
+  //
+  // Unfortunately, it is not safe to use the information from MCInstrDesc to
+  // also describe MCInst objects. An MCInst object can be obtained from a
+  // MachineInstr through a lowering step which may restructure the operand
+  // sequence (and even remove or introduce new operands). So, there is a high
+  // risk that the lowering step breaks the assumption that register
+  // definitions are always at the beginning of the machine operand sequence.
+  //
+  // This is a fundamental problem, and it is still an open problem.
+  // Essentially, we have to find a way to correlate def/use operands of a
+  // MachineInstr to operands of an MCInst. Otherwise, we cannot correctly
+  // reconstruct data dependencies, nor can we correctly interpret the
+  // scheduling model, which heavily uses machine operand indices to define
+  // processor read-advance information, and to identify processor write
+  // resources. Essentially, we either need something like an MCInstrDesc, but
+  // for MCInst, or a way to map MCInst operands back to MachineInstr operands.
+  //
+  // Unfortunately, we don't have that information now. So, this prototype
+  // currently works under the strong assumption that we can always safely
+  // trust the content of an MCInstrDesc. For example, we can query an
+  // MCInstrDesc to obtain the number of explicit and implicit register
+  // definitions. We also assume that register definitions always come first
+  // in the operand sequence. This last assumption usually makes sense for
+  // MachineInstr, where register definitions always appear at the beginning
+  // of the operand sequence.
+  // In reality, these assumptions could be broken by the lowering step, which
+  // can decide to lay out operands in a different order than the original
+  // order of operands in the MachineInstr.
+  //
+  // Things get even more complicated in the presence of "optional" register
+  // definitions. For MachineInstr, optional register definitions are always
+  // at the end of the operand sequence. Some ARM instructions that may update
+  // the status flags specify that register as an optional operand. Since we
+  // don't have operand descriptors for MCInst, we assume for now that the
+  // optional definition is always the last operand of an MCInst. Again, this
+  // assumption may be okay for most targets. However, there is no guarantee
+  // that targets would respect that.
+  //
+  // In conclusion, these are for now the strong assumptions made by the tool:
+  //  * The number of explicit and implicit register definitions in an MCInst
+  //    matches the number of explicit and implicit definitions according to
+  //    the opcode descriptor (MCInstrDesc).
+  //  * Register definitions take precedence over register uses in the
+  //    operands list.
+  //  * If an opcode specifies an optional definition, then the optional
+  //    definition is always the last operand in the sequence, and it can be
+  //    set to zero (i.e. "no register").
+  //
+  // These assumptions work quite well for most out-of-order in-tree targets
+  // like x86. This is mainly because the vast majority of instructions are
+  // expanded to MCInst using a straightforward lowering logic that preserves
+  // the ordering of the operands.
+  //
+  // In the longer term, we need to find a proper solution for this issue.
+  unsigned NumExplicitDefs = MCDesc.getNumDefs();
+  unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
+  unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
+  unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
+  if (MCDesc.hasOptionalDef())
+    TotalDefs++;
+  ID.Writes.resize(TotalDefs);
+  // Iterate over the operands list, and skip non-register operands.
+  // The first NumExplicitDefs register operands are expected to be register
+  // definitions.
+  unsigned CurrentDef = 0;
+  unsigned i = 0;
+  for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) {
+    const MCOperand &Op = MCI.getOperand(i);
+    if (!Op.isReg())
+      continue;
+
+    WriteDescriptor &Write = ID.Writes[CurrentDef];
+    Write.OpIndex = i;
+    if (CurrentDef < NumWriteLatencyEntries) {
+      const MCWriteLatencyEntry &WLE =
+          *STI.getWriteLatencyEntry(&SCDesc, CurrentDef);
+      // Conservatively default to MaxLatency.
+      Write.Latency = WLE.Cycles == -1 ? ID.MaxLatency : WLE.Cycles;
+      Write.SClassOrWriteResourceID = WLE.WriteResourceID;
+    } else {
+      // Assign a default latency for this write.
+      Write.Latency = ID.MaxLatency;
+      Write.SClassOrWriteResourceID = 0;
+    }
+    Write.FullyUpdatesSuperRegs = FullyUpdatesSuperRegisters;
+    Write.IsOptionalDef = false;
+    DEBUG(
+      dbgs() << "\t\tOpIdx=" << Write.OpIndex
+             << ", Latency=" << Write.Latency << ", WriteResourceID="
+             << Write.SClassOrWriteResourceID << '\n';
+    );
+    CurrentDef++;
+  }
+
+  if (CurrentDef != NumExplicitDefs)
+    llvm::report_fatal_error(
+        "error: Expected more register operand definitions.");
"); + + CurrentDef = 0; + for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) { + unsigned Index = NumExplicitDefs + CurrentDef; + WriteDescriptor &Write = ID.Writes[Index]; + Write.OpIndex = -1; + Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef]; + Write.Latency = ID.MaxLatency; + Write.SClassOrWriteResourceID = 0; + Write.IsOptionalDef = false; + assert(Write.RegisterID != 0 && "Expected a valid phys register!"); + DEBUG(dbgs() << "\t\tOpIdx=" << Write.OpIndex << ", PhysReg=" + << Write.RegisterID << ", Latency=" << Write.Latency + << ", WriteResourceID=" << Write.SClassOrWriteResourceID + << '\n'); + } + + if (MCDesc.hasOptionalDef()) { + // Always assume that the optional definition is the last operand of the + // MCInst sequence. + const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1); + if (i == MCI.getNumOperands() || !Op.isReg()) + llvm::report_fatal_error( + "error: expected a register operand for an optional " + "definition. Instruction has not be correctly analyzed.\n", + false); + + WriteDescriptor &Write = ID.Writes[TotalDefs - 1]; + Write.OpIndex = MCI.getNumOperands() - 1; + // Assign a default latency for this write. + Write.Latency = ID.MaxLatency; + Write.SClassOrWriteResourceID = 0; + Write.IsOptionalDef = true; + } +} + +static void populateReads(InstrDesc &ID, const MCInst &MCI, + const MCInstrDesc &MCDesc, + const MCSchedClassDesc &SCDesc, + const MCSubtargetInfo &STI) { + unsigned SchedClassID = MCDesc.getSchedClass(); + bool HasReadAdvanceEntries = SCDesc.NumReadAdvanceEntries > 0; + + unsigned i = 0; + unsigned NumExplicitDefs = MCDesc.getNumDefs(); + // Skip explicit definitions. + for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) { + const MCOperand &Op = MCI.getOperand(i); + if (Op.isReg()) + NumExplicitDefs--; + } + + if (NumExplicitDefs) + llvm::report_fatal_error( + "error: Expected more register operand definitions. ", false); + + unsigned NumExplicitUses = MCI.getNumOperands() - i; + unsigned NumImplicitUses = MCDesc.getNumImplicitUses(); + if (MCDesc.hasOptionalDef()) { + assert(NumExplicitUses); + NumExplicitUses--; + } + unsigned TotalUses = NumExplicitUses + NumImplicitUses; + if (!TotalUses) + return; + + ID.Reads.resize(TotalUses); + for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) { + ReadDescriptor &Read = ID.Reads[CurrentUse]; + Read.OpIndex = i + CurrentUse; + Read.HasReadAdvanceEntries = HasReadAdvanceEntries; + Read.SchedClassID = SchedClassID; + DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex); + } + + for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) { + ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse]; + Read.OpIndex = -1; + Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse]; + Read.HasReadAdvanceEntries = false; + Read.SchedClassID = SchedClassID; + DEBUG(dbgs() << "\t\tOpIdx=" << Read.OpIndex + << ", RegisterID=" << Read.RegisterID << '\n'); + } +} + +void InstrBuilder::createInstrDescImpl(const MCSubtargetInfo &STI, + const MCInst &MCI) { + assert(STI.getSchedModel().hasInstrSchedModel() && + "Itineraries are not yet supported!"); + + unsigned short Opcode = MCI.getOpcode(); + // Obtain the instruction descriptor from the opcode. + const MCInstrDesc &MCDesc = MCII.get(Opcode); + const MCSchedModel &SM = STI.getSchedModel(); + + // Then obtain the scheduling class information from the instruction. + const MCSchedClassDesc &SCDesc = + *SM.getSchedClassDesc(MCDesc.getSchedClass()); + + // Create a new empty descriptor. 
+  InstrDesc *ID = new InstrDesc();
+
+  if (SCDesc.isVariant()) {
+    errs() << "warning: don't know how to model variant opcodes.\n"
+           << "note: assume 1 micro opcode.\n";
+    ID->NumMicroOps = 1U;
+  } else {
+    ID->NumMicroOps = SCDesc.NumMicroOps;
+  }
+
+  if (MCDesc.isCall()) {
+    // We don't correctly model calls.
+    errs() << "warning: found a call in the input assembly sequence.\n"
+           << "note: call instructions are not correctly modeled. Assume a "
+              "latency of 100cy.\n";
+  }
+
+  if (MCDesc.isReturn()) {
+    errs() << "warning: found a return instruction in the input assembly "
+              "sequence.\n"
+           << "note: program counter updates are ignored.\n";
+  }
+
+  ID->MayLoad = MCDesc.mayLoad();
+  ID->MayStore = MCDesc.mayStore();
+  ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
+
+  initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
+  populateWrites(*ID, MCI, MCDesc, SCDesc, STI);
+  populateReads(*ID, MCI, MCDesc, SCDesc, STI);
+
+  DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
+  DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
+
+  // Now add the new descriptor.
+  Descriptors[Opcode] = std::unique_ptr<const InstrDesc>(ID);
+}
+
+const InstrDesc &InstrBuilder::getOrCreateInstrDesc(const MCSubtargetInfo &STI,
+                                                    const MCInst &MCI) {
+  auto it = Descriptors.find(MCI.getOpcode());
+  if (it == Descriptors.end())
+    createInstrDescImpl(STI, MCI);
+  return *Descriptors[MCI.getOpcode()].get();
+}
+
+Instruction *InstrBuilder::createInstruction(const MCSubtargetInfo &STI,
+                                             DispatchUnit &DU, unsigned Idx,
+                                             const MCInst &MCI) {
+  const InstrDesc &D = getOrCreateInstrDesc(STI, MCI);
+  Instruction *NewIS = new Instruction(D);
+
+  // Populate Reads first.
+  const MCSchedModel &SM = STI.getSchedModel();
+  SmallVector<WriteState *, 4> DependentWrites;
+  for (const ReadDescriptor &RD : D.Reads) {
+    int RegID = -1;
+    if (RD.OpIndex != -1) {
+      // Explicit read.
+      const MCOperand &Op = MCI.getOperand(RD.OpIndex);
+      // Skip non-register operands.
+      if (!Op.isReg())
+        continue;
+      RegID = Op.getReg();
+    } else {
+      // Implicit read.
+      RegID = RD.RegisterID;
+    }
+
+    // Skip invalid register operands.
+    if (!RegID)
+      continue;
+
+    // Okay, this is a register operand. Create a ReadState for it.
+    assert(RegID > 0 && "Invalid register ID found!");
+    ReadState *NewRDS = new ReadState(RD);
+    NewIS->getUses().emplace_back(std::unique_ptr<ReadState>(NewRDS));
+    DU.collectWrites(DependentWrites, RegID);
+    NewRDS->setDependentWrites(DependentWrites.size());
+    DEBUG(dbgs() << "Found " << DependentWrites.size()
+                 << " dependent writes\n");
+
+    // We know that this read depends on all the writes in DependentWrites.
+    // For each write, check if we have ReadAdvance information, and use it
+    // to figure out after how many cycles this read becomes available.
+    if (!RD.HasReadAdvanceEntries) {
+      for (WriteState *WS : DependentWrites)
+        WS->addUser(NewRDS, /* ReadAdvance */ 0);
+      // Prepare the set for another round.
+      DependentWrites.clear();
+      continue;
+    }
+
+    const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
+    for (WriteState *WS : DependentWrites) {
+      unsigned WriteResID = WS->getWriteResourceID();
+      int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.OpIndex, WriteResID);
+      WS->addUser(NewRDS, ReadAdvance);
+    }
+
+    // Prepare the set for another round.
+    DependentWrites.clear();
+  }
+
+  // Now populate writes.
+  for (const WriteDescriptor &WD : D.Writes) {
+    unsigned RegID =
+        WD.OpIndex == -1 ? WD.RegisterID : MCI.getOperand(WD.OpIndex).getReg();
+    assert((RegID || WD.IsOptionalDef) && "Expected a valid register ID!");
+    // Special case where this is an optional definition, and the actual
+    // register is 0.
+    if (WD.IsOptionalDef && !RegID)
+      continue;
+
+    WriteState *NewWS = new WriteState(WD);
+    NewIS->getDefs().emplace_back(std::unique_ptr<WriteState>(NewWS));
+    NewWS->setRegisterID(RegID);
+    DU.addNewRegisterMapping(*NewWS);
+  }
+
+  // Update Latency.
+  NewIS->setCyclesLeft(D.MaxLatency);
+  return NewIS;
+}
+
+} // namespace mca
Index: llvm/trunk/tools/llvm-mca/Instruction.h
===================================================================
--- llvm/trunk/tools/llvm-mca/Instruction.h
+++ llvm/trunk/tools/llvm-mca/Instruction.h
@@ -0,0 +1,336 @@
+//===--------------------- Instruction.h ------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines abstractions used by the Backend to model register reads,
+/// register writes and instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
+
+#include "llvm/Support/MathExtras.h"
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace mca {
+
+struct WriteDescriptor;
+struct ReadDescriptor;
+class WriteState;
+class ReadState;
+
+constexpr int UNKNOWN_CYCLES = -512;
+
+/// \brief A register write descriptor.
+struct WriteDescriptor {
+  int OpIndex; // Operand index. -1 if this is an implicit write.
+  // Write latency. Number of cycles before write-back stage.
+  int Latency;
+  // This field is set to a value different than zero only if this
+  // is an implicit definition.
+  unsigned RegisterID;
+  // True only if this write fully updates super-registers. If false, then
+  // this write generates a partial update of a super-register.
+  // On X86, this flag is left unset by byte/word writes to GPR registers.
+  // Also, a write of an XMM register only partially updates the corresponding
+  // YMM super-register if the write is associated with a legacy SSE
+  // instruction.
+  bool FullyUpdatesSuperRegs;
+  // Instruction itineraries would set this field to the SchedClass ID.
+  // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
+  // element associated with this write.
+  // When computing read latencies, this value is matched against the
+  // "ReadAdvance" information. The hardware backend may implement
+  // dedicated forwarding paths to quickly propagate write results to dependent
+  // instructions waiting in the reservation station (effectively bypassing the
+  // write-back stage).
+  unsigned SClassOrWriteResourceID;
+  // True only if this is a write obtained from an optional definition.
+  // Optional definitions are allowed to reference regID zero (i.e. "no
+  // register").
+  bool IsOptionalDef;
+};
+
+/// \brief A register read descriptor.
+struct ReadDescriptor {
+  // This field defaults to -1 if this is an implicit read.
+  int OpIndex;
+  // This field is only set if this is an implicit read.
+  unsigned RegisterID;
+  // Scheduling Class Index. It is used to query the scheduling model for the
+  // MCSchedClassDesc object.
+  unsigned SchedClassID;
+  // True if there may be a local forwarding logic in hardware to serve a
+  // write used by this read.
+  // This information, along with SchedClassID, is used at Instruction
+  // creation time to dynamically check if the input operands can benefit
+  // from a ReadAdvance bonus.
+  bool HasReadAdvanceEntries;
+};
+
+/// \brief Tracks uses of a register definition (e.g. register write).
+///
+/// Each implicit/explicit register write is associated with an instance of
+/// this class. A WriteState object tracks the dependent users of a
+/// register write. It also tracks how many cycles are left before the
+/// write-back stage.
+class WriteState {
+  const WriteDescriptor &WD;
+  // On instruction issue, this field is set equal to the write latency.
+  // Before instruction issue, this field defaults to UNKNOWN_CYCLES (-512),
+  // a special value that represents an "unknown" number of cycles.
+  int CyclesLeft;
+
+  // Actual register defined by this write. This field is only used
+  // to speed up queries on the register file.
+  // For implicit writes, this field always matches the value of
+  // field RegisterID from WD.
+  unsigned RegisterID;
+
+  // A set of dependent reads. A dependent read is added to the set only if
+  // CyclesLeft is "unknown". As soon as CyclesLeft becomes "known", each
+  // user in the set gets notified with the actual CyclesLeft.
+
+  // The 'second' element of a pair is a "ReadAdvance" number of cycles.
+  std::set<std::pair<ReadState *, int>> Users;
+
+public:
+  WriteState(const WriteDescriptor &Desc)
+      : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(Desc.RegisterID) {}
+  WriteState(const WriteState &Other) = delete;
+  WriteState &operator=(const WriteState &Other) = delete;
+
+  int getCyclesLeft() const { return CyclesLeft; }
+  unsigned getWriteResourceID() const { return WD.SClassOrWriteResourceID; }
+  unsigned getRegisterID() const { return RegisterID; }
+  void setRegisterID(unsigned ID) { RegisterID = ID; }
+
+  void addUser(ReadState *Use, int ReadAdvance);
+  bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; }
+  bool isWrittenBack() const { return CyclesLeft == 0; }
+
+  // On every cycle, update CyclesLeft and notify dependent users.
+  void cycleEvent();
+  void onInstructionIssued();
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+/// \brief Tracks register operand latency in cycles.
+///
+/// A read may be dependent on more than one write. This occurs when some
+/// writes only partially update the register associated with this read.
+class ReadState {
+  const ReadDescriptor &RD;
+  unsigned DependentWrites;
+  int CyclesLeft;
+  unsigned TotalCycles;
+
+public:
+  bool isReady() const {
+    if (DependentWrites)
+      return false;
+    return (CyclesLeft == UNKNOWN_CYCLES || CyclesLeft == 0);
+  }
+
+  ReadState(const ReadDescriptor &Desc)
+      : RD(Desc), DependentWrites(0), CyclesLeft(UNKNOWN_CYCLES),
+        TotalCycles(0) {}
+  ReadState(const ReadState &Other) = delete;
+  ReadState &operator=(const ReadState &Other) = delete;
+
+  const ReadDescriptor &getDescriptor() const { return RD; }
+  unsigned getSchedClass() const { return RD.SchedClassID; }
+  void cycleEvent();
+  void writeStartEvent(unsigned Cycles);
+  void setDependentWrites(unsigned Writes) { DependentWrites = Writes; }
+};
+
+/// \brief A sequence of cycles.
+///
+/// This class can be used as a building block to construct ranges of cycles.
+class CycleSegment {
+  unsigned Begin; // Inclusive.
+  unsigned End;   // Exclusive.
+  bool Reserved;  // Resources associated with this segment must be reserved.
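+  //
+  // Example (hypothetical segments): CycleSegment(0, 2) and CycleSegment(1, 3)
+  // overlap, while CycleSegment(0, 2) and CycleSegment(2, 4) do not.
+  // operator-- shifts a segment down by one cycle: [1, 3) becomes [0, 2).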
+
+public:
+  CycleSegment(unsigned StartCycle, unsigned EndCycle, bool IsReserved = false)
+      : Begin(StartCycle), End(EndCycle), Reserved(IsReserved) {}
+
+  bool contains(unsigned Cycle) const { return Cycle >= Begin && Cycle < End; }
+  bool startsAfter(const CycleSegment &CS) const { return End <= CS.Begin; }
+  bool endsBefore(const CycleSegment &CS) const { return Begin >= CS.End; }
+  bool overlaps(const CycleSegment &CS) const {
+    return !startsAfter(CS) && !endsBefore(CS);
+  }
+  bool isExecuting() const { return Begin == 0 && End != 0; }
+  bool isExecuted() const { return End == 0; }
+  bool operator<(const CycleSegment &Other) const {
+    return Begin < Other.Begin;
+  }
+  CycleSegment &operator--(void) {
+    if (Begin)
+      Begin--;
+    if (End)
+      End--;
+    return *this;
+  }
+
+  bool isValid() const { return Begin <= End; }
+  unsigned size() const { return End - Begin; }
+  void Subtract(unsigned Cycles) {
+    assert(End >= Cycles);
+    End -= Cycles;
+  }
+
+  unsigned begin() const { return Begin; }
+  unsigned end() const { return End; }
+  void setEnd(unsigned NewEnd) { End = NewEnd; }
+  bool isReserved() const { return Reserved; }
+  void setReserved() { Reserved = true; }
+};
+
+/// \brief Helper used by class InstrDesc to describe how hardware resources
+/// are used.
+///
+/// This class describes how many resource units of a specific resource kind
+/// (and for how many cycles) are "used" by an instruction.
+struct ResourceUsage {
+  CycleSegment CS;
+  unsigned NumUnits;
+  ResourceUsage(CycleSegment Cycles, unsigned Units = 1)
+      : CS(Cycles), NumUnits(Units) {}
+  unsigned size() const { return CS.size(); }
+  bool isReserved() const { return CS.isReserved(); }
+  void setReserved() { CS.setReserved(); }
+};
+
+/// \brief An instruction descriptor.
+struct InstrDesc {
+  std::vector<WriteDescriptor> Writes; // Implicit writes are at the end.
+  std::vector<ReadDescriptor> Reads;   // Implicit reads are at the end.
+
+  // For every resource used by an instruction of this kind, this vector
+  // reports the number of "consumed cycles".
+  std::vector<std::pair<uint64_t, ResourceUsage>> Resources;
+
+  // A list of buffered resources consumed by this instruction.
+  std::vector<uint64_t> Buffers;
+  unsigned MaxLatency;
+  // Number of MicroOps for this instruction.
+  unsigned NumMicroOps;
+
+  bool MayLoad;
+  bool MayStore;
+  bool HasSideEffects;
+};
+
+/// An instruction dispatched to the out-of-order backend.
+///
+/// This class is used to monitor changes in the internal state of instructions
+/// that are dispatched by the DispatchUnit to the hardware schedulers.
+class Instruction {
+  const InstrDesc &Desc;
+
+  enum InstrStage {
+    IS_INVALID,   // Instruction in an invalid state.
+    IS_AVAILABLE, // Instruction dispatched but operands are not ready.
+    IS_READY,     // Instruction dispatched and operands ready.
+    IS_EXECUTING, // Instruction issued.
+    IS_EXECUTED,  // Instruction executed. Values are written back.
+    IS_RETIRED    // Instruction retired.
+  };
+
+  // The current instruction stage.
+  enum InstrStage Stage;
+
+  // This value defaults to the instruction latency. This instruction is
+  // considered executed when field CyclesLeft goes to zero.
+  int CyclesLeft;
+
+  // Retire Unit token ID for this instruction.
+  unsigned RCUTokenID;
+
+  using UniqueDef = std::unique_ptr<WriteState>;
+  using UniqueUse = std::unique_ptr<ReadState>;
+  using VecDefs = std::vector<UniqueDef>;
+  using VecUses = std::vector<UniqueUse>;
+
+  // Output dependencies.
+  // One entry for each implicit and explicit register definition.
+  VecDefs Defs;
+
+  // Input dependencies.
+  // One entry for each implicit and explicit register use.
+  VecUses Uses;
+
+  // This instruction has already been dispatched, and all operands are ready.
+  void setReady() {
+    assert(Stage == IS_AVAILABLE);
+    Stage = IS_READY;
+  }
+
+public:
+  Instruction(const InstrDesc &D)
+      : Desc(D), Stage(IS_INVALID), CyclesLeft(-1) {}
+  Instruction(const Instruction &Other) = delete;
+  Instruction &operator=(const Instruction &Other) = delete;
+
+  VecDefs &getDefs() { return Defs; }
+  const VecDefs &getDefs() const { return Defs; }
+  VecUses &getUses() { return Uses; }
+  const VecUses &getUses() const { return Uses; }
+  const InstrDesc &getDesc() const { return Desc; }
+
+  unsigned getRCUTokenID() const { return RCUTokenID; }
+  int getCyclesLeft() const { return CyclesLeft; }
+  void setCyclesLeft(int Cycles) { CyclesLeft = Cycles; }
+  void setRCUTokenID(unsigned TokenID) { RCUTokenID = TokenID; }
+
+  // Transition to the dispatch stage.
+  // No definition is updated because the instruction is not "executing".
+  void dispatch() {
+    assert(Stage == IS_INVALID);
+    Stage = IS_AVAILABLE;
+  }
+
+  // Instruction issued. Transition to the IS_EXECUTING state, and update
+  // all the definitions.
+  void execute();
+
+  void forceExecuted() {
+    assert((Stage == IS_INVALID && isZeroLatency()) ||
+           (Stage == IS_READY && Desc.MaxLatency == 0));
+    Stage = IS_EXECUTED;
+  }
+
+  // Checks if operands are available. If all operands are ready,
+  // then this forces a transition from IS_AVAILABLE to IS_READY.
+  bool isReady();
+
+  bool isDispatched() const { return Stage == IS_AVAILABLE; }
+  bool isExecuting() const { return Stage == IS_EXECUTING; }
+  bool isExecuted() const { return Stage == IS_EXECUTED; }
+  bool isZeroLatency() const;
+
+  void retire() {
+    assert(Stage == IS_EXECUTED);
+    Stage = IS_RETIRED;
+  }
+
+  void cycleEvent();
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/Instruction.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/Instruction.cpp
+++ llvm/trunk/tools/llvm-mca/Instruction.cpp
@@ -0,0 +1,134 @@
+//===--------------------- Instruction.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines abstractions used by the Backend to model register reads,
+// register writes and instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void ReadState::writeStartEvent(unsigned Cycles) {
+  assert(DependentWrites);
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+
+  // This read may be dependent on more than one write. This typically occurs
+  // when a definition is the result of multiple writes where at least one
+  // write does a partial register update.
+  // The HW is forced to do some extra bookkeeping to keep track of all the
+  // dependent writes, and implement a merging scheme for the partial writes.
+  --DependentWrites;
+  TotalCycles = std::max(TotalCycles, Cycles);
+
+  if (!DependentWrites)
+    CyclesLeft = TotalCycles;
+}
+
+void WriteState::onInstructionIssued() {
+  assert(CyclesLeft == UNKNOWN_CYCLES);
+  // Update the number of cycles left based on the WriteDescriptor info.
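+  // For example (hypothetical numbers): with Latency = 5, a user registered
+  // with ReadAdvance = 3 is notified that its operand becomes ready after
+  // max(0, 5 - 3) = 2 cycles.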
+  CyclesLeft = WD.Latency;
+
+  // Now that the time left before write-back is known, notify
+  // all the users.
+  for (const std::pair<ReadState *, int> &User : Users) {
+    ReadState *RS = User.first;
+    unsigned ReadCycles = std::max(0, CyclesLeft - User.second);
+    RS->writeStartEvent(ReadCycles);
+  }
+}
+
+void WriteState::addUser(ReadState *User, int ReadAdvance) {
+  // If CyclesLeft is not UNKNOWN_CYCLES, then we don't need to
+  // update the list of users. We can just notify the user with
+  // the actual number of cycles left (which may be zero).
+  if (CyclesLeft != UNKNOWN_CYCLES) {
+    unsigned ReadCycles = std::max(0, CyclesLeft - ReadAdvance);
+    User->writeStartEvent(ReadCycles);
+    return;
+  }
+
+  std::pair<ReadState *, int> NewPair(User, ReadAdvance);
+  Users.insert(NewPair);
+}
+
+void WriteState::cycleEvent() {
+  // Note: CyclesLeft can be a negative number. It is an error to
+  // make it an unsigned quantity because users of this write may
+  // specify a negative ReadAdvance.
+  if (CyclesLeft != UNKNOWN_CYCLES)
+    CyclesLeft--;
+}
+
+void ReadState::cycleEvent() {
+  // If CyclesLeft is unknown, then bail out immediately.
+  if (CyclesLeft == UNKNOWN_CYCLES)
+    return;
+
+  // If there are still dependent writes, or we reached cycle zero,
+  // then just exit.
+  if (DependentWrites || CyclesLeft == 0)
+    return;
+
+  CyclesLeft--;
+}
+
+#ifndef NDEBUG
+void WriteState::dump() const {
+  dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << WD.Latency << ", RegID "
+         << getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }\n";
+}
+#endif
+
+bool Instruction::isReady() {
+  if (Stage == IS_READY)
+    return true;
+
+  assert(Stage == IS_AVAILABLE);
+  for (const UniqueUse &Use : Uses)
+    if (!Use.get()->isReady())
+      return false;
+
+  setReady();
+  return true;
+}
+
+void Instruction::execute() {
+  assert(Stage == IS_READY);
+  Stage = IS_EXECUTING;
+  for (UniqueDef &Def : Defs)
+    Def->onInstructionIssued();
+}
+
+bool Instruction::isZeroLatency() const {
+  return Desc.MaxLatency == 0 && Defs.size() == 0 && Uses.size() == 0;
+}
+
+void Instruction::cycleEvent() {
+  if (isDispatched()) {
+    for (UniqueUse &Use : Uses)
+      Use->cycleEvent();
+    return;
+  }
+  if (isExecuting()) {
+    for (UniqueDef &Def : Defs)
+      Def->cycleEvent();
+    CyclesLeft--;
+  }
+  if (!CyclesLeft)
+    Stage = IS_EXECUTED;
}
+
+} // namespace mca
Index: llvm/trunk/tools/llvm-mca/LLVMBuild.txt
===================================================================
--- llvm/trunk/tools/llvm-mca/LLVMBuild.txt
+++ llvm/trunk/tools/llvm-mca/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./tools/llvm-mca/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-mca
+parent = Tools
+required_libraries = MC MCParser Support all-targets
Index: llvm/trunk/tools/llvm-mca/LSUnit.h
===================================================================
--- llvm/trunk/tools/llvm-mca/LSUnit.h
+++ llvm/trunk/tools/llvm-mca/LSUnit.h
@@ -0,0 +1,160 @@
+//===------------------------- LSUnit.h --------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load/Store unit class that models load/store queues and that implements
+/// a simple weak memory consistency model.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+/// \brief A Load/Store Unit implementing load and store queues.
+///
+/// This class implements a load queue and a store queue to emulate the
+/// out-of-order execution of memory operations.
+/// Each load (or store) consumes an entry in the load (or store) queue.
+///
+/// Rules are:
+/// 1) A younger load is allowed to pass an older load only if there are no
+///    stores nor barriers in between the two loads.
+/// 2) A younger store is not allowed to pass an older store.
+/// 3) A younger store is not allowed to pass an older load.
+/// 4) A younger load is allowed to pass an older store only if the load does
+///    not alias with the store.
+///
+/// This class optimistically assumes that loads don't alias store operations.
+/// Under this assumption, younger loads are always allowed to pass older
+/// stores (this only affects rule 4).
+/// Essentially, this LSUnit doesn't attempt to run any sort of alias analysis
+/// to predict when loads and stores don't alias with each other.
+///
+/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
+/// set to `false` by the constructor of LSUnit.
+///
+/// In the case of write-combining memory, rule 2 could be relaxed to allow
+/// reordering of non-aliasing store operations. At the moment, this is not
+/// allowed.
+/// Put another way, there is no option to specify a different memory
+/// type for memory operations (example: write-through, write-combining, etc.).
+/// Also, there is no way to weaken the memory model, and this unit currently
+/// doesn't support write-combining behavior.
+///
+/// No assumptions are made on the size of the store buffer.
+/// As mentioned before, this class doesn't perform alias analysis.
+/// Consequently, LSUnit doesn't know how to identify cases where
+/// store-to-load forwarding may occur.
+///
+/// LSUnit doesn't attempt to predict whether a load or store hits or misses
+/// the L1 cache. To be more specific, LSUnit doesn't know anything about
+/// the cache hierarchy and memory types.
+/// It only knows if an instruction "mayLoad" and/or "mayStore".
+/// For loads, the scheduling model provides an "optimistic" load-to-use
+/// latency (which usually matches the load-to-use latency for when there is
+/// a hit in the L1D).
+///
+/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
+/// memory-barrier like instructions.
+/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
+/// `unmodeled side effects` behaves like a "soft" load-barrier. That means, it
+/// serializes loads without forcing a flush of the load queue.
+/// Similarly, instructions that both `mayStore` and have `unmodeled side
+/// effects` are treated like store barriers. A full memory
+/// barrier is a `mayLoad` and `mayStore` instruction with unmodeled side
+/// effects. This is obviously inaccurate, but this is the best that we can do
+/// at the moment.
+///
+/// Each load/store barrier consumes one entry in the load/store queue. A
+/// load/store barrier enforces ordering of loads/stores:
+///  - A younger load cannot pass a load barrier.
+///  - A younger store cannot pass a store barrier.
+///
+/// A younger load has to wait for the memory load barrier to execute.
+/// A load/store barrier is "executed" when it becomes the oldest entry in
+/// the load/store queue(s). That also means, all the older loads/stores have
+/// already been executed.
+class LSUnit {
+  // Load queue size.
+  // LQ_Size == 0 means that there are infinite slots in the load queue.
+  unsigned LQ_Size;
+
+  // Store queue size.
+  // SQ_Size == 0 means that there are infinite slots in the store queue.
+  unsigned SQ_Size;
+
+  // If true, loads will never alias with stores. This is the default.
+  bool NoAlias;
+
+  std::set<unsigned> LoadQueue;
+  std::set<unsigned> StoreQueue;
+
+  void assignLQSlot(unsigned Index);
+  void assignSQSlot(unsigned Index);
+  bool isReadyNoAlias(unsigned Index) const;
+
+  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
+  // conservatively treated as a store barrier. It forces older stores to be
+  // executed before newer stores are issued.
+  std::set<unsigned> StoreBarriers;
+
+  // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
+  // conservatively treated as a load barrier. It forces older loads to execute
+  // before newer loads are issued.
+  std::set<unsigned> LoadBarriers;
+
+public:
+  LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
+      : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+
+  bool isSQEmpty() const { return StoreQueue.empty(); }
+  bool isLQEmpty() const { return LoadQueue.empty(); }
+  bool isSQFull() const { return SQ_Size != 0 && StoreQueue.size() == SQ_Size; }
+  bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }
+
+  void reserve(unsigned Index, bool MayLoad, bool MayStore, bool IsMemBarrier) {
+    if (!MayLoad && !MayStore)
+      return;
+    if (MayLoad) {
+      if (IsMemBarrier)
+        LoadBarriers.insert(Index);
+      assignLQSlot(Index);
+    }
+    if (MayStore) {
+      if (IsMemBarrier)
+        StoreBarriers.insert(Index);
+      assignSQSlot(Index);
+    }
+  }
+
+  // The rules are:
+  // 1. A store may not pass a previous store.
+  // 2. A load may not pass a previous store unless flag 'NoAlias' is set.
+  // 3. A load may pass a previous load.
+  // 4. A store may not pass a previous load (regardless of flag 'NoAlias').
+  // 5. A load has to wait until an older load barrier is fully executed.
+  // 6. A store has to wait until an older store barrier is fully executed.
+  bool isReady(unsigned Index) const;
+  void onInstructionExecuted(unsigned Index);
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/LSUnit.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/LSUnit.cpp
+++ llvm/trunk/tools/llvm-mca/LSUnit.cpp
@@ -0,0 +1,115 @@
+//===----------------------- LSUnit.cpp --------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// A Load-Store Unit for the llvm-mca tool.
+///
+//===----------------------------------------------------------------------===//
+
+#include "LSUnit.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "llvm-mca"
+
+namespace mca {
+
+#ifndef NDEBUG
+void LSUnit::dump() const {
+  dbgs() << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
+  dbgs() << "[LSUnit] SQ_Size = " << SQ_Size << '\n';
+  dbgs() << "[LSUnit] NextLQSlotIdx = " << LoadQueue.size() << '\n';
+  dbgs() << "[LSUnit] NextSQSlotIdx = " << StoreQueue.size() << '\n';
+}
+#endif
+
+void LSUnit::assignLQSlot(unsigned Index) {
+  assert(!isLQFull());
+  assert(LoadQueue.count(Index) == 0);
+
+  DEBUG(dbgs() << "[LSUnit] - AssignLQSlot \n");
+  LoadQueue.insert(Index);
+}
+
+void LSUnit::assignSQSlot(unsigned Index) {
+  assert(!isSQFull());
+  assert(StoreQueue.count(Index) == 0);
+
+  DEBUG(dbgs() << "[LSUnit] - AssignSQSlot \n");
+  StoreQueue.insert(Index);
+}
+
+bool LSUnit::isReady(unsigned Index) const {
+  bool IsALoad = LoadQueue.count(Index) != 0;
+  bool IsAStore = StoreQueue.count(Index) != 0;
+  unsigned LoadBarrierIndex = LoadBarriers.empty() ? 0 : *LoadBarriers.begin();
+  unsigned StoreBarrierIndex =
+      StoreBarriers.empty() ? 0 : *StoreBarriers.begin();
+
+  if (IsALoad && LoadBarrierIndex) {
+    if (Index > LoadBarrierIndex)
+      return false;
+    if (Index == LoadBarrierIndex && Index != *LoadQueue.begin())
+      return false;
+  }
+
+  if (IsAStore && StoreBarrierIndex) {
+    if (Index > StoreBarrierIndex)
+      return false;
+    if (Index == StoreBarrierIndex && Index != *StoreQueue.begin())
+      return false;
+  }
+
+  if (NoAlias && IsALoad)
+    return true;
+
+  if (StoreQueue.size()) {
+    // Check if this memory operation is younger than the oldest store.
+    if (Index > *StoreQueue.begin())
+      return false;
+  }
+
+  // Okay, we are older than the oldest store in the queue.
+  // If there are no pending loads, then we can say for sure that this
+  // instruction is ready.
+  if (isLQEmpty())
+    return true;
+
+  // Check if there are no older loads.
+  if (Index <= *LoadQueue.begin())
+    return true;
+
+  // There is at least one older load.
+  return !IsAStore;
+}
+
+void LSUnit::onInstructionExecuted(unsigned Index) {
+  std::set<unsigned>::iterator it = LoadQueue.find(Index);
+  if (it != LoadQueue.end()) {
+    DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                 << " has been removed from the load queue.\n");
+    LoadQueue.erase(it);
+  }
+
+  it = StoreQueue.find(Index);
+  if (it != StoreQueue.end()) {
+    DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
+                 << " has been removed from the store queue.\n");
+    StoreQueue.erase(it);
+  }
+
+  if (!StoreBarriers.empty() && Index == *StoreBarriers.begin())
+    StoreBarriers.erase(StoreBarriers.begin());
+  if (!LoadBarriers.empty() && Index == *LoadBarriers.begin())
+    LoadBarriers.erase(LoadBarriers.begin());
+}
+} // namespace mca
Index: llvm/trunk/tools/llvm-mca/README.txt
===================================================================
--- llvm/trunk/tools/llvm-mca/README.txt
+++ llvm/trunk/tools/llvm-mca/README.txt
@@ -0,0 +1,884 @@
+llvm-mca - LLVM Machine Code Analyzer
+-------------------------------------
+
+llvm-mca is a performance analysis tool that uses information which is already
+available in LLVM (e.g. scheduling models) to statically measure the performance
+of machine code in a specific CPU.
+
+Performance is measured in terms of throughput as well as processor resource
+consumption. The tool currently works for processors with an out-of-order
+backend, for which there is a scheduling model available in LLVM.
+
+The main goal of this tool is not just to predict the performance of the code
+when run on the target, but also help with diagnosing potential performance
+issues.
+
+Given an assembly code sequence, llvm-mca estimates the IPC (Instructions Per
+Cycle), as well as hardware resource pressure. The analysis and reporting style
+were inspired by the IACA tool from Intel.
+
+The presence of long data dependency chains, as well as poor usage of hardware
+resources, may lead to bottlenecks in the back-end. The tool is able to generate
+a detailed report which should help with identifying and analyzing sources of
+bottlenecks.
+
+Scheduling models are mostly used to compute instruction latencies, to obtain
+read-advance information, and to understand how processor resources are used by
+instructions. By design, the quality of the performance analysis conducted by
+the tool is inevitably affected by the quality of the target scheduling models.
+
+However, scheduling models intentionally do not describe all processor details,
+since the goal is just to enable the scheduling of machine instructions during
+compilation. That means there are processor details which are not important for
+the purpose of scheduling instructions (and therefore not described by the
+scheduling model), but are very important for this tool.
+
+A few examples of details that are missing in scheduling models are:
+ - Maximum number of instructions retired per cycle.
+ - Actual dispatch width (it often differs from the issue width).
+ - Number of temporary registers available for renaming.
+ - Number of read/write ports in the register file(s).
+ - Length of the load/store queue in the LSUnit.
+
+It is also very difficult to find a "good" abstract model to describe the
+behavior of out-of-order processors. So, we have to keep in mind that all of
+these aspects are going to affect the quality of the static analysis performed
+by the tool.
+
+An extensive list of known limitations is reported in one of the last sections
+of this document.
+There is also a section related to design problems which must be addressed
+(hopefully with the help of the community). At the moment, the tool has been
+mostly tested for x86 targets, but there are still several limitations, some
+of which could be overcome by integrating extra information into the
+scheduling models.
+
+How the tool works
+------------------
+
+The tool takes assembly code as input. Assembly code is parsed into a sequence
+of MCInst with the help of the existing LLVM target assembly parsers. The
+parsed sequence of MCInst is then analyzed by a 'Backend' module to generate a
+performance report.
+
+The Backend module internally emulates the execution of the machine code
+sequence in a loop of iterations (which by default is 70). At the end of this
+process, the backend collects a number of statistics which are then printed
+out in the form of a report.
+
+Here is an example of performance report generated by the tool for a
+dot-product of two packed float vectors of four elements. The analysis is
+conducted for target x86, CPU btver2:
+
+///////////////////
+
+Iterations:     300
+Instructions:   900
+Total Cycles:   610
+Dispatch Width: 2
+IPC:            1.48
+
+
+Resources:
+[0] - JALU0
+[1] - JALU1
+[2] - JDiv
+[3] - JFPM
+[4] - JFPU0
+[5] - JFPU1
+[6] - JLAGU
+[7] - JSAGU
+[8] - JSTC
+[9] - JVIMUL
+
+
+Resource pressure per iteration:
+[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+ -      -      -      -     2.00   1.00    -      -      -      -
+
+Resource pressure by instruction:
+[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+ -      -      -      -      -     1.00    -      -      -      -     vmulps %xmm0, %xmm1, %xmm2
+ -      -      -      -     1.00    -      -      -      -      -     vhaddps %xmm2, %xmm2, %xmm3
+ -      -      -      -     1.00    -      -      -      -      -     vhaddps %xmm3, %xmm3, %xmm4
+
+
+Instruction Info:
+[1]: #uOps
+[2]: Latency
+[3]: RThroughput
+[4]: MayLoad
+[5]: MayStore
+[6]: HasSideEffects
+
+[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+ 1      2     1.00                        vmulps %xmm0, %xmm1, %xmm2
+ 1      3     1.00                        vhaddps %xmm2, %xmm2, %xmm3
+ 1      3     1.00                        vhaddps %xmm3, %xmm3, %xmm4
+
+///////////////////
+
+According to this report, the dot-product kernel has been executed 300 times,
+for a total of 900 instructions dynamically executed.
+
+The report is structured in three main sections. A first section collects a
+few performance numbers; the goal of this section is to give a very quick
+overview of the performance throughput. In this example, the two important
+performance indicators are a) the predicted total number of cycles, and b) the
+IPC. IPC is probably the most important throughput indicator. A big delta
+between the Dispatch Width and the computed IPC is an indicator of potential
+performance issues.
+
+The second section is the so-called "resource pressure view". This view
+reports the average number of resource cycles consumed every iteration by
+instructions, for every processor resource unit available on the target.
+Information is structured in two tables. The first table reports the number of
+resource cycles spent on average every iteration. The second table correlates
+the resource cycles to the machine instructions in the sequence. For example,
+on every iteration of the dot-product, instruction 'vmulps' always executes on
+resource unit [5] (JFPU1 - floating point pipeline #1), consuming an average
+of 1 resource cycle per iteration. Note that on Jaguar, vector FP multiply can
+only be issued to pipeline JFPU1, while horizontal FP adds can only be issued
+to pipeline JFPU0.
+
+The third (and last) section of the report shows the latency and reciprocal
+throughput of every instruction in the sequence.
+That section also reports extra information related to the number of micro
+opcodes, and opcode properties (i.e. 'MayLoad', 'MayStore' and
+'UnmodeledSideEffects').
+
+The resource pressure view helps with identifying bottlenecks caused by high
+usage of specific hardware resources. Situations with resource pressure mainly
+concentrated on a few resources should, in general, be avoided. Ideally,
+pressure should be uniformly distributed between multiple resources.
+
+Timeline View
+-------------
+
+A detailed report of each instruction's state transitions over time can be
+enabled using the command line flag '-timeline'. This prints an extra section
+in the report which contains the so-called "timeline view". Below is the
+timeline view for the dot-product example from the previous section.
+
+///////////////
+Timeline view:
+                    012345
+Index     0123456789
+
+[0,0]     DeeER.    .    .   vmulps %xmm0, %xmm1, %xmm2
+[0,1]     D==eeeER  .    .   vhaddps %xmm2, %xmm2, %xmm3
+[0,2]     .D====eeeER    .   vhaddps %xmm3, %xmm3, %xmm4
+
+[1,0]     .DeeE-----R    .   vmulps %xmm0, %xmm1, %xmm2
+[1,1]     . D=eeeE---R   .   vhaddps %xmm2, %xmm2, %xmm3
+[1,2]     . D====eeeER   .   vhaddps %xmm3, %xmm3, %xmm4
+
+[2,0]     .  DeeE-----R  .   vmulps %xmm0, %xmm1, %xmm2
+[2,1]     .  D====eeeER  .   vhaddps %xmm2, %xmm2, %xmm3
+[2,2]     .   D======eeeER   vhaddps %xmm3, %xmm3, %xmm4
+
+
+Average Wait times (based on the timeline view):
+[0]: Executions
+[1]: Average time spent waiting in a scheduler's queue
+[2]: Average time spent waiting in a scheduler's queue while ready
+[3]: Average time elapsed from WB until retire stage
+
+      [0]    [1]    [2]    [3]
+0.     3     1.0    1.0    3.3   vmulps %xmm0, %xmm1, %xmm2
+1.     3     3.3    0.7    1.0   vhaddps %xmm2, %xmm2, %xmm3
+2.     3     5.7    0.0    0.0   vhaddps %xmm3, %xmm3, %xmm4
+///////////////
+
+The timeline view is very interesting because it shows how instructions change
+state during execution. It also gives an idea of how the tool "sees"
+instructions executed on the target.
+
+The timeline view is structured in two tables. The first table shows how
+instructions change state over time (measured in cycles); the second table
+(named "Average Wait times") reports useful timing statistics which should
+help diagnose performance bottlenecks caused by long data dependencies and
+sub-optimal usage of hardware resources.
+
+An instruction in the timeline view is identified by a pair of indices, where
+the 'first' index identifies an iteration, and the 'second' index is the
+actual instruction index (i.e. where it appears in the code sequence).
+
+Excluding the first and last column, the remaining columns are in cycles.
+Cycles are numbered sequentially starting from 0. The following characters are
+used to describe the state of an instruction:
+
+ D : Instruction dispatched.
+ e : Instruction executing.
+ E : Instruction executed.
+ R : Instruction retired.
+ = : Instruction already dispatched, waiting to be executed.
+ - : Instruction executed, waiting to be retired.
+
+Based on the timeline view from the example, we know that:
+ - Instruction [1, 0] was dispatched at cycle 1.
+ - Instruction [1, 0] started executing at cycle 2.
+ - Instruction [1, 0] reached the write back stage at cycle 4.
+ - Instruction [1, 0] was retired at cycle 10.
+
+Instruction [1, 0] (i.e. the vmulps from iteration #1) doesn't have to wait in
+the scheduler's queue for the operands to become available. By the time the
+vmulps is dispatched, operands are already available, and pipeline JFPU1 is
+ready to serve another instruction. So the instruction can be immediately
+issued on the JFPU1 pipeline.
+That is demonstrated by the fact that the instruction only spent 1cy in the
+scheduler's queue.
+
+There is a gap of 5 cycles between the write-back stage and the retire event.
+That is because instructions must retire in program order, so [1,0] has to
+wait for [0,2] to be retired first (i.e. it has to wait until cycle 10).
+
+In the dot-product example, all instructions are in a RAW (Read After Write)
+dependency chain. Register %xmm2 written by the vmulps is immediately used by
+the first vhaddps, and register %xmm3 written by the first vhaddps is used by
+the second vhaddps. Long data dependencies negatively affect the ILP
+(Instruction Level Parallelism).
+
+In the dot-product example, there are anti-dependencies introduced by
+instructions from different iterations. However, those dependencies can be
+removed at the register renaming stage (at the cost of allocating register
+aliases, and therefore consuming temporary registers).
+
+Table "Average Wait times" helps diagnose performance issues that are caused
+by the presence of long latency instructions and potentially long data
+dependencies which may limit the ILP. Note that the tool by default assumes at
+least 1cy between the dispatch event and the issue event.
+
+When the performance is limited by data dependencies and/or long latency
+instructions, the number of cycles spent while in the "ready" state is
+expected to be very small when compared with the total number of cycles spent
+in the scheduler's queue. So the difference between the two counters is a good
+indicator of how big of an impact data dependencies had on the execution of
+instructions. When performance is mostly limited by the lack of hardware
+resources, the delta between the two counters is small. However, the number of
+cycles spent in the queue tends to be bigger (i.e. more than 1-3cy) especially
+when compared with other low latency instructions.
+
+Extra statistics to further diagnose performance issues
+-------------------------------------------------------
+
+Flag '-verbose' enables extra statistics and performance counters for the
+dispatch logic, the reorder buffer, the retire control unit and the register
+file.
+
+Below is an example of verbose output generated by the tool for the
+dot-product example discussed in the previous sections.
+
+///////////////////
+Iterations:     300
+Instructions:   900
+Total Cycles:   610
+Dispatch Width: 2
+IPC:            1.48
+
+
+Dynamic Dispatch Stall Cycles:
+RAT     - Register unavailable:                      0
+RCU     - Retire tokens unavailable:                 0
+SCHEDQ  - Scheduler full:                            272
+LQ      - Load queue full:                           0
+SQ      - Store queue full:                          0
+GROUP   - Static restrictions on the dispatch group: 0
+
+
+Register Alias Table:
+Total number of mappings created: 900
+Max number of mappings used:      35
+
+
+Dispatch Logic - number of cycles where we saw N instructions dispatched:
+[# dispatched], [# cycles]
+ 0,              24  (3.9%)
+ 1,              272  (44.6%)
+ 2,              314  (51.5%)
+
+
+Schedulers - number of cycles where we saw N instructions issued:
+[# issued], [# cycles]
+ 0,          7  (1.1%)
+ 1,          306  (50.2%)
+ 2,          297  (48.7%)
+
+
+Retire Control Unit - number of cycles where we saw N instructions retired:
+[# retired], [# cycles]
+ 0,           109  (17.9%)
+ 1,           102  (16.7%)
+ 2,           399  (65.4%)
+
+
+Scheduler's queue usage:
+JALU01,  0/20
+JFPU01,  18/18
+JLSAGU,  0/12
+///////////////////
+
+Based on the verbose report, the backend was only able to dispatch two
+instructions 51.5% of the time. The dispatch group was limited to one
+instruction 44.6% of the cycles, which corresponds to 272 cycles.
+
+If we look at section "Dynamic Dispatch Stall Cycles", we can see how counter
+SCHEDQ reports 272 cycles. Counter SCHEDQ is incremented every time the
+dispatch logic is unable to dispatch a full group of two instructions because
+the scheduler's queue is full.
+
+Section "Scheduler's queue usage" shows that the number of buffer entries
+(i.e. scheduler's queue entries) used at runtime for resource JFPU01 reached
+its maximum. Note that AMD Jaguar implements three schedulers:
+ * JALU01 - A scheduler for ALU instructions.
+ * JLSAGU - A scheduler for address generation.
+ * JFPU01 - A scheduler for floating point operations.
+
+The dot-product is a kernel of three floating point instructions (a vector
+multiply followed by two horizontal adds). That explains why only the floating
+point scheduler appears to be used according to section "Scheduler's queue
+usage".
+
+A full scheduler's queue is either caused by data dependency chains, or by a
+sub-optimal usage of hardware resources. Sometimes, resource pressure can be
+mitigated by rewriting the kernel using different instructions that consume
+different scheduler resources. Schedulers with a small queue are less
+resilient to bottlenecks caused by the presence of long data dependencies.
+
+In this example, we can conclude that the IPC is mostly limited by data
+dependencies, and not by resource pressure.
+
+LLVM-MCA instruction flow
+-------------------------
+
+This section describes the instruction flow through the out-of-order backend,
+as well as the functional units involved in the process.
+
+An instruction goes through a default sequence of stages:
+ - Dispatch (Instruction is dispatched to the schedulers).
+ - Issue (Instruction is issued to the processor pipelines).
+ - Write Back (Instruction is executed, and results are written back).
+ - Retire (Instruction is retired; writes are architecturally committed).
+
+The tool only models the out-of-order portion of a processor. Therefore, the
+instruction fetch and decode stages are not modeled. Performance bottlenecks
+in the frontend are not diagnosed by this tool. The tool assumes that
+instructions have all been decoded and placed in a queue. Also, the tool
+doesn't know anything about branch prediction.
+
+The long term plan is to make this process customizable, so that processors
+can define their own sequence of stages. This is future work.
+
+Instruction Dispatch
+--------------------
+
+During the Dispatch stage, instructions are picked in program order from a
+queue of already decoded instructions, and dispatched in groups to the
+hardware schedulers. The dispatch logic is implemented by class DispatchUnit
+in file Dispatch.h.
+
+The size of a dispatch group depends on the availability of hardware
+resources, and it cannot exceed the value of field 'DispatchWidth' in class
+DispatchUnit. Note that field DispatchWidth defaults to the value of field
+'IssueWidth' from the scheduling model.
+
+Users can override the DispatchWidth value with flag "-dispatch=N" (where 'N'
+is an unsigned quantity).
+
+An instruction can be dispatched if:
+ - The size of the dispatch group is smaller than DispatchWidth.
+ - There are enough entries in the reorder buffer.
+ - There are enough temporary registers to do register renaming.
+ - Schedulers are not full.
+
+Scheduling models don't describe register files, and therefore the tool
+doesn't know if there is more than one register file, and how many temporaries
+are available for register renaming.
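+Both the dispatch width and the number of temporaries can be explored from
+the command line. As an example, the following hypothetical invocation (the
+input file name is just a placeholder) analyzes a kernel with a dispatch
+width of 4, and with 32 temporary registers available for renaming:
+
+  $ llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=4 -register-file-size=32 dot-product.s
+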
+
+By default, the tool (optimistically) assumes a single register file with an
+unbounded number of temporary registers. Users can limit the number of
+temporary registers available for register renaming using flag
+`-register-file-size=N`, where N is the number of temporaries. A value of
+zero for N means 'unbounded'. Knowing how many temporaries are available for
+register renaming, the tool can predict dispatch stalls caused by the lack of
+temporaries.
+
+The number of reorder buffer entries consumed by an instruction depends on the
+number of micro-opcodes it specifies in the target scheduling model (see field
+'NumMicroOpcodes' of tablegen class ProcWriteResources and its derived
+classes; TargetSchedule.td).
+
+The reorder buffer is implemented by class RetireControlUnit (see Dispatch.h).
+Its goal is to track the progress of instructions that are "in-flight", and
+retire instructions in program order. The number of entries in the reorder
+buffer defaults to the value of field 'MicroOpBufferSize' from the target
+scheduling model.
+
+Instructions that are dispatched to the schedulers consume scheduler buffer
+entries. The tool queries the scheduling model to figure out the set of
+buffered resources consumed by an instruction. Buffered resources are treated
+like "scheduler" resources, and the field 'BufferSize' (from the processor
+resource tablegen definition) defines the size of the scheduler's queue.
+
+Zero latency instructions (for example NOP instructions) don't consume
+scheduler resources. However, those instructions still reserve a number of
+slots in the reorder buffer.
+
+Instruction Issue
+-----------------
+
+As mentioned in the previous section, each scheduler resource implements a
+queue of instructions. An instruction has to wait in the scheduler's queue
+until input register operands become available. Only at that point does the
+instruction become eligible for execution and may be issued (potentially
+out-of-order) to a pipeline for execution.
+
+Instruction latencies can be computed by the tool with the help of the
+scheduling model; latency values are defined by the scheduling model through
+ProcWriteResources objects.
+
+Class Scheduler (see file Scheduler.h) knows how to emulate multiple processor
+schedulers. A Scheduler is responsible for tracking data dependencies, and for
+dynamically selecting which processor resources are consumed/used by
+instructions.
+
+Internally, the Scheduler class delegates the management of processor resource
+units and resource groups to the ResourceManager class. ResourceManager is
+also responsible for selecting resource units that are effectively consumed
+by instructions. For example, if an instruction consumes 1cy of a resource
+group, the ResourceManager object selects one of the available units from the
+group; by default, it uses a round-robin selector to guarantee that resource
+usage is uniformly distributed between all units of a group.
+
+Internally, class Scheduler implements three instruction queues:
+ - WaitQueue: a queue of instructions whose operands are not ready yet.
+ - ReadyQueue: a queue of instructions ready to execute.
+ - IssuedQueue: a queue of instructions executing.
+
+Depending on operand availability, instructions that are dispatched to the
+Scheduler are either placed into the WaitQueue or into the ReadyQueue.
+
+Every cycle, class Scheduler checks if instructions can be moved from the
+WaitQueue to the ReadyQueue, and if instructions from the ReadyQueue can be
+issued to the underlying pipelines.
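+What follows is a minimal, self-contained C++ sketch of that per-cycle queue
+movement. All of the type and member names below are invented for
+illustration; they intentionally do not match the real Instruction and
+Scheduler classes, and the sketch ignores pipeline/resource availability:
+
+  #include <vector>
+
+  // Illustration only: a toy stand-in for the real Instruction class.
+  struct ToyInstr {
+    int WaitCycles; // cycles until all input operands become available
+  };
+
+  struct ToyScheduler {
+    std::vector<ToyInstr *> WaitQueue, ReadyQueue, IssuedQueue;
+
+    // Called once per simulated cycle.
+    void cycleEvent() {
+      // Move instructions whose operands became available to the ReadyQueue.
+      std::vector<ToyInstr *> StillWaiting;
+      for (ToyInstr *I : WaitQueue) {
+        if (--I->WaitCycles <= 0)
+          ReadyQueue.push_back(I);
+        else
+          StillWaiting.push_back(I);
+      }
+      WaitQueue = StillWaiting;
+
+      // Issue ready instructions, oldest first (queue order). The real
+      // Scheduler also checks that a pipeline can actually accept them.
+      for (ToyInstr *I : ReadyQueue)
+        IssuedQueue.push_back(I);
+      ReadyQueue.clear();
+    }
+  };
+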
+The algorithm prioritizes older instructions over younger instructions.
+
+Objects of class ResourceState (see Scheduler.h) describe processor resources.
+There is an instance of class ResourceState for each single processor resource
+specified by the scheduling model. A ResourceState object for a processor
+resource with multiple units dynamically tracks the availability of every
+single unit. For example, the ResourceState of a resource group tracks the
+availability of every resource in that group. Internally, ResourceState
+implements a round-robin selector to dynamically pick the next unit to use
+from the group.
+
+Write-Back and Retire Stage
+---------------------------
+
+Issued instructions are moved from the ReadyQueue to the IssuedQueue. There,
+instructions wait until they reach the write-back stage. At that point, they
+get removed from the queue and the retire control unit is notified.
+
+On the event of "instruction executed", the retire control unit flags the
+instruction as "ready to retire".
+
+Instructions are retired in program order; an "instruction retired" event is
+sent to the register file which frees the temporary registers allocated for
+the instruction at the register renaming stage.
+
+Load/Store Unit and Memory Consistency Model
+--------------------------------------------
+
+The tool attempts to emulate out-of-order execution of memory operations.
+Class LSUnit (see file LSUnit.h) emulates a load/store unit implementing
+queues for speculative execution of loads and stores.
+
+Each load (or store) consumes an entry in the load (or store) queue. The
+number of slots in the load/store queues is not known to the tool, since there
+is no mention of it in the scheduling model. In practice, users can specify
+flag `-lqueue=N` (vic. `-squeue=N`) to limit the number of entries in the
+queue to be equal to exactly N (an unsigned value). If N is zero, then the
+tool assumes an unbounded queue (this is the default).
+
+LSUnit implements a relaxed consistency model for memory loads and stores. The
+rules are:
+1) A younger load is allowed to pass an older load only if there is no
+   intervening store in between the two loads.
+2) A younger store is not allowed to pass an older store.
+3) A younger store is not allowed to pass an older load.
+4) A younger load is allowed to pass an older store provided that the load
+   does not alias with the store.
+
+By default, this class conservatively (i.e. pessimistically) assumes that
+loads always may-alias store operations. Essentially, this LSUnit doesn't
+perform any sort of alias analysis to rule out cases where loads and stores
+don't overlap with each other. The downside of this approach however is that
+younger loads are never allowed to pass older stores. To make it possible for
+a younger load to pass an older store, users can use the command line flag
+-noalias. Under 'noalias', a younger load is always allowed to pass an older
+store.
+
+Note that, in the case of write-combining memory, rule 2. could be relaxed a
+bit to allow reordering of non-aliasing store operations. That being said, at
+the moment, there is no way to further relax the memory model (flag -noalias
+is the only option). Essentially, there is no option to specify a different
+memory type (for example: write-back, write-combining, write-through, etc.)
+and consequently to weaken or strengthen the memory model.
+
+Other limitations are:
+ * LSUnit doesn't know when store-to-load forwarding may occur.
+ * LSUnit doesn't know anything about the cache hierarchy and memory types.
+ * LSUnit doesn't know how to identify serializing operations and memory
+   fences.
+
+No assumption is made on the store buffer size. As mentioned before, LSUnit
+conservatively assumes a may-alias relation between loads and stores, and it
+doesn't attempt to identify cases where store-to-load forwarding would occur
+in practice.
+
+LSUnit doesn't attempt to predict whether a load or store hits or misses the
+L1 cache. It only knows if an instruction "MayLoad" and/or "MayStore". For
+loads, the scheduling model provides an "optimistic" load-to-use latency
+(which usually matches the load-to-use latency for when there is a hit in the
+L1D).
+
+Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
+memory-barrier like instructions. LSUnit conservatively assumes that an
+instruction which has both 'MayLoad' and 'UnmodeledSideEffects' behaves like a
+"soft" load-barrier. That means, it serializes loads without forcing a flush
+of the load queue. Similarly, instructions flagged with both 'MayStore' and
+'UnmodeledSideEffects' are treated like store barriers. A full memory barrier
+is a 'MayLoad' and 'MayStore' instruction with 'UnmodeledSideEffects'. This is
+inaccurate, but it is the best that we can do at the moment with the current
+information available in LLVM.
+
+A load/store barrier consumes one entry of the load/store queue. A load/store
+barrier enforces ordering of loads/stores. A younger load cannot pass a load
+barrier. Also, a younger store cannot pass a store barrier. A younger load has
+to wait for the memory/load barrier to execute. A load/store barrier is
+"executed" when it becomes the oldest entry in the load/store queue(s). That
+also means, by construction, all the older loads/stores have been executed.
+
+In conclusion, the full set of rules is:
+  1. A store may not pass a previous store.
+  2. A load may not pass a previous store unless flag 'NoAlias' is set.
+  3. A load may pass a previous load.
+  4. A store may not pass a previous load (regardless of flag 'NoAlias').
+  5. A load has to wait until an older load barrier is fully executed.
+  6. A store has to wait until an older store barrier is fully executed.
+
+Known limitations
+-----------------
+Previous sections described cases where the tool is missing information to
+give an accurate report. For example, the first sections of this document
+explained how the lack of knowledge about the processor negatively affects the
+performance analysis. The lack of knowledge is often a consequence of how
+scheduling models are defined; as mentioned before, scheduling models
+intentionally don't describe processors in fine detail. That being said, the
+LLVM machine model can be extended to expose more details, as long as they are
+opt-in for targets.
+
+The accuracy of the performance analysis is also affected by assumptions made
+by the processor model used by the tool.
+
+Most recent Intel and AMD processors implement dedicated LoopBuffer/OpCache in
+the hardware frontend to speed up the throughput in the presence of tight
+loops. The presence of these buffers complicates the decoding logic, and
+requires knowledge of the branch predictor too. Class 'SchedMachineModel' in
+tablegen provides a field named 'LoopMicroOpBufferSize' which is used to
+describe loop buffers. However, the purpose of that field is to enable loop
+unrolling of tight loops; essentially, it affects the cost model used by pass
+loop-unroll.
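+Note that the loop buffer information is at least reachable from the machine
+model that this tool already consumes. The following is a minimal sketch of
+how a tool could query it through llvm::MCSubtargetInfo (llvm-mca does not
+currently do this):
+
+  #include "llvm/MC/MCSchedule.h"
+  #include "llvm/MC/MCSubtargetInfo.h"
+
+  // Sketch only: where the loop buffer size lives in the scheduling model.
+  static unsigned getLoopBufferSize(const llvm::MCSubtargetInfo &STI) {
+    const llvm::MCSchedModel &SM = STI.getSchedModel();
+    return SM.LoopMicroOpBufferSize; // 0 when the processor doesn't set it
+  }
+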
+
+In its current state, the tool only describes the out-of-order portion of a
+processor, and consequently doesn't try to predict the frontend throughput.
+That being said, this tool could definitely be extended in the future to also
+account for the hardware frontend when doing performance analysis. This would
+inevitably require extra (extensive) processor knowledge related to all the
+available decoding paths in the hardware frontend, as well as branch
+prediction.
+
+Currently, the tool assumes a zero-latency "perfect" fetch&decode
+stage; the full sequence of decoded instructions is immediately visible to the
+dispatch logic from the start.
+
+The tool doesn't know about simultaneous multithreading. According to the
+tool, processor resources are not statically/dynamically partitioned.
+Processor resources are fully available to the hardware thread executing the
+microbenchmark.
+
+The execution model implemented by this tool assumes that instructions are
+firstly dispatched in groups to hardware schedulers, and then issued to
+pipelines for execution. The model assumes dynamic scheduling of instructions.
+Instructions are placed in a queue and potentially executed out-of-order
+(based on operand availability). The dispatch stage is always distinct from
+the issue stage. This will change in the future; as mentioned in the first
+section, the end goal is to let processors customize the process.
+
+This model doesn't correctly describe processors where the dispatch/issue is a
+single stage. This is what happens for example in VLIW processors, where
+instructions are packaged and statically scheduled at compile time; it is up
+to the compiler to predict the latency of instructions and package issue
+groups accordingly. For such targets, there is no dynamic scheduling done by
+the hardware.
+
+Existing classes (DispatchUnit, Scheduler, etc.) could be extended/adapted to
+support processors with a single dispatch/issue stage. The execution flow
+would require some changes in the way existing components (i.e. DispatchUnit,
+Scheduler, etc.) interact. This can be a future development.
+
+The following sections describe other known limitations. The goal is not to
+provide an extensive list of limitations; we want to report what we believe
+are the most important limitations, and suggest possible methods to overcome
+them.
+
+Load/Store barrier instructions and serializing operations
+----------------------------------------------------------
+Section "Load/Store Unit and Memory Consistency Model" already mentioned how
+LLVM doesn't know about serializing operations and memory barriers. Most of it
+boils down to the fact that class MCInstrDesc (intentionally) doesn't expose
+those properties. Instead, both serializing operations and memory barriers
+"have side-effects" according to MCInstrDesc. That is because, at least for
+scheduling purposes, knowing that an instruction has unmodeled side effects is
+often enough to treat the instruction like a compiler scheduling barrier.
+
+A performance analysis tool could use the extra knowledge on barriers and
+serializing operations to generate a more accurate performance report. One way
+to improve this is by reserving a couple of bits in field 'Flags' from class
+MCInstrDesc: one bit for barrier operations, and another bit to mark
+instructions as serializing operations.
+
+Lack of support for instruction itineraries
+-------------------------------------------
+The current version of the tool doesn't know how to process instruction
+itineraries.
+This is probably one of the most important limitations, since it affects a few
+out-of-order processors in LLVM.
+
+As mentioned in section 'Instruction Issue', class Scheduler delegates to an
+instance of class ResourceManager the handling of processor resources.
+ResourceManager is where most of the scheduling logic is implemented.
+
+Adding support for instruction itineraries requires that we teach
+ResourceManager how to handle functional units and instruction stages. This
+development can be a future extension, and it would probably require a few
+changes to the ResourceManager interface.
+
+Instructions that affect control flow are not correctly modeled
+---------------------------------------------------------------
+Examples of instructions that affect the control flow are: return, indirect
+branches, calls, etc. The tool doesn't try to predict/evaluate branch targets.
+In particular, the tool doesn't model any sort of branch prediction, nor does
+it attempt to track changes to the program counter. The tool always assumes
+that the input assembly sequence is the body of a microbenchmark (a simple
+loop executed for a number of iterations). The "next" instruction in sequence
+is always the next instruction to dispatch.
+
+Call instructions default to an arbitrarily high latency of 100cy. A warning
+is generated if the tool encounters a call instruction in the sequence. Return
+instructions are not evaluated, and therefore control flow is not affected.
+However, the tool still queries the processor scheduling model to obtain
+latency information for instructions that affect the control flow.
+
+Possible extensions to the scheduling model
+-------------------------------------------
+Section "Instruction Dispatch" explained how the tool doesn't know about the
+register files, and temporaries available in each register file for register
+renaming purposes.
+
+The LLVM scheduling model could be extended to better describe register files.
+Ideally, the scheduling model should be able to define:
+ - The size of each register file.
+ - How many temporary registers are available for register renaming.
+ - How register classes map to register files.
+
+The scheduling model doesn't specify the retire throughput (i.e. how many
+instructions can be retired every cycle). Users can specify flag
+`-max-retire-per-cycle=N` to limit how many instructions the retire control
+unit can retire every cycle. Ideally, every processor should be able to
+specify the retire throughput (for example, by adding an extra field to the
+scheduling model tablegen class).
+
+Known limitations on X86 processors
+-----------------------------------
+
+1) Partial register updates versus full register updates.
+
+On x86-64, a 32-bit GPR write fully updates the super-register. Example:
+  add %edi, %eax    ## eax += edi
+
+Here, register %eax aliases the lower half of 64-bit register %rax. On x86-64,
+register %rax is fully updated by the 'add' (the upper half of %rax is
+zeroed). Essentially, it "kills" any previous definition of (the upper half
+of) register %rax.
+
+On the other hand, 8/16 bit register writes only perform a so-called "partial
+register update". Example:
+  add %di, %ax      ## ax += di
+
+Here, register %eax is only partially updated. To be more specific, the lower
+half of %eax is set, and the upper half is left unchanged. There is also no
+change in the upper 48 bits of register %rax.
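+As a consequence of the partial update, the new value of %rax depends both on
+the 'add' and on whatever instruction previously wrote the full register. For
+example, in the following hypothetical two-instruction sequence:
+
+  add %di, %ax      ## partial update of %rax
+  mov %rax, %rbx    ## reads the full %rax
+
+the 'mov' cannot treat the 'add' as the only producer of %rax: it also depends
+on the older definition of the upper bits.
+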
+
+To get accurate performance analysis, the tool has to know which instructions
+perform a partial register update, and which instructions fully update the
+destination's super-register.
+
+One way to expose this information is (again) via tablegen. For example, we
+could add a flag in the tablegen instruction class to tag instructions that
+perform partial register updates. Something like this: 'bit
+hasPartialRegisterUpdate = 1'. However, this would force a `let
+hasPartialRegisterUpdate = 0` on several instruction definitions.
+
+Another approach is to have a MCSubtargetInfo hook similar to this:
+  virtual bool updatesSuperRegisters(unsigned short opcode) { return false; }
+
+Targets will be able to override this method if needed. Again, this is just an
+idea. But the plan is to have this fixed as a future development.
+
+2) Macro Op fusion.
+
+The tool doesn't know about macro-op fusion. On modern x86 processors, a
+'cmp/test' followed by a 'jmp' is fused into a single macro operation. The
+advantage is that the fused pair only consumes a single slot in the dispatch
+group.
+
+As a future development, the tool should be extended to address macro-fusion.
+Ideally, we could have LLVM generate a table enumerating all the opcode pairs
+that can be fused together. That table could be exposed to the tool via the
+MCSubtargetInfo interface. This is just an idea; there may be better ways to
+implement this.
+
+3) Intel processors: mixing legacy SSE with AVX instructions.
+
+On modern Intel processors with AVX, mixing legacy SSE code with AVX code
+negatively impacts the performance. The tool is not aware of this issue, and
+the performance penalty is not accounted for when doing the analysis. This is
+something that we would like to improve in the future.
+
+4) Zero-latency register moves and Zero-idioms.
+
+Most modern AMD/Intel processors know how to optimize out register-register
+moves and zero idioms at the register renaming stage. The tool doesn't know
+about these patterns, and this may negatively impact the performance analysis.
+
+Known design problems
+---------------------
+This section describes two design issues that are currently affecting the
+tool. The long term plan is to "fix" these issues. Both limitations could
+easily be fixed by teaching the tool how to directly manipulate MachineInstr
+objects (instead of MCInst objects).
+
+1) Variant instructions not correctly modeled.
+
+The tool doesn't know how to analyze instructions with a "variant" scheduling
+class descriptor. A variant scheduling class needs to be resolved dynamically.
+The "actual" scheduling class often depends on the subtarget, as well as
+properties of the specific MachineInstr object.
+
+Unfortunately, the tool manipulates MCInst, and it doesn't know anything about
+MachineInstr. As a consequence, the tool cannot use the existing machine
+subtarget hooks that are normally used to resolve the variant scheduling
+class. This is a major design issue which mostly affects ARM/AArch64 targets.
+It mostly boils down to the fact that the existing scheduling framework was
+meant to work for MachineInstr.
+
+When the tool encounters a "variant" instruction, it assumes a generic 1cy
+latency. However, the tool would not be able to tell which processor resources
+are effectively consumed by the variant instruction.
+
+2) MCInst and MCInstrDesc.
+
+Performance analysis tools require data dependency information to correctly
+predict the runtime performance of the code.
+This tool must always be able to obtain the set of implicit/explicit register
+defs/uses for every instruction of the input assembly sequence.
+
+In the first section of this document, it was mentioned how the tool takes as
+input an assembly sequence. That sequence is parsed into an MCInst sequence
+with the help of assembly parsers available from the targets.
+
+An MCInst is a very low-level instruction representation. The tool can inspect
+the MCOperand sequence of an MCInst to identify register operands. However,
+there is no way to tell which register operands are definitions and which are
+uses.
+
+In LLVM, class MCInstrDesc is used to fully describe target instructions and
+their operands. The opcode of a machine instruction (a MachineInstr object)
+can be used to query the instruction set through method `MCInstrInfo::get` to
+obtain the associated MCInstrDesc object.
+
+However, class MCInstrDesc describes properties and operands of MachineInstr
+objects. Essentially, MCInstrDesc is not meant to be used to describe MCInst
+objects. To be more specific, MCInstrDesc objects are automatically generated
+via tablegen from the instruction set description in the target .td files. For
+example, field `MCInstrDesc::NumDefs` is always equal to the cardinality of
+the `(outs)` set from the tablegen instruction definition.
+
+By construction, register definitions always appear at the beginning of the
+MachineOperands list in MachineInstr. Basically, the (outs) are the first
+operands of a MachineInstr, and the (ins) will come after in the machine
+operand list. Knowing the number of register definitions is enough to identify
+all the register operands that are definitions.
+
+In a normal compilation process, MCInst objects are generated from
+MachineInstr objects through a lowering step. By default the lowering logic
+simply iterates over the machine operands of a MachineInstr, and
+converts/expands them into equivalent MCOperand objects.
+
+The default lowering strategy has the advantage of preserving all of the
+above-mentioned assumptions on the machine operand sequence. That means,
+register definitions would still be at the beginning of the MCOperand
+sequence, and register uses would come after.
+
+Targets may still define custom lowering routines for specific opcodes. Some
+of these routines may lower operands in a way that potentially breaks (some
+of) the assumptions on the machine operand sequence which were valid for
+MachineInstr. Luckily, this is not the most common form of lowering done by
+the targets, and the vast majority of the MachineInstr are lowered based on
+the default strategy which preserves the original machine operand sequence.
+This is especially true for x86, where the custom lowering logic always
+preserves the original (i.e. from the MachineInstr) operand sequence.
+
+This tool currently works under the strong (and potentially incorrect)
+assumption that register def/uses in a MCInst can always be identified by
+querying the machine instruction descriptor for the opcode. This assumption
+made it possible to develop this tool and get good numbers at least for the
+processors available in the x86 backend.
+
+That being said, the analysis is still potentially incorrect for other
+targets. So we plan (with the help of the community) to find a proper
+mechanism to map when possible MCOperand indices back to MachineOperand
+indices of the equivalent MachineInstr.
+This would be equivalent to describing changes made by the lowering step which
+affected the operand sequence. For example, we could have an index for every
+register MCOperand (or -1, if the operand didn't exist in the original
+MachineInstr). The mapping could look like this: <0,1,3,2>. Here, MCOperand #2
+was obtained from the lowering of MachineOperand #3, and so on.
+
+This information could be automatically generated via tablegen for all the
+instructions whose custom lowering step breaks assumptions made by the tool on
+the register operand sequence (in general, these instructions should be the
+minority of a target's instruction set). Unfortunately, we don't have that
+information now. As a consequence, we assume that the number of explicit
+register definitions is the same number specified in MCInstrDesc. We also
+assume that register definitions always come first in the operand sequence.
+
+In conclusion, for now these are the strong assumptions made by the tool:
+ * The number of explicit and implicit register definitions in a MCInst
+   matches the number of explicit and implicit definitions specified by the
+   MCInstrDesc object.
+ * Register uses always come after register definitions.
+ * If an opcode specifies an optional definition, then the optional
+   definition is always the last register operand in the sequence.
+
+Note that some of the information accessible from the MCInstrDesc is always
+valid for MCInst. For example: implicit register defs, implicit register uses
+and 'MayLoad/MayStore/HasUnmodeledSideEffects' opcode properties still apply
+to MCInst. The tool knows about this, and uses that information during its
+analysis.
+
+Future work
+-----------
+ * Address limitations (described in section "Known limitations").
+ * Integrate extra description in the processor models, and make it opt-in for
+   the targets (see section "Possible extensions to the scheduling model").
+ * Let processors specify the selection strategy for processor resource groups
+   and resources with multiple units. The tool currently uses a round-robin
+   selector to pick the next resource to use.
+ * Address limitations specifically described in section "Known limitations on
+   X86 processors".
+ * Address design issues identified in section "Known design problems".
+ * Define a standard interface for "Views". This would let users customize the
+   performance report generated by the tool.
+ * Simplify the Backend interface.
+
+When interfaces are mature/stable:
+ * Move the logic into a library. This will enable a number of other
+   interesting use cases.
Index: llvm/trunk/tools/llvm-mca/ResourcePressureView.h
===================================================================
--- llvm/trunk/tools/llvm-mca/ResourcePressureView.h
+++ llvm/trunk/tools/llvm-mca/ResourcePressureView.h
@@ -0,0 +1,113 @@
+//===--------------------- ResourcePressureView.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines class ResourcePressureView.
+/// Class ResourcePressureView observes hardware events generated by
+/// the Backend object and collects statistics related to resource usage at
+/// instruction granularity.
+/// Resource pressure information is then printed out to a stream in the
+/// form of a table like the one from the example below:
+///
+/// Resources:
+/// [0] - JALU0
+/// [1] - JALU1
+/// [2] - JDiv
+/// [3] - JFPM
+/// [4] - JFPU0
+/// [5] - JFPU1
+/// [6] - JLAGU
+/// [7] - JSAGU
+/// [8] - JSTC
+/// [9] - JVIMUL
+///
+/// Resource pressure per iteration:
+/// [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+/// 0.00   0.00   0.00   0.00   2.00   2.00   0.00   0.00   0.00   0.00
+///
+/// Resource pressure by instruction:
+/// [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+///  -      -      -      -      -     1.00    -      -      -      -     vpermilpd $1, %xmm0, %xmm1
+///  -      -      -      -     1.00    -      -      -      -      -     vaddps %xmm0, %xmm1, %xmm2
+///  -      -      -      -      -     1.00    -      -      -      -     vmovshdup %xmm2, %xmm3
+///  -      -      -      -     1.00    -      -      -      -      -     vaddss %xmm2, %xmm3, %xmm4
+///
+/// In this example, we have AVX code executed on AMD Jaguar (btver2).
+/// Both shuffles and vector floating point add operations on XMM registers
+/// have a reciprocal throughput of 1cy.
+/// Each add is issued to pipeline JFPU0, while each shuffle is issued to
+/// pipeline JFPU1. The overall pressure per iteration is reported by two
+/// tables: the first smaller table is the resource pressure per iteration;
+/// the second table reports resource pressure per instruction. Values are the
+/// average resource cycles consumed by an instruction.
+/// Every vector add from the example uses resource JFPU0 for an average of
+/// 1cy per iteration. Consequently, the resource pressure on JFPU0 is 2cy per
+/// iteration.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
+
+#include "HWEventListener.h"
+#include "SourceMgr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+class Backend;
+
+/// This class collects resource pressure statistics and it is able to print
+/// out all the collected information as a table to an output stream.
+class ResourcePressureView : public HWEventListener {
+  const llvm::MCSubtargetInfo &STI;
+  llvm::MCInstPrinter &MCIP;
+  const SourceMgr &Source;
+
+  // Map to quickly get a resource descriptor from a mask.
+  std::map<uint64_t, unsigned> Resource2VecIndex;
+
+  // Table of resources used by instructions.
+  std::vector<unsigned> ResourceUsage;
+  unsigned NumResourceUnits;
+
+  const llvm::MCInst &GetMCInstFromIndex(unsigned Index) const;
+  void printResourcePressurePerIteration(llvm::raw_ostream &OS,
+                                         unsigned Executions) const;
+  void printResourcePressurePerInstruction(llvm::raw_ostream &OS,
+                                           unsigned Executions) const;
+  void initialize(const llvm::ArrayRef<uint64_t> ProcResourceMasks);
+
+public:
+  ResourcePressureView(const llvm::MCSubtargetInfo &ST,
+                       llvm::MCInstPrinter &Printer, const SourceMgr &SM,
+                       const llvm::ArrayRef<uint64_t> ProcResourceMasks)
+      : STI(ST), MCIP(Printer), Source(SM) {
+    initialize(ProcResourceMasks);
+  }
+
+  void onInstructionIssued(
+      unsigned Index,
+      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used) override;
+
+  void printResourcePressure(llvm::raw_ostream &OS, unsigned Cycles) const {
+    unsigned Executions = Source.getNumIterations();
+    printResourcePressurePerIteration(OS, Executions);
+    printResourcePressurePerInstruction(OS, Executions);
+  }
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/ResourcePressureView.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/ResourcePressureView.cpp
+++ llvm/trunk/tools/llvm-mca/ResourcePressureView.cpp
@@ -0,0 +1,176 @@
+//===--------------------- ResourcePressureView.cpp -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods in the ResourcePressureView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ResourcePressureView.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace mca {
+
+using namespace llvm;
+
+void ResourcePressureView::initialize(
+    const ArrayRef<uint64_t> ProcResourceMasks) {
+  // Populate the map of resource descriptors.
+  unsigned R2VIndex = 0;
+  const MCSchedModel &SM = STI.getSchedModel();
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+    if (ProcResource.SubUnitsIdxBegin || !NumUnits)
+      continue;
+
+    uint64_t ResourceMask = ProcResourceMasks[I];
+    Resource2VecIndex.insert(
+        std::pair<uint64_t, unsigned>(ResourceMask, R2VIndex));
+    R2VIndex += ProcResource.NumUnits;
+  }
+
+  NumResourceUnits = R2VIndex;
+  ResourceUsage.resize(NumResourceUnits * (Source.size() + 1));
+  std::fill(ResourceUsage.begin(), ResourceUsage.end(), 0);
+}
+
+void ResourcePressureView::onInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  unsigned SourceIdx = Index % Source.size();
+  for (const std::pair<ResourceRef, unsigned> &Use : Used) {
+    const ResourceRef &RR = Use.first;
+    assert(Resource2VecIndex.find(RR.first) != Resource2VecIndex.end());
+    unsigned R2VIndex = Resource2VecIndex[RR.first];
+    R2VIndex += countTrailingZeros(RR.second);
+    ResourceUsage[R2VIndex + NumResourceUnits * SourceIdx] += Use.second;
+    ResourceUsage[R2VIndex + NumResourceUnits * Source.size()] += Use.second;
+  }
+}
+
+static void printColumnNames(raw_string_ostream &OS, const MCSchedModel &SM) {
+  for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds();
+       I < E; ++I) {
+    const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
+    unsigned NumUnits = ProcResource.NumUnits;
+    // Skip groups and invalid resources with zero units.
+ if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + if (NumUnits == 1) { + OS << '[' << ResourceIndex << ']'; + if (ResourceIndex < 10) + OS << " "; + else + OS << " "; + ResourceIndex++; + continue; + } + + for (unsigned J = 0; J < NumUnits; ++J) { + OS << "[" << ResourceIndex << '.' << J << ']'; + if (ResourceIndex < 10) + OS << " "; + else + OS << ' '; + } + ResourceIndex++; + } +} + +void ResourcePressureView::printResourcePressurePerIteration( + raw_ostream &OS, unsigned Executions) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream << "\n\nResources:\n"; + const MCSchedModel &SM = STI.getSchedModel(); + for (unsigned I = 1, ResourceIndex = 0, E = SM.getNumProcResourceKinds(); + I < E; ++I) { + const MCProcResourceDesc &ProcResource = *SM.getProcResource(I); + unsigned NumUnits = ProcResource.NumUnits; + // Skip groups and invalid resources with zero units. + if (ProcResource.SubUnitsIdxBegin || !NumUnits) + continue; + + for (unsigned J = 0; J < NumUnits; ++J) { + TempStream << '[' << ResourceIndex; + if (NumUnits > 1) + TempStream << '.' << J; + TempStream << "] - " << ProcResource.Name << '\n'; + } + + ResourceIndex++; + } + + TempStream << "\n\nResource pressure per iteration:\n"; + printColumnNames(TempStream, SM); + TempStream << '\n'; + + for (unsigned I = 0; I < NumResourceUnits; ++I) { + unsigned Usage = ResourceUsage[I + Source.size() * NumResourceUnits]; + if (!Usage) { + TempStream << " - "; + continue; + } + + double Pressure = (double)Usage / Executions; + TempStream << format("%.2f", Pressure); + if (Pressure < 10.0) + TempStream << " "; + else if (Pressure < 100.0) + TempStream << " "; + else + TempStream << ' '; + } + + TempStream.flush(); + OS << Buffer; +} + +void ResourcePressureView::printResourcePressurePerInstruction( + raw_ostream &OS, unsigned Executions) const { + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream << "\n\nResource pressure by instruction:\n"; + printColumnNames(TempStream, STI.getSchedModel()); + TempStream << "\tInstructions:\n"; + + for (unsigned I = 0, E = Source.size(); I < E; ++I) { + for (unsigned J = 0; J < NumResourceUnits; ++J) { + unsigned Usage = ResourceUsage[J + I * NumResourceUnits]; + if (Usage == 0) { + TempStream << " - "; + } else { + double Pressure = (double)Usage / Executions; + if (Pressure < 0.005) { + TempStream << " - "; + } else { + TempStream << format("%.2f", Pressure); + if (Pressure < 10.0) + TempStream << " "; + else if (Pressure < 100.0) + TempStream << " "; + else + TempStream << ' '; + } + } + } + + MCIP.printInst(&Source.getMCInstFromIndex(I), TempStream, "", STI); + TempStream << '\n'; + TempStream.flush(); + OS << Buffer; + Buffer = ""; + } +} + +} // namespace mca Index: llvm/trunk/tools/llvm-mca/Scheduler.h =================================================================== --- llvm/trunk/tools/llvm-mca/Scheduler.h +++ llvm/trunk/tools/llvm-mca/Scheduler.h @@ -0,0 +1,543 @@ +//===--------------------- Scheduler.h ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A scheduler for Processor Resource Units and Processor Resource Groups. 
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+#define LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
+
+#include "Instruction.h"
+#include "LSUnit.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include <map>
+
+namespace mca {
+
+class Backend;
+/// Used to describe the internal state of a processor resource.
+///
+/// A processor resource is available if it is not reserved, and there are
+/// available slots in the buffer. A processor resource is unavailable if it
+/// is either reserved, or the associated buffer is full. A processor resource
+/// with a buffer size of -1 is always available if it is not reserved.
+///
+/// Values of type ResourceStateEvent are returned by method
+/// ResourceState::isBufferAvailable(), which is used to query the internal
+/// state of a resource.
+///
+/// The naming convention for resource state events is:
+///  * Event names start with prefix RS_
+///  * Prefix RS_ is followed by a string describing the actual resource state.
enum ResourceStateEvent {
+  RS_BUFFER_AVAILABLE,
+  RS_BUFFER_UNAVAILABLE,
+  RS_RESERVED
+};
+
+/// \brief A descriptor for processor resources.
+///
+/// Each object of class ResourceState is associated to a specific processor
+/// resource. There is an instance of this class for every processor resource
+/// defined by the scheduling model.
+/// A ResourceState dynamically tracks the availability of units of a processor
+/// resource. For example, the ResourceState of a ProcResGroup tracks the
+/// availability of resource units which are part of the group.
+///
+/// Internally, ResourceState uses a round-robin selector to identify
+/// which unit of the group shall be used next.
+class ResourceState {
+  // A resource unique identifier generated by the tool.
+  // For processor resource groups, the number of bits set in this
+  // mask is equivalent to the cardinality of the group plus one.
+  // Excluding the most significant bit, the remaining bits in the resource
+  // mask identify resources that are part of the group.
+  //
+  // For example, let's assume that this ResourceState describes a
+  // group containing ResourceA and ResourceB:
+  //
+  //  ResourceA  -- Mask: 0b001
+  //  ResourceB  -- Mask: 0b010
+  //  ResourceAB -- Mask: 0b100 U (ResourceA::Mask | ResourceB::Mask) == 0b111
+  //
+  // There is only one bit set for non-group resources.
+  // A ResourceMask can be used to solve set membership problems with simple
+  // bit manipulation operations.
+  uint64_t ResourceMask;
+
+  // A ProcResource can specify a number of units. For the purpose of dynamic
+  // scheduling, a processor resource with more than one unit behaves like a
+  // group. This field has one bit set for every unit/resource that is part of
+  // the group.
+  // For groups, this field defaults to 'ResourceMask'. For non-group
+  // resources, the number of bits set in this mask is equivalent to the
+  // number of units (i.e. field 'NumUnits' in 'ProcResourceUnits').
+  uint64_t ResourceSizeMask;
+
+  // A simple round-robin selector for processor resources.
+  // Each bit of the mask identifies a sub resource within this group.
+  //
+  // As an example, let's assume that this ResourceState describes a
+  // processor resource group composed of the following three units:
+  //   ResourceA -- 0b001
+  //   ResourceB -- 0b010
+  //   ResourceC -- 0b100
+  //
+  // Each unit is identified by a ResourceMask which always contains a
+  // single bit set.
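+  // (As a concrete membership test: checking whether ResourceB is part of
+  // the group above reduces to a single AND, e.g. (0b111 & 0b010) != 0.)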
+  // Field NextInSequenceMask is initially set to value
+  // 0b111. That value is obtained by OR'ing the resource masks of the
+  // processor resources that are part of the group.
+  //
+  //   NextInSequenceMask -- 0b111
+  //
+  // Field NextInSequenceMask is used by the resource manager (i.e.
+  // an object of class ResourceManager) to select the "next available
+  // resource" from the set. The algorithm would prioritize resources with a
+  // bigger ResourceMask value.
+  //
+  // In this example, there are three resources in the set, and 'ResourceC'
+  // has the highest mask value. The round-robin selector would first select
+  // 'ResourceC', then 'ResourceB', and eventually 'ResourceA'.
+  //
+  // When a resource R is used, its corresponding bit is cleared from the set.
+  //
+  // Back to the example:
+  // If 'ResourceC' is selected, then the new value of NextInSequenceMask
+  // becomes 0b011.
+  //
+  // When NextInSequenceMask becomes zero, it is reset to its original value
+  // (in this example, that value would be 0b111).
+  uint64_t NextInSequenceMask;
+
+  // Some instructions can only be issued on very specific pipeline resources.
+  // For those instructions, we know exactly which resource would be consumed
+  // without having to dynamically select it using field 'NextInSequenceMask'.
+  //
+  // The resource mask bit associated to the (statically) selected
+  // processor resource is still cleared from the 'NextInSequenceMask'.
+  // If that bit was already zero in NextInSequenceMask, then we update
+  // mask 'RemovedFromNextInSequence'.
+  //
+  // When NextInSequenceMask is reset back to its initial value, the algorithm
+  // removes any bits which are set in RemovedFromNextInSequence.
+  uint64_t RemovedFromNextInSequence;
+
+  // A mask of ready units.
+  uint64_t ReadyMask;
+
+  // Buffered resources will have this field set to a positive number.
+  // A buffered resource behaves like a separate reservation station
+  // implementing its own buffer for out-of-order execution.
+  // A buffer of 1 is for units that force in-order execution.
+  // A value of 0 is treated specially. In particular, a resource with a
+  // BufferSize = 0 is for an in-order issue/dispatch resource.
+  // That means the resource is reserved starting from the dispatch event,
+  // until all the "resource cycles" are consumed after the issue event.
+  // While this resource is reserved, no other instruction may be dispatched.
+  int BufferSize;
+
+  // Available slots in the buffer (zero, if this is not a buffered resource).
+  unsigned AvailableSlots;
+
+  // Maximum number of buffer slots seen used during one cycle.
+  // This helps tracking dynamic dispatch stalls caused by the lack of
+  // entries in the scheduler's queue.
+  unsigned MaxUsedSlots;
+
+  // True if this resource is currently unavailable.
+  // An instruction may "reserve" a resource for a number of cycles.
+  // During those cycles, the reserved resource cannot be used for other
+  // instructions, even if the ReadyMask is set.
+  bool Unavailable;
+
+  bool isSubResourceReady(uint64_t ID) const { return ReadyMask & ID; }
+
+  /// Returns the mask identifier of the next available resource in the set.
+  uint64_t getNextInSequence() const {
+    assert(NextInSequenceMask);
+    return llvm::PowerOf2Floor(NextInSequenceMask);
+  }
+
+  /// Returns the mask of the next available resource within the set,
+  /// and updates the resource selector.
+  void updateNextInSequence() {
+    NextInSequenceMask ^= getNextInSequence();
+    if (!NextInSequenceMask)
+      NextInSequenceMask = ResourceSizeMask;
+  }
+
+  uint64_t computeResourceSizeMaskForGroup(uint64_t ResourceMask) {
+    assert(llvm::countPopulation(ResourceMask) > 1);
+    return ResourceMask ^ llvm::PowerOf2Floor(ResourceMask);
+  }
+
+public:
+  ResourceState(const llvm::MCProcResourceDesc &Desc, uint64_t Mask)
+      : ResourceMask(Mask) {
+    bool IsAGroup = llvm::countPopulation(ResourceMask) > 1;
+    ResourceSizeMask = IsAGroup ? computeResourceSizeMaskForGroup(ResourceMask)
+                                : ((1ULL << Desc.NumUnits) - 1);
+    NextInSequenceMask = ResourceSizeMask;
+    RemovedFromNextInSequence = 0;
+    ReadyMask = ResourceSizeMask;
+    BufferSize = Desc.BufferSize;
+    AvailableSlots = BufferSize == -1 ? 0U : static_cast<unsigned>(BufferSize);
+    MaxUsedSlots = 0;
+    Unavailable = false;
+  }
+
+  uint64_t getResourceMask() const { return ResourceMask; }
+  int getBufferSize() const { return BufferSize; }
+  unsigned getMaxUsedSlots() const { return MaxUsedSlots; }
+
+  bool isBuffered() const { return BufferSize > 0; }
+  bool isInOrder() const { return BufferSize == 1; }
+  bool isADispatchHazard() const { return BufferSize == 0; }
+  bool isReserved() const { return Unavailable; }
+
+  void setReserved() { Unavailable = true; }
+  void clearReserved() { Unavailable = false; }
+
+  // A resource is ready if it is not reserved, and if there are enough
+  // available units.
+  // If a resource is also a dispatch hazard, then we don't check if
+  // it is reserved because that check would always return true.
+  // A resource marked as "dispatch hazard" is always reserved at
+  // dispatch time. When this method is called, the assumption is that
+  // the user of this resource has already been dispatched.
+  bool isReady(unsigned NumUnits = 1) const {
+    return (!isReserved() || isADispatchHazard()) &&
+           llvm::countPopulation(ReadyMask) >= NumUnits;
+  }
+  bool isAResourceGroup() const {
+    return llvm::countPopulation(ResourceMask) > 1;
+  }
+
+  bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
+
+  void markSubResourceAsUsed(uint64_t ID) {
+    assert(isSubResourceReady(ID));
+    ReadyMask ^= ID;
+  }
+
+  void releaseSubResource(uint64_t ID) {
+    assert(!isSubResourceReady(ID));
+    ReadyMask ^= ID;
+  }
+
+  unsigned getNumUnits() const {
+    return isAResourceGroup() ? 1U : llvm::countPopulation(ResourceSizeMask);
+  }
+
+  uint64_t selectNextInSequence();
+  void removeFromNextInSequence(uint64_t ID);
+
+  ResourceStateEvent isBufferAvailable() const {
+    if (isADispatchHazard() && isReserved())
+      return RS_RESERVED;
+    if (!isBuffered() || AvailableSlots)
+      return RS_BUFFER_AVAILABLE;
+    return RS_BUFFER_UNAVAILABLE;
+  }
+
+  void reserveBuffer() {
+    if (AvailableSlots)
+      AvailableSlots--;
+    unsigned UsedSlots = static_cast<unsigned>(BufferSize) - AvailableSlots;
+    MaxUsedSlots = std::max(MaxUsedSlots, UsedSlots);
+  }
+
+  void releaseBuffer() {
+    if (BufferSize > 0)
+      AvailableSlots++;
+    assert(AvailableSlots <= static_cast<unsigned>(BufferSize));
+  }
+
+#ifndef NDEBUG
+  void dump() const;
+#endif
+};
+
+/// \brief A resource unit identifier.
+///
+/// This is used to identify a specific processor resource unit using a pair
+/// of indices where the 'first' index is a processor resource mask, and the
+/// 'second' index is an index for a "sub-resource" (i.e. unit).
+typedef std::pair<uint64_t, uint64_t> ResourceRef;
+
+// First: a resource mask identifying a buffered resource.
+// Second: max number of buffer entries used in this resource.
+typedef std::pair<uint64_t, unsigned> BufferUsageEntry;
+
+/// A resource manager for processor resource units and groups.
+///
+/// This class owns all the ResourceState objects, and it is responsible for
+/// acting on requests from a Scheduler by updating the internal state of
+/// ResourceState objects.
+/// This class doesn't know about instruction itineraries and functional units.
+/// In the future, it can be extended to support itineraries too through the
+/// same public interface.
+class ResourceManager {
+  // The resource manager owns all the ResourceState objects.
+  using UniqueResourceState = std::unique_ptr<ResourceState>;
+  llvm::SmallDenseMap<uint64_t, UniqueResourceState> Resources;
+
+  // Keeps track of which resources are busy, and how many cycles are left
+  // before those become usable again.
+  llvm::SmallDenseMap<ResourceRef, unsigned> BusyResources;
+
+  // A table to map processor resource IDs to processor resource masks.
+  llvm::SmallVector<uint64_t, 8> ProcResID2Mask;
+
+  // Adds a new resource state in Resources, as well as a new descriptor in
+  // ResourceDescriptor.
+  void addResource(const llvm::MCProcResourceDesc &Desc, uint64_t Mask);
+
+  // Compute processor resource masks for each processor resource declared by
+  // the scheduling model.
+  void computeProcResourceMasks(const llvm::MCSchedModel &SM);
+
+  // Populate resource descriptors.
+  void initialize(const llvm::MCSchedModel &SM) {
+    computeProcResourceMasks(SM);
+    for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I)
+      addResource(*SM.getProcResource(I), ProcResID2Mask[I]);
+  }
+
+  // Returns the actual resource unit that will be used.
+  ResourceRef selectPipe(uint64_t ResourceID);
+
+  void use(ResourceRef RR);
+  void release(ResourceRef RR);
+
+  unsigned getNumUnits(uint64_t ResourceID) const {
+    assert(Resources.find(ResourceID) != Resources.end());
+    return Resources.find(ResourceID)->getSecond()->getNumUnits();
+  }
+
+  // Reserve a specific Resource kind.
+  void reserveBuffer(uint64_t ResourceID) {
+    assert(isBufferAvailable(ResourceID) ==
+           ResourceStateEvent::RS_BUFFER_AVAILABLE);
+    ResourceState &Resource = *Resources[ResourceID];
+    Resource.reserveBuffer();
+  }
+
+  void releaseBuffer(uint64_t ResourceID) {
+    Resources[ResourceID]->releaseBuffer();
+  }
+
+  ResourceStateEvent isBufferAvailable(uint64_t ResourceID) const {
+    const ResourceState &Resource = *Resources.find(ResourceID)->second;
+    return Resource.isBufferAvailable();
+  }
+
+  bool isReady(uint64_t ResourceID, unsigned NumUnits) const {
+    const ResourceState &Resource = *Resources.find(ResourceID)->second;
+    return Resource.isReady(NumUnits);
+  }
+
+public:
+  ResourceManager(const llvm::MCSchedModel &SM) { initialize(SM); }
+
+  ResourceStateEvent canBeDispatched(const InstrDesc &Desc) const {
+    ResourceStateEvent Result = ResourceStateEvent::RS_BUFFER_AVAILABLE;
+    for (uint64_t Buffer : Desc.Buffers) {
+      Result = isBufferAvailable(Buffer);
+      if (Result != ResourceStateEvent::RS_BUFFER_AVAILABLE)
+        break;
+    }
+
+    return Result;
+  }
+
+  void reserveBuffers(const InstrDesc &Desc) {
+    for (const uint64_t R : Desc.Buffers)
+      reserveBuffer(R);
+  }
+
+  void releaseBuffers(const InstrDesc &Desc) {
+    for (const uint64_t R : Desc.Buffers)
+      releaseBuffer(R);
+  }
+
+  void reserveResource(uint64_t ResourceID) {
+    ResourceState &Resource = *Resources[ResourceID];
+    assert(!Resource.isReserved());
+    Resource.setReserved();
+  }
+
+  void releaseResource(uint64_t ResourceID) {
+    ResourceState &Resource = *Resources[ResourceID];
+    Resource.clearReserved();
+  }
+
+  void reserveDispatchHazardResources(const InstrDesc &Desc);
+
+  // Returns true if all resources are in-order, and there is at least one
+  // resource which is a dispatch hazard (BufferSize = 0).
+  bool mustIssueImmediately(const InstrDesc &Desc);
+
+  bool canBeIssued(const InstrDesc &Desc) const;
+  double getRThroughput(const InstrDesc &Desc) const;
+
+  void issueInstruction(
+      unsigned Index, const InstrDesc &Desc,
+      llvm::SmallVectorImpl<std::pair<ResourceRef, unsigned>> &Pipes);
+
+  void cycleEvent(llvm::SmallVectorImpl<ResourceRef> &ResourcesFreed);
+
+  void getBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
+    for (const std::pair<uint64_t, UniqueResourceState> &Resource : Resources) {
+      const ResourceState &RS = *Resource.second;
+      if (RS.isBuffered())
+        Usage.emplace_back(std::pair<uint64_t, unsigned>(RS.getResourceMask(),
+                                                         RS.getMaxUsedSlots()));
+    }
+  }
+
+  const llvm::ArrayRef<uint64_t> getProcResMasks() const {
+    return ProcResID2Mask;
+  }
+
+#ifndef NDEBUG
+  void dump() const {
+    for (const std::pair<uint64_t, UniqueResourceState> &Resource : Resources)
+      Resource.second->dump();
+  }
+#endif
+};
+
+/// Class Scheduler is responsible for issuing instructions to pipeline
+/// resources. Internally, it delegates to a ResourceManager the management of
+/// processor resources.
+/// This class is also responsible for tracking the progress of instructions
+/// from the dispatch stage, until the write-back stage.
+///
+/// An instruction dispatched to the Scheduler is initially placed into either
+/// the 'WaitQueue' or the 'ReadyQueue' depending on the availability of the
+/// input operands. Instructions in the WaitQueue are ordered by instruction
+/// index. An instruction is moved from the WaitQueue to the ReadyQueue when
+/// register operands become available, and all memory dependencies are met.
+/// Instructions that are moved from the WaitQueue to the ReadyQueue transition
+/// from state 'IS_AVAILABLE' to state 'IS_READY'.
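+/// For example, an addition whose input registers are still being computed
+/// enters the WaitQueue in state IS_AVAILABLE; once the producing
+/// instructions reach the write-back stage, it is promoted to the ReadyQueue
+/// in state IS_READY.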
+///
+/// At the beginning of each cycle, the Scheduler checks if there are
+/// instructions in the WaitQueue that can be moved to the ReadyQueue. If the
+/// ReadyQueue is not empty, then older instructions from the queue are issued
+/// to the processor pipelines, and the underlying ResourceManager is updated
+/// accordingly. The ReadyQueue is ordered by instruction index to guarantee
+/// that the first instructions in the set are also the oldest.
+///
+/// An instruction is moved from the ReadyQueue to the `IssuedQueue` when it is
+/// issued to one or more pipelines. This event also causes an instruction
+/// state transition (i.e. from state IS_READY to state IS_EXECUTING).
+/// An instruction leaves the IssuedQueue when it reaches the write-back stage.
+class Scheduler {
+  const llvm::MCSchedModel &SM;
+
+  // Hardware resources that are managed by this scheduler.
+  std::unique_ptr<ResourceManager> Resources;
+  std::unique_ptr<LSUnit> LSU;
+
+  // The Backend gets notified when instructions are ready/issued/executed.
+  Backend *Owner;
+
+  using QueueEntryTy = std::pair<unsigned, Instruction *>;
+  std::map<unsigned, Instruction *> WaitQueue;
+  std::map<unsigned, Instruction *> ReadyQueue;
+  std::map<unsigned, Instruction *> IssuedQueue;
+
+  void notifyInstructionIssued(
+      unsigned Index,
+      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used);
+  void notifyInstructionExecuted(unsigned Index);
+  void notifyInstructionReady(unsigned Index);
+  void notifyResourceAvailable(const ResourceRef &RR);
+
+  /// Issue instructions from the ready queue by giving priority to older
+  /// instructions.
+  void issue();
+
+  /// Issue an instruction without updating the ready queue.
+  void issueInstruction(Instruction *IS, unsigned InstrIndex);
+  void updatePendingQueue();
+  void updateIssuedQueue();
+
+public:
+  Scheduler(Backend *B, const llvm::MCSchedModel &Model, unsigned LoadQueueSize,
+            unsigned StoreQueueSize, bool AssumeNoAlias)
+      : SM(Model), Resources(llvm::make_unique<ResourceManager>(SM)),
+        LSU(llvm::make_unique<LSUnit>(LoadQueueSize, StoreQueueSize,
+                                      AssumeNoAlias)),
+        Owner(B) {}
+
+  /// Scheduling events.
+  ///
+  /// The DispatchUnit is responsible for querying the Scheduler before
+  /// dispatching new instructions. Queries are performed through method
+  /// `Scheduler::canBeDispatched`, which returns an instance of this enum to
+  /// tell if the dispatch would fail or not. If scheduling resources are
+  /// available, and the instruction can be dispatched, then the query returns
+  /// HWS_AVAILABLE. A value different from HWS_AVAILABLE means that the
+  /// instruction cannot be dispatched during this cycle.
+  ///
+  /// Each event name starts with prefix "HWS_", and it is followed by
+  /// a substring which describes the reason why the Scheduler was unavailable
+  /// (or "AVAILABLE" if the instruction is allowed to be dispatched).
+  ///
+  /// HWS_QUEUE_UNAVAILABLE is returned if there are not enough available slots
+  /// in the scheduler's queue. That means that one (or more) of the buffered
+  /// resources consumed by the instruction is full.
+  ///
+  /// HWS_LD_QUEUE_UNAVAILABLE is returned when an instruction 'mayLoad', and
+  /// the load queue in the load/store unit (implemented by class LSUnit) is
+  /// full. Similarly, HWS_ST_QUEUE_UNAVAILABLE is returned when the store
+  /// queue is full, and the instruction to be dispatched 'mayStore'.
+  ///
+  /// HWS_DISPATCH_GROUP_RESTRICTION is only returned in special cases where the
+  /// instruction consumes an in-order issue/dispatch resource (i.e. a resource
+  /// with `BufferSize=0`), and the pipeline resource is not immediately
+  /// available.
+ enum Event { + HWS_AVAILABLE, + HWS_QUEUE_UNAVAILABLE, + HWS_DISPATCH_GROUP_RESTRICTION, + HWS_LD_QUEUE_UNAVAILABLE, + HWS_ST_QUEUE_UNAVAILABLE + }; + + Event canBeDispatched(const InstrDesc &Desc) const; + Instruction *scheduleInstruction(unsigned Idx, Instruction *MCIS); + + double getRThroughput(const InstrDesc &Desc) const { + return Resources->getRThroughput(Desc); + } + + void cycleEvent(unsigned Cycle); + + void getBuffersUsage(std::vector &Usage) const { + Resources->getBuffersUsage(Usage); + } + + const llvm::ArrayRef getProcResourceMasks() const { + return Resources->getProcResMasks(); + } + +#ifndef NDEBUG + void dump() const; +#endif +}; + +} // Namespace mca + +#endif Index: llvm/trunk/tools/llvm-mca/Scheduler.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/Scheduler.cpp +++ llvm/trunk/tools/llvm-mca/Scheduler.cpp @@ -0,0 +1,458 @@ +//===--------------------- Scheduler.cpp ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A scheduler for processor resource units and processor resource groups. +// +//===----------------------------------------------------------------------===// + +#include "Scheduler.h" +#include "Backend.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "llvm-mca" + +namespace mca { + +using namespace llvm; + +uint64_t ResourceState::selectNextInSequence() { + assert(isReady()); + uint64_t Next = getNextInSequence(); + while (!isSubResourceReady(Next)) { + updateNextInSequence(); + Next = getNextInSequence(); + } + return Next; +} + +#ifndef NDEBUG +void ResourceState::dump() const { + dbgs() << "MASK: " << ResourceMask << ", SIZE_MASK: " << ResourceSizeMask + << ", NEXT: " << NextInSequenceMask << ", RDYMASK: " << ReadyMask + << ", BufferSize=" << BufferSize + << ", AvailableSlots=" << AvailableSlots + << ", Reserved=" << Unavailable << '\n'; +} +#endif + +// Adds a new resource state in Resources, as well as a new descriptor in +// ResourceDescriptor. Map 'Resources' allows to quickly obtain ResourceState +// objects from resource mask identifiers. +void ResourceManager::addResource(const MCProcResourceDesc &Desc, + uint64_t Mask) { + assert(Resources.find(Mask) == Resources.end() && "Resource already added!"); + Resources[Mask] = llvm::make_unique(Desc, Mask); +} + +// Populate vector ProcResID2Mask with resource masks. One per each processor +// resource declared by the scheduling model. +void ResourceManager::computeProcResourceMasks(const MCSchedModel &SM) { + unsigned ProcResourceID = 0; + + // Create a unique bitmask for every processor resource unit. + // Skip resource at index 0, since it always references 'InvalidUnit'. + ProcResID2Mask.resize(SM.getNumProcResourceKinds()); + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc &Desc = *SM.getProcResource(I); + if (Desc.SubUnitsIdxBegin) + continue; + ProcResID2Mask[I] = 1ULL << ProcResourceID; + ProcResourceID++; + } + + // Create a unique bitmask for every processor resource group. 
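+  // For example, given units ResourceA (0b001) and ResourceB (0b010), a group
+  // containing both is assigned a fresh bit plus the masks of its sub-units:
+  // 0b100 | 0b001 | 0b010 == 0b111.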
+  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+    if (!Desc.SubUnitsIdxBegin)
+      continue;
+    ProcResID2Mask[I] |= 1ULL << ProcResourceID;
+    for (unsigned U = 0; U < Desc.NumUnits; ++U) {
+      uint64_t OtherMask = ProcResID2Mask[Desc.SubUnitsIdxBegin[U]];
+      ProcResID2Mask[I] |= OtherMask;
+    }
+    ProcResourceID++;
+  }
+}
+
+// Returns the actual resource consumed by this Use.
+// First is the primary resource ID; second is the specific sub-resource ID.
+std::pair<uint64_t, uint64_t> ResourceManager::selectPipe(uint64_t ResourceID) {
+  ResourceState &RS = *Resources[ResourceID];
+  uint64_t SubResourceID = RS.selectNextInSequence();
+  if (RS.isAResourceGroup())
+    return selectPipe(SubResourceID);
+  return std::pair<uint64_t, uint64_t>(ResourceID, SubResourceID);
+}
+
+void ResourceState::removeFromNextInSequence(uint64_t ID) {
+  assert(NextInSequenceMask);
+  assert(countPopulation(ID) == 1);
+  if (ID > getNextInSequence())
+    RemovedFromNextInSequence |= ID;
+  NextInSequenceMask = NextInSequenceMask & (~ID);
+  if (!NextInSequenceMask) {
+    NextInSequenceMask = ResourceSizeMask;
+    assert(NextInSequenceMask != RemovedFromNextInSequence);
+    NextInSequenceMask ^= RemovedFromNextInSequence;
+    RemovedFromNextInSequence = 0;
+  }
+}
+
+void ResourceManager::use(ResourceRef RR) {
+  // Mark the sub-resource referenced by RR as used.
+  ResourceState &RS = *Resources[RR.first];
+  RS.markSubResourceAsUsed(RR.second);
+  // If there are still available units in RR.first,
+  // then we are done.
+  if (RS.isReady())
+    return;
+
+  // Notify other resources that RR.first is no longer available.
+  for (const std::pair<uint64_t, UniqueResourceState> &Res : Resources) {
+    ResourceState &Current = *Res.second.get();
+    if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+      continue;
+
+    if (Current.containsResource(RR.first)) {
+      Current.markSubResourceAsUsed(RR.first);
+      Current.removeFromNextInSequence(RR.first);
+    }
+  }
+}
+
+void ResourceManager::release(ResourceRef RR) {
+  ResourceState &RS = *Resources[RR.first];
+  bool WasFullyUsed = !RS.isReady();
+  RS.releaseSubResource(RR.second);
+  if (!WasFullyUsed)
+    return;
+
+  for (const std::pair<uint64_t, UniqueResourceState> &Res : Resources) {
+    ResourceState &Current = *Res.second.get();
+    if (!Current.isAResourceGroup() || Current.getResourceMask() == RR.first)
+      continue;
+
+    if (Current.containsResource(RR.first))
+      Current.releaseSubResource(RR.first);
+  }
+}
+
+void ResourceManager::reserveDispatchHazardResources(const InstrDesc &Desc) {
+  for (const uint64_t R : Desc.Buffers) {
+    ResourceState &Resource = *Resources[R];
+    if (Resource.isADispatchHazard()) {
+      assert(!Resource.isReserved());
+      Resource.setReserved();
+    }
+  }
+}
+
+bool ResourceManager::canBeIssued(const InstrDesc &Desc) const {
+  return std::all_of(Desc.Resources.begin(), Desc.Resources.end(),
+                     [&](const std::pair<uint64_t, ResourceUsage> &E) {
+                       unsigned NumUnits =
+                           E.second.isReserved() ? 0U : E.second.NumUnits;
+                       return isReady(E.first, NumUnits);
+                     });
+}
+
+// Returns true if all resources are in-order, and there is at least one
+// resource which is a dispatch hazard (BufferSize = 0).
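+// When it returns true, the Scheduler bypasses the ReadyQueue and issues the
+// instruction right at dispatch (see Scheduler::scheduleInstruction).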
+bool ResourceManager::mustIssueImmediately(const InstrDesc &Desc) {
+  if (!canBeIssued(Desc))
+    return false;
+  bool AllInOrderResources = std::all_of(
+      Desc.Buffers.begin(), Desc.Buffers.end(), [&](const uint64_t BufferMask) {
+        const ResourceState &Resource = *Resources[BufferMask];
+        return Resource.isInOrder() || Resource.isADispatchHazard();
+      });
+  if (!AllInOrderResources)
+    return false;
+
+  return std::any_of(Desc.Buffers.begin(), Desc.Buffers.end(),
+                     [&](const uint64_t BufferMask) {
+                       return Resources[BufferMask]->isADispatchHazard();
+                     });
+}
+
+double ResourceManager::getRThroughput(const InstrDesc &ID) const {
+  double RThroughput = 0;
+  for (const std::pair<uint64_t, ResourceUsage> &Usage : ID.Resources) {
+    const CycleSegment &CS = Usage.second.CS;
+    assert(CS.begin() == 0);
+
+    if (Usage.second.isReserved()) {
+      RThroughput = std::max(RThroughput, (double)CS.size());
+      continue;
+    }
+
+    unsigned Population = std::max(1U, countPopulation(Usage.first) - 1);
+    unsigned NumUnits = Population * getNumUnits(Usage.first);
+    NumUnits -= Usage.second.NumUnits - 1;
+    unsigned Cycles = CS.size();
+    RThroughput = std::max(RThroughput, (double)Cycles / NumUnits);
+  }
+  return RThroughput;
+}
+
+void ResourceManager::issueInstruction(
+    unsigned Index, const InstrDesc &Desc,
+    SmallVectorImpl<std::pair<ResourceRef, unsigned>> &Pipes) {
+  releaseBuffers(Desc);
+  for (const std::pair<uint64_t, ResourceUsage> &R : Desc.Resources) {
+    const CycleSegment &CS = R.second.CS;
+    if (!CS.size()) {
+      releaseResource(R.first);
+      continue;
+    }
+
+    assert(CS.begin() == 0 && "Invalid {Start, End} cycles!");
+    if (!R.second.isReserved()) {
+      ResourceRef Pipe = selectPipe(R.first);
+      use(Pipe);
+      BusyResources[Pipe] += CS.size();
+      Pipes.emplace_back(std::pair<ResourceRef, unsigned>(Pipe, CS.size()));
+    } else {
+      assert((countPopulation(R.first) > 1) && "Expected a group!");
+      // Mark this group as reserved.
+      assert(R.second.isReserved());
+      reserveResource(R.first);
+      BusyResources[ResourceRef(R.first, R.first)] += CS.size();
+    }
+  }
+}
+
+void ResourceManager::cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed) {
+  for (std::pair<ResourceRef, unsigned> &BR : BusyResources) {
+    if (BR.second)
+      BR.second--;
+    if (!BR.second) {
+      // Release this resource.
+      const ResourceRef &RR = BR.first;
+
+      if (countPopulation(RR.first) == 1)
+        release(RR);
+
+      releaseResource(RR.first);
+      ResourcesFreed.push_back(RR);
+    }
+  }
+
+  for (const ResourceRef &RF : ResourcesFreed)
+    BusyResources.erase(RF);
+}
+
+Instruction *Scheduler::scheduleInstruction(unsigned Idx, Instruction *MCIS) {
+  assert(WaitQueue.find(Idx) == WaitQueue.end());
+  assert(ReadyQueue.find(Idx) == ReadyQueue.end());
+  assert(IssuedQueue.find(Idx) == IssuedQueue.end());
+
+  // Special case where MCIS is a zero-latency instruction. A zero-latency
+  // instruction doesn't consume any scheduler resources. That is because it
+  // doesn't need to be executed. Most of the time, zero-latency instructions
+  // are removed at the register renaming stage. For example, register-register
+  // moves can be removed at the register renaming stage by creating new
+  // aliases. Zero-idiom instructions (for example, a `xor reg, reg`) can also
+  // be eliminated at the register renaming stage, since we know in advance
+  // that they clear their output register.
+  if (MCIS->isZeroLatency()) {
+    MCIS->forceExecuted();
+    notifyInstructionIssued(Idx, {});
+    notifyInstructionExecuted(Idx);
+    return MCIS;
+  }
+
+  // Consume entries in the reservation stations.
+  const InstrDesc &Desc = MCIS->getDesc();
+  Resources->reserveBuffers(Desc);
+
+  // Mark units with BufferSize=0 as reserved.
+  // These resources will only be released after MCIS is issued, and all the
+  // ResourceCycles for those units have been consumed.
+  Resources->reserveDispatchHazardResources(Desc);
+
+  bool MayLoad = Desc.MayLoad;
+  bool MayStore = Desc.MayStore;
+  if (MayLoad || MayStore)
+    LSU->reserve(Idx, MayLoad, MayStore, Desc.HasSideEffects);
+
+  MCIS->dispatch();
+  bool IsReady = MCIS->isReady();
+  if (IsReady && (MayLoad || MayStore))
+    IsReady &= LSU->isReady(Idx);
+
+  if (!IsReady) {
+    DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Wait Queue\n");
+    WaitQueue[Idx] = MCIS;
+    return MCIS;
+  }
+  notifyInstructionReady(Idx);
+
+  // Special case where the instruction is ready, and it uses an in-order
+  // dispatch/issue processor resource. The instruction is issued immediately
+  // to the pipelines. Any other in-order buffered resources (i.e.
+  // BufferSize=1) are consumed.
+  if (Resources->mustIssueImmediately(Desc)) {
+    DEBUG(dbgs() << "[SCHEDULER] Instruction " << Idx
+                 << " issued immediately\n");
+    issueInstruction(MCIS, Idx);
+    return MCIS;
+  }
+
+  DEBUG(dbgs() << "[SCHEDULER] Adding " << Idx << " to the Ready Queue\n");
+  ReadyQueue[Idx] = MCIS;
+  return MCIS;
+}
+
+void Scheduler::cycleEvent(unsigned /* unused */) {
+  SmallVector<ResourceRef, 8> ResourcesFreed;
+  Resources->cycleEvent(ResourcesFreed);
+
+  for (const ResourceRef &RR : ResourcesFreed)
+    notifyResourceAvailable(RR);
+
+  updateIssuedQueue();
+  updatePendingQueue();
+  issue();
+}
+
+#ifndef NDEBUG
+void Scheduler::dump() const {
+  dbgs() << "[SCHEDULER]: WaitQueue size is: " << WaitQueue.size() << '\n';
+  dbgs() << "[SCHEDULER]: ReadyQueue size is: " << ReadyQueue.size() << '\n';
+  dbgs() << "[SCHEDULER]: IssuedQueue size is: " << IssuedQueue.size() << '\n';
+  Resources->dump();
+}
+#endif
+
+Scheduler::Event Scheduler::canBeDispatched(const InstrDesc &Desc) const {
+  if (Desc.MayLoad && LSU->isLQFull())
+    return HWS_LD_QUEUE_UNAVAILABLE;
+  if (Desc.MayStore && LSU->isSQFull())
+    return HWS_ST_QUEUE_UNAVAILABLE;
+
+  Scheduler::Event Event;
+  switch (Resources->canBeDispatched(Desc)) {
+  case ResourceStateEvent::RS_BUFFER_AVAILABLE:
+    Event = HWS_AVAILABLE;
+    break;
+  case ResourceStateEvent::RS_BUFFER_UNAVAILABLE:
+    Event = HWS_QUEUE_UNAVAILABLE;
+    break;
+  case ResourceStateEvent::RS_RESERVED:
+    Event = HWS_DISPATCH_GROUP_RESTRICTION;
+  }
+  return Event;
+}
+
+void Scheduler::issueInstruction(Instruction *IS, unsigned InstrIndex) {
+  // Issue the instruction and collect all the consumed resources
+  // into a vector. That vector is then used to notify the listener.
+  // Most instructions consume very few resources (typically one or
+  // two resources). We use a small vector here, and conservatively
+  // initialize its capacity to 4. This should address the majority of
+  // the cases.
+  SmallVector<std::pair<ResourceRef, unsigned>, 4> UsedResources;
+
+  const InstrDesc &D = IS->getDesc();
+  Resources->issueInstruction(InstrIndex, D, UsedResources);
+  // Notify the instruction that it started executing.
+  // This updates the internal state of each write.
+  IS->execute();
+
+  if (D.MaxLatency) {
+    IssuedQueue[InstrIndex] = IS;
+    notifyInstructionIssued(InstrIndex, UsedResources);
+  } else {
+    // A zero latency instruction which reads and/or updates registers.
+    notifyInstructionIssued(InstrIndex, UsedResources);
+    notifyInstructionExecuted(InstrIndex);
+  }
+}
+
+void Scheduler::issue() {
+  std::vector<unsigned> ToRemove;
+  for (const QueueEntryTy QueueEntry : ReadyQueue) {
+    // Give priority to older instructions in ReadyQueue.
+    // The ready queue is ordered by key, and therefore older instructions are
+    // visited first.
+    Instruction *IS = QueueEntry.second;
+    const InstrDesc &D = IS->getDesc();
+    if (!Resources->canBeIssued(D))
+      continue;
+    unsigned InstrIndex = QueueEntry.first;
+    issueInstruction(IS, InstrIndex);
+    ToRemove.emplace_back(InstrIndex);
+  }
+
+  for (const unsigned InstrIndex : ToRemove)
+    ReadyQueue.erase(InstrIndex);
+}
+
+void Scheduler::updatePendingQueue() {
+  // Scan the set of waiting instructions and promote them to the
+  // ready queue if operands are all ready.
+  for (auto I = WaitQueue.begin(), E = WaitQueue.end(); I != E;) {
+    const QueueEntryTy Entry = *I;
+    Entry.second->cycleEvent();
+
+    const InstrDesc &Desc = Entry.second->getDesc();
+    bool IsMemOp = Desc.MayLoad || Desc.MayStore;
+    bool IsReady = Entry.second->isReady();
+    if (IsReady && IsMemOp)
+      IsReady &= LSU->isReady(Entry.first);
+
+    if (IsReady) {
+      notifyInstructionReady(Entry.first);
+      ReadyQueue[Entry.first] = Entry.second;
+      auto ToRemove = I;
+      ++I;
+      WaitQueue.erase(ToRemove);
+    } else {
+      ++I;
+    }
+  }
+}
+
+void Scheduler::updateIssuedQueue() {
+  for (auto I = IssuedQueue.begin(), E = IssuedQueue.end(); I != E;) {
+    const QueueEntryTy Entry = *I;
+    Entry.second->cycleEvent();
+    if (Entry.second->isExecuted()) {
+      notifyInstructionExecuted(Entry.first);
+      auto ToRemove = I;
+      ++I;
+      IssuedQueue.erase(ToRemove);
+    } else {
+      DEBUG(dbgs() << "[SCHEDULER]: Instruction " << Entry.first
+                   << " is still executing.\n");
+      ++I;
+    }
+  }
+}
+
+void Scheduler::notifyInstructionIssued(
+    unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
+  Owner->notifyInstructionIssued(Index, Used);
+}
+
+void Scheduler::notifyInstructionExecuted(unsigned Index) {
+  LSU->onInstructionExecuted(Index);
+  Owner->notifyInstructionExecuted(Index);
+}
+
+void Scheduler::notifyInstructionReady(unsigned Index) {
+  Owner->notifyInstructionReady(Index);
+}
+
+void Scheduler::notifyResourceAvailable(const ResourceRef &RR) {
+  Owner->notifyResourceAvailable(RR);
+}
+} // namespace mca
Index: llvm/trunk/tools/llvm-mca/SourceMgr.h
===================================================================
--- llvm/trunk/tools/llvm-mca/SourceMgr.h
+++ llvm/trunk/tools/llvm-mca/SourceMgr.h
@@ -0,0 +1,65 @@
+//===--------------------- SourceMgr.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements class SourceMgr. Class SourceMgr abstracts the input
+/// code sequence (a sequence of MCInst), and assigns unique identifiers to
+/// every instruction in the sequence.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+#define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
+
+#include "llvm/MC/MCInst.h"
+#include <vector>
+
+namespace mca {
+
+typedef std::pair<unsigned, const llvm::MCInst *> InstRef;
+
+class SourceMgr {
+  using InstVec = std::vector<std::unique_ptr<const llvm::MCInst>>;
+  InstVec Sequence;
+  unsigned Current;
+  unsigned Iterations;
+  static const unsigned DefaultIterations = 70;
+
+public:
+  SourceMgr(unsigned NumIterations)
+      : Current(0),
+        Iterations(NumIterations ?
+                   NumIterations : DefaultIterations) {}
+
+  unsigned getCurrentIteration() const { return Current / Sequence.size(); }
+  unsigned getNumIterations() const { return Iterations; }
+  unsigned size() const { return Sequence.size(); }
+  const InstVec &getSequence() const { return Sequence; }
+  InstVec &getSequence() { return Sequence; }
+
+  bool hasNext() { return Current < (Iterations * size()); }
+  void updateNext() { Current++; }
+
+  const InstRef peekNext() const {
+    unsigned Index = getCurrentInstructionIndex();
+    return InstRef(Current, Sequence[Index].get());
+  }
+
+  unsigned getCurrentInstructionIndex() const {
+    return Current % Sequence.size();
+  }
+
+  const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
+    return *Sequence[Index % size()];
+  }
+
+  bool isEmpty() const { return size() == 0; }
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/TimelineView.h
===================================================================
--- llvm/trunk/tools/llvm-mca/TimelineView.h
+++ llvm/trunk/tools/llvm-mca/TimelineView.h
@@ -0,0 +1,181 @@
+//===--------------------- TimelineView.h ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \brief
+///
+/// This file implements a timeline view for the llvm-mca tool.
+///
+/// Class TimelineView observes events generated by the backend. For every
+/// instruction executed by the backend, it stores information related to
+/// state transitions. It then plots that information in the form of a table
+/// as shown in the example below:
+///
+/// Timeline view:
+///                 0123456
+/// Index 0123456789
+///
+/// [0,0] DeER .    .    ..   vmovshdup %xmm0, %xmm1
+/// [0,1] DeER .    .    ..   vpermilpd $1, %xmm0, %xmm2
+/// [0,2] .DeER.    .    ..   vpermilps $231, %xmm0, %xmm5
+/// [0,3] .DeeeER   .    ..   vaddss %xmm1, %xmm0, %xmm3
+/// [0,4] . D==eeeER.    ..   vaddss %xmm3, %xmm2, %xmm4
+/// [0,5] . D=====eeeER  ..   vaddss %xmm4, %xmm5, %xmm6
+///
+/// [1,0] .  DeE------R  ..   vmovshdup %xmm0, %xmm1
+/// [1,1] .  DeE------R  ..   vpermilpd $1, %xmm0, %xmm2
+/// [1,2] .   DeE-----R  ..   vpermilps $231, %xmm0, %xmm5
+/// [1,3] .   D=eeeE--R  ..   vaddss %xmm1, %xmm0, %xmm3
+/// [1,4] .    D===eeeER ..   vaddss %xmm3, %xmm2, %xmm4
+/// [1,5] .    D======eeeER   vaddss %xmm4, %xmm5, %xmm6
+///
+/// There is an entry for every instruction in the input assembly sequence.
+/// The first field is a pair of numbers obtained from the instruction index.
+/// The first element of the pair is the iteration index, while the second
+/// element of the pair is a sequence number (i.e. a position in the assembly
+/// sequence).
+/// The second field of the table is the actual timeline information; each
+/// column is the information related to a specific cycle of execution.
+/// The timeline of an instruction is described by a sequence of characters,
+/// where each character represents the instruction state at a specific cycle.
+///
+/// Possible instruction states are:
+///  D: Instruction Dispatched
+///  e: Instruction Executing
+///  E: Instruction Executed (write-back stage)
+///  R: Instruction retired
+///  =: Instruction waiting in the Scheduler's queue
+///  -: Instruction executed, waiting to retire in order.
+///
+/// Dots ('.') and empty spaces are cycles where the instruction is not
+/// in-flight.
+///
+/// The last column is the assembly instruction associated to the entry.
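+///
+/// For example, the row for instruction [1,3] above reads as follows: the
+/// instruction was dispatched (D), waited one cycle in the scheduler's queue
+/// (=), executed for three cycles (eee), reached the write-back stage (E),
+/// waited two more cycles to retire in program order (--), and was finally
+/// retired (R).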
+///
+/// Based on the timeline view information from the example, instruction 0
+/// at iteration 0 was dispatched at cycle 0, and was retired at cycle 3.
+/// Instruction [0,1] was also dispatched at cycle 0, and it retired at
+/// the same cycle as instruction [0,0].
+/// Instruction [0,4] was dispatched at cycle 2. However, it had to
+/// wait for two cycles before being issued. That is because operands
+/// became ready only at cycle 5.
+///
+/// This view helps with understanding bottlenecks and the impact of
+/// resource pressure on the code.
+///
+/// To better understand why instructions had to wait for multiple cycles in
+/// the scheduler's queue, class TimelineView also reports extra timing info
+/// in another table named "Average Wait times" (see example below).
+///
+///
+/// Average Wait times (based on the timeline view):
+/// [0]: Executions
+/// [1]: Average time spent waiting in a scheduler's queue
+/// [2]: Average time spent waiting in a scheduler's queue while ready
+/// [3]: Average time elapsed from WB until retire stage
+///
+///       [0]    [1]    [2]    [3]
+/// 0.     2     1.0    1.0    3.0   vmovshdup %xmm0, %xmm1
+/// 1.     2     1.0    1.0    3.0   vpermilpd $1, %xmm0, %xmm2
+/// 2.     2     1.0    1.0    2.5   vpermilps $231, %xmm0, %xmm5
+/// 3.     2     1.5    0.5    1.0   vaddss %xmm1, %xmm0, %xmm3
+/// 4.     2     3.5    0.0    0.0   vaddss %xmm3, %xmm2, %xmm4
+/// 5.     2     6.5    0.0    0.0   vaddss %xmm4, %xmm5, %xmm6
+///
+/// By comparing column [2] with column [1], we get an idea about how many
+/// cycles were spent in the scheduler's queue due to data dependencies.
+///
+/// In this example, instruction 5 spent an average of ~6 cycles in the
+/// scheduler's queue. As soon as operands became ready, the instruction
+/// was immediately issued to the pipeline(s).
+/// That is expected because instruction 5 cannot transition to the "ready"
+/// state until %xmm4 is written by instruction 4.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+#define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
+
+#include "HWEventListener.h"
+#include "SourceMgr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+namespace mca {
+
+/// \brief This class listens to instruction state transition events
+/// in order to construct the timeline information.
+///
+/// For every instruction executed by the Backend, this class constructs
+/// a TimelineViewEntry object. TimelineViewEntry objects are then used
+/// to print the timeline information, as well as the "average wait times"
+/// for every instruction in the input assembly sequence.
+class TimelineView : public HWEventListener {
+  const llvm::MCSubtargetInfo &STI;
+  llvm::MCInstPrinter &MCIP;
+  const SourceMgr &AsmSequence;
+
+  unsigned CurrentCycle;
+  unsigned MaxCycle;
+  unsigned LastCycle;
+
+  struct TimelineViewEntry {
+    unsigned CycleDispatched;
+    unsigned CycleReady;
+    unsigned CycleIssued;
+    unsigned CycleExecuted;
+    unsigned CycleRetired;
+  };
+  std::vector<TimelineViewEntry> Timeline;
+
+  struct WaitTimeEntry {
+    unsigned Executions;
+    unsigned CyclesSpentInSchedulerQueue;
+    unsigned CyclesSpentInSQWhileReady;
+    unsigned CyclesSpentAfterWBAndBeforeRetire;
+  };
+  std::vector<WaitTimeEntry> WaitTime;
+
+  void printTimelineViewEntry(llvm::raw_string_ostream &OS,
+                              const TimelineViewEntry &E, unsigned Iteration,
+                              unsigned SourceIndex) const;
+  void printWaitTimeEntry(llvm::raw_string_ostream &OS, const WaitTimeEntry &E,
+                          unsigned Index) const;
+
+  const unsigned DEFAULT_ITERATIONS = 10;
+
+public:
+  TimelineView(const llvm::MCSubtargetInfo &sti, llvm::MCInstPrinter &Printer,
+               const SourceMgr &Sequence, unsigned MaxIterations,
+               unsigned Cycles)
+      : STI(sti), MCIP(Printer), AsmSequence(Sequence), CurrentCycle(0),
+        MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0) {
+    initialize(MaxIterations);
+  }
+
+  void initialize(unsigned MaxIterations);
+
+  // Event handlers.
+  void onInstructionDispatched(unsigned Index) override;
+  void onInstructionReady(unsigned Index) override;
+  void onInstructionIssued(
+      unsigned Index,
+      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used) override;
+  void onInstructionExecuted(unsigned Index) override;
+  void onInstructionRetired(unsigned Index) override;
+  void onCycleBegin(unsigned Cycle) override { CurrentCycle = Cycle; }
+
+  // Print functionalities.
+  void printTimeline(llvm::raw_ostream &OS) const;
+  void printAverageWaitTimes(llvm::raw_ostream &OS) const;
+};
+
+} // namespace mca
+
+#endif
Index: llvm/trunk/tools/llvm-mca/TimelineView.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/TimelineView.cpp
+++ llvm/trunk/tools/llvm-mca/TimelineView.cpp
@@ -0,0 +1,246 @@
+//===--------------------- TimelineView.cpp ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \brief
+///
+/// This file implements the TimelineView interface.
+///
+//===----------------------------------------------------------------------===//
+
+#include "TimelineView.h"
+
+using namespace llvm;
+
+namespace mca {
+
+void TimelineView::initialize(unsigned MaxIterations) {
+  unsigned NumInstructions =
+      AsmSequence.getNumIterations() * AsmSequence.size();
+  if (!MaxIterations)
+    MaxIterations = DEFAULT_ITERATIONS;
+  unsigned NumEntries =
+      std::min(NumInstructions, MaxIterations * AsmSequence.size());
+  Timeline.resize(NumEntries);
+  TimelineViewEntry NullTVEntry = {0, 0, 0, 0, 0};
+  std::fill(Timeline.begin(), Timeline.end(), NullTVEntry);
+
+  WaitTime.resize(AsmSequence.size());
+  WaitTimeEntry NullWTEntry = {0, 0, 0, 0};
+  std::fill(WaitTime.begin(), WaitTime.end(), NullWTEntry);
+}
+
+void TimelineView::onInstructionDispatched(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleDispatched = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionReady(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleReady = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionIssued(
+    unsigned Index,
+    const ArrayRef<std::pair<ResourceRef, unsigned>> & /* Unused */) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleIssued = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionExecuted(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  Timeline[Index].CycleExecuted = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+}
+
+void TimelineView::onInstructionRetired(unsigned Index) {
+  if (CurrentCycle >= MaxCycle || Index >= Timeline.size())
+    return;
+  TimelineViewEntry &TVEntry = Timeline[Index];
+  TVEntry.CycleRetired = CurrentCycle;
+  LastCycle = std::max(LastCycle, CurrentCycle);
+
+  // Update the WaitTime entry which corresponds to this Index.
+ + WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()]; + WTEntry.Executions++; + WTEntry.CyclesSpentInSchedulerQueue += + TVEntry.CycleIssued - TVEntry.CycleDispatched; + assert(TVEntry.CycleDispatched <= TVEntry.CycleReady); + WTEntry.CyclesSpentInSQWhileReady += TVEntry.CycleIssued - TVEntry.CycleReady; + WTEntry.CyclesSpentAfterWBAndBeforeRetire += + (TVEntry.CycleRetired - 1) - TVEntry.CycleExecuted; +} + +void TimelineView::printWaitTimeEntry(raw_string_ostream &OS, + const WaitTimeEntry &Entry, + unsigned SourceIndex) const { + OS << SourceIndex << '.'; + if (SourceIndex < 10) + OS << " "; + else if (SourceIndex < 100) + OS << " "; + else if (SourceIndex < 1000) + OS << " "; + else + OS << ' '; + + if (Entry.Executions == 0) { + OS << " - - - - "; + } else { + double AverageTime1, AverageTime2, AverageTime3; + unsigned Executions = Entry.Executions; + AverageTime1 = (double)Entry.CyclesSpentInSchedulerQueue / Executions; + AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / Executions; + AverageTime3 = (double)Entry.CyclesSpentAfterWBAndBeforeRetire / Executions; + if (Executions < 10) + OS << ' ' << Executions << " "; + else if (Executions < 100) + OS << ' ' << Executions << " "; + else + OS << Executions << " "; + + OS << format("%.1f", AverageTime1); + if (AverageTime1 < 10.0) + OS << " "; + else if (AverageTime1 < 100.0) + OS << " "; + else + OS << " "; + + OS << format("%.1f", AverageTime2); + if (AverageTime2 < 10.0) + OS << " "; + else if (AverageTime2 < 100.0) + OS << " "; + else + OS << " "; + + OS << format("%.1f", AverageTime3); + if (AverageTime3 < 10.0) + OS << " "; + else if (AverageTime3 < 100.0) + OS << ' '; + } +} + +void TimelineView::printAverageWaitTimes(raw_ostream &OS) const { + if (WaitTime.empty()) + return; + + std::string Buffer; + raw_string_ostream TempStream(Buffer); + + TempStream + << "\n\nAverage Wait times (based on the timeline view):\n" + << "[0]: Executions\n" + << "[1]: Average time spent waiting in a scheduler's queue\n" + << "[2]: Average time spent waiting in a scheduler's queue while ready\n" + << "[3]: Average time elapsed from WB until retire stage\n\n"; + TempStream << " [0] [1] [2] [3]\n"; + + for (unsigned I = 0, E = WaitTime.size(); I < E; ++I) { + printWaitTimeEntry(TempStream, WaitTime[I], I); + // Append the instruction info at the end of the line. + const MCInst &Inst = AsmSequence.getMCInstFromIndex(I); + MCIP.printInst(&Inst, TempStream, "", STI); + TempStream << '\n'; + TempStream.flush(); + OS << Buffer; + Buffer = ""; + } +} + +void TimelineView::printTimelineViewEntry(raw_string_ostream &OS, + const TimelineViewEntry &Entry, + unsigned Iteration, + unsigned SourceIndex) const { + if (SourceIndex == 0) + OS << '\n'; + OS << '[' << Iteration << ',' << SourceIndex << "]\t"; + for (unsigned I = 0, E = Entry.CycleDispatched; I < E; ++I) + OS << ((I % 5 == 0) ? '.' : ' '); + OS << 'D'; + if (Entry.CycleDispatched != Entry.CycleExecuted) { + // Zero latency instructions have the same value for CycleDispatched, + // CycleIssued and CycleExecuted. + for (unsigned I = Entry.CycleDispatched + 1, E = Entry.CycleIssued; I < E; ++I) + OS << '='; + if (Entry.CycleIssued == Entry.CycleExecuted) + OS << 'E'; + else { + if (Entry.CycleDispatched != Entry.CycleIssued) + OS << 'e'; + for (unsigned I = Entry.CycleIssued + 1, E = Entry.CycleExecuted; I < E; ++I) + OS << 'e'; + OS << 'E'; + } + } + + for (unsigned I = Entry.CycleExecuted + 1, E = Entry.CycleRetired; I < E; ++I) + OS << '-'; + OS << 'R'; + + // Skip other columns. 
+  for (unsigned I = Entry.CycleRetired + 1, E = LastCycle; I <= E; ++I)
+    OS << ((I % 5 == 0 || I == LastCycle) ? '.' : ' ');
+}
+
+static void printTimelineHeader(raw_string_ostream &OS, unsigned Cycles) {
+  OS << "\n\nTimeline view:\n";
+  OS << "     \t";
+  for (unsigned I = 0; I <= Cycles; ++I) {
+    if (((I / 10) & 1) == 0)
+      OS << ' ';
+    else
+      OS << I % 10;
+  }
+
+  OS << "\nIndex\t";
+  for (unsigned I = 0; I <= Cycles; ++I) {
+    if (((I / 10) & 1) == 0)
+      OS << I % 10;
+    else
+      OS << ' ';
+  }
+  OS << '\n';
+}
+
+void TimelineView::printTimeline(raw_ostream &OS) const {
+  std::string Buffer;
+  raw_string_ostream TempStream(Buffer);
+
+  printTimelineHeader(TempStream, LastCycle);
+  TempStream.flush();
+  OS << Buffer;
+
+  for (unsigned I = 0, E = Timeline.size(); I < E; ++I) {
+    Buffer = "";
+    const TimelineViewEntry &Entry = Timeline[I];
+    if (Entry.CycleRetired == 0)
+      return;
+
+    unsigned Iteration = I / AsmSequence.size();
+    unsigned SourceIndex = I % AsmSequence.size();
+    printTimelineViewEntry(TempStream, Entry, Iteration, SourceIndex);
+    // Append the instruction info at the end of the line.
+    const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
+    MCIP.printInst(&Inst, TempStream, "", STI);
+    TempStream << '\n';
+    TempStream.flush();
+    OS << Buffer;
+  }
+}
+
+} // namespace mca
Index: llvm/trunk/tools/llvm-mca/llvm-mca.cpp
===================================================================
--- llvm/trunk/tools/llvm-mca/llvm-mca.cpp
+++ llvm/trunk/tools/llvm-mca/llvm-mca.cpp
@@ -0,0 +1,307 @@
+//===-- llvm-mca.cpp - Machine Code Analyzer -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This utility is a simple driver that allows static performance analysis on
+// machine code similarly to how IACA (Intel Architecture Code Analyzer) works.
+//
+//   llvm-mca [options] <file-name>
+//      -march <type>
+//      -mcpu <cpu>
+//      -o <file>
+//
+// The target defaults to the host target.
+// The cpu defaults to 'generic'.
+// The output defaults to standard output.
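+//
+// For example, a typical invocation (where foo.s is a placeholder input file
+// targeting a cpu with an out-of-order backend) could be:
+//
+//   llvm-mca -mcpu=btver2 -iterations=100 -timeline foo.s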
+//
+//===----------------------------------------------------------------------===//
+
+#include "BackendPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+using namespace llvm;
+
+static cl::opt<std::string>
+    InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
+                                           cl::init("-"),
+                                           cl::value_desc("filename"));
+
+static cl::opt<std::string>
+    ArchName("march", cl::desc("Target arch to assemble for, "
+                               "see -version for available targets"));
+
+static cl::opt<std::string>
+    TripleName("mtriple", cl::desc("Target triple to assemble for, "
+                                   "see -version for available targets"));
+
+static cl::opt<std::string>
+    MCPU("mcpu",
+         cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+         cl::value_desc("cpu-name"), cl::init("generic"));
+
+static cl::opt<unsigned>
+    OutputAsmVariant("output-asm-variant",
+                     cl::desc("Syntax variant to use for output printing"));
+
+static cl::opt<unsigned> Iterations("iterations",
+                                    cl::desc("Number of iterations to run"),
+                                    cl::init(0));
+
+static cl::opt<unsigned> DispatchWidth(
+    "dispatch",
+    cl::desc("Dispatch Width. By default it is set equal to IssueWidth"),
+    cl::init(0));
+
+static cl::opt<unsigned> MaxRetirePerCycle(
+    "max-retire-per-cycle",
+    cl::desc("Maximum number of instructions that can be retired in one cycle"),
+    cl::init(0));
+
+static cl::opt<unsigned>
+    RegisterFileSize("register-file-size",
+                     cl::desc("Maximum number of temporary registers which can "
+                              "be used for register mappings"),
+                     cl::init(0));
+
+static cl::opt<bool> PrintTimelineView("timeline",
+                                       cl::desc("Print the timeline view"),
+                                       cl::init(false));
+
+static cl::opt<unsigned> TimelineMaxIterations(
+    "timeline-max-iterations",
+    cl::desc("Maximum number of iterations to print in timeline view"),
+    cl::init(0));
+
+static cl::opt<unsigned> TimelineMaxCycles(
+    "timeline-max-cycles",
+    cl::desc(
+        "Maximum number of cycles in the timeline view. Defaults to 80 cycles"),
+    cl::init(80));
+
+static cl::opt<bool> PrintModeVerbose("verbose",
+                                      cl::desc("Enable verbose output"),
+                                      cl::init(false));
+
+static cl::opt<bool>
+    AssumeNoAlias("noalias",
+                  cl::desc("If set, it assumes that loads and stores do not alias"),
+                  cl::init(true));
+
+static cl::opt<unsigned>
+    LoadQueueSize("lqueue", cl::desc("Size of the load queue"), cl::init(0));
+static cl::opt<unsigned>
+    StoreQueueSize("squeue", cl::desc("Size of the store queue"), cl::init(0));
+
+static const Target *getTarget(const char *ProgName) {
+  TripleName = Triple::normalize(TripleName);
+  if (TripleName.empty())
+    TripleName = Triple::normalize(sys::getDefaultTargetTriple());
+  Triple TheTriple(TripleName);
+
+  // Get the target specific parser.
+  std::string Error;
+  const Target *TheTarget =
+      TargetRegistry::lookupTarget(ArchName, TheTriple, Error);
+  if (!TheTarget) {
+    errs() << ProgName << ": " << Error;
+    return nullptr;
+  }
+
+  // Return the found target.
+  return TheTarget;
+}
+
+static int AssembleInput(const char *ProgName, const Target *TheTarget,
+                         SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str,
+                         MCAsmInfo &MAI, MCSubtargetInfo &STI,
+                         MCInstrInfo &MCII, MCTargetOptions &MCOptions) {
+  std::unique_ptr<MCAsmParser> Parser(
+      createMCAsmParser(SrcMgr, Ctx, Str, MAI));
+  std::unique_ptr<MCTargetAsmParser> TAP(
+      TheTarget->createMCAsmParser(STI, *Parser, MCII, MCOptions));
+
+  if (!TAP) {
+    errs() << ProgName
+           << ": error: this target does not support assembly parsing.\n";
+    return 1;
+  }
+
+  Parser->setTargetParser(*TAP);
+  return Parser->Run(false);
+}
+
+namespace {
+
+class MCStreamerWrapper final : public MCStreamer {
+  using InstVec = std::vector<std::unique_ptr<const MCInst>>;
+  InstVec &Insts;
+
+public:
+  MCStreamerWrapper(MCContext &Context, InstVec &Vec)
+      : MCStreamer(Context), Insts(Vec) {}
+
+  // We only want to intercept the emission of new instructions.
+  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                               bool /* unused */) override {
+    Insts.emplace_back(new MCInst(Inst));
+  }
+
+  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
+    return true;
+  }
+
+  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                        unsigned ByteAlignment) override {}
+  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+                    uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
+  void EmitGPRel32Value(const MCExpr *Value) override {}
+  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+  void EmitCOFFSymbolType(int Type) override {}
+  void EndCOFFSymbolDef() override {}
+
+  const InstVec &GetInstructionSequence() const { return Insts; }
+};
+
+} // end of anonymous namespace
+
+int main(int argc, char **argv) {
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  PrettyStackTraceProgram X(argc, argv);
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  // Initialize targets and assembly parsers.
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+
+  // Enable printing of available targets when flag --version is specified.
+  cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
+
+  // Parse flags and initialize target options.
+  cl::ParseCommandLineOptions(argc, argv,
+                              "llvm machine code performance analyzer.\n");
+  MCTargetOptions MCOptions;
+  MCOptions.PreserveAsmComments = false;
+
+  // Get the target from the triple. If a triple is not specified, then select
+  // the default triple for the host. If the triple doesn't correspond to any
+  // registered target, then exit with an error message.
+  const char *ProgName = argv[0];
+  const Target *TheTarget = getTarget(ProgName);
+  if (!TheTarget)
+    return 1;
+
+  // getTarget() may replace TripleName with a default triple.
+  // For safety, reconstruct the Triple object.
+  Triple TheTriple(TripleName);
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferPtr =
+      MemoryBuffer::getFileOrSTDIN(InputFilename);
+  if (std::error_code EC = BufferPtr.getError()) {
+    errs() << InputFilename << ": " << EC.message() << '\n';
+    return 1;
+  }
+
+  SourceMgr SrcMgr;
+
+  // Tell SrcMgr about this buffer, which is what the parser will pick up.
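+  // (AddNewSourceBuffer takes ownership of the buffer.)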
+  SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc());
+
+  std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+  assert(MRI && "Unable to create target register info!");
+
+  std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+  assert(MAI && "Unable to create target asm info!");
+
+  MCObjectFileInfo MOFI;
+  MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr);
+  MOFI.InitMCObjectFileInfo(TheTriple, /* PIC= */ false, Ctx);
+
+  std::unique_ptr<buffer_ostream> BOS;
+
+  std::unique_ptr<mca::SourceMgr> S =
+      llvm::make_unique<mca::SourceMgr>(Iterations);
+  MCStreamerWrapper Str(Ctx, S->getSequence());
+
+  std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
+  std::unique_ptr<MCSubtargetInfo> STI(
+      TheTarget->createMCSubtargetInfo(TripleName, MCPU, /* FeaturesStr */ ""));
+  if (!STI->isCPUStringValid(MCPU))
+    return 1;
+
+  if (!STI->getSchedModel().isOutOfOrder()) {
+    errs() << "error: please specify an out-of-order cpu. '" << MCPU
+           << "' is an in-order cpu.\n";
+    return 1;
+  }
+
+  if (!STI->getSchedModel().hasInstrSchedModel()) {
+    errs()
+        << "error: unable to find instruction-level scheduling information for"
+        << " target triple '" << TheTriple.normalize() << "' and cpu '" << MCPU
+        << "'.\n";
+
+    if (STI->getSchedModel().InstrItineraries)
+      errs() << "note: cpu '" << MCPU << "' provides itineraries. However, "
+             << "instruction itineraries are currently unsupported.\n";
+    return 1;
+  }
+
+  std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
+      Triple(TripleName), OutputAsmVariant, *MAI, *MCII, *MRI));
+  if (!IP) {
+    errs() << "error: unable to create instruction printer for target triple '"
+           << TheTriple.normalize() << "' with assembly variant "
+           << OutputAsmVariant << ".\n";
+    return 1;
+  }
+
+  int Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, Str, *MAI, *STI,
+                          *MCII, MCOptions);
+  if (Res)
+    return Res;
+
+  if (S->isEmpty()) {
+    errs() << "error: no assembly instructions found.\n";
+    return 1;
+  }
+
+  const MCSchedModel &SM = STI->getSchedModel();
+
+  unsigned Width = SM.IssueWidth;
+  if (DispatchWidth)
+    Width = DispatchWidth;
+
+  std::unique_ptr<mca::Backend> B = llvm::make_unique<mca::Backend>(
+      *STI, *MCII, *MRI, std::move(S), Width, RegisterFileSize,
+      MaxRetirePerCycle, LoadQueueSize, StoreQueueSize, AssumeNoAlias);
+
+  std::unique_ptr<mca::BackendPrinter> Printer =
+      llvm::make_unique<mca::BackendPrinter>(*B, OutputFilename, std::move(IP),
+                                             PrintModeVerbose);
+  Printer->addResourcePressureView();
+  if (PrintTimelineView)
+    Printer->addTimelineView(TimelineMaxIterations, TimelineMaxCycles);
+
+  B->run();
+  Printer->printReport();
+  return 0;
+}