Index: lib/Target/X86/X86ScheduleZnver1.td
===================================================================
--- lib/Target/X86/X86ScheduleZnver1.td
+++ lib/Target/X86/X86ScheduleZnver1.td
@@ -55,7 +55,6 @@
 def ZnFPU3 : ProcResource<1>;
 
 // FPU grouping
-def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>;
 def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
 def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
 def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
@@ -91,6 +90,30 @@
 // 4 Cycles load-to use Latency is captured
 def : ReadAdvance<ReadAfterLd, 4>;
 
+// The Integer PRF for Zen is 168 entries, and it holds the architectural and
+// speculative version of the 64-bit integer registers.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnIntegerPRF : RegisterFile<168, [GR8, GR16, GR32, GR64]>;
+
+// 36 Entry (9x4 entries) floating-point Scheduler
+def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]> {
+  let BufferSize=36;
+}
+
+// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnFpuPRF: RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// The unit can track up to 192 macro ops in-flight.
+// The retire unit handles in-order commit of up to 8 macro ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+// However, the retire unit is shared between integer and FP ops. In SMT mode
+// it has 96 entries per thread, so we use that as a conservative value.
+def ZnRCU : RetireControlUnit<96, 8>;
+
+// FIXME: there are 72 read buffers and 44 write buffers.
+
 // (a folded load is an instruction that loads and does some operation)
 // Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops
 // Instructions with folded loads are usually micro-fused, so they only appear
Index: test/tools/llvm-mca/X86/register-file-statistics.s
===================================================================
--- /dev/null
+++ test/tools/llvm-mca/X86/register-file-statistics.s
@@ -0,0 +1,56 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -register-file-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -register-file-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -register-file-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -register-file-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SANDYBRID
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -register-file-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKLCLI
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -iterations=1 -register-file-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKLSER
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -register-file-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
+
+xor %eax, %eax
+
+# ALL: Iterations: 1
+# ALL-NEXT: Instructions: 1
+# ALL-NEXT: Total Cycles: 4
+
+# BDWELL-NEXT: Dispatch Width: 4
+# BTVER2-NEXT: Dispatch Width: 2
+# HASWELL-NEXT: Dispatch Width: 4
+# SANDYBRID-NEXT: Dispatch Width: 4
+# SKLCLI-NEXT: Dispatch Width: 6
+# SKLSER-NEXT: Dispatch Width: 6
+# ZNVER1-NEXT: Dispatch Width: 4
+
+# ALL-NEXT: IPC: 0.25
+
+# BDWELL-NEXT: Block RThroughput: 0.3
+# BTVER2-NEXT: Block RThroughput: 0.5
+# HASWELL-NEXT: Block RThroughput: 0.3
+# SANDYBRID-NEXT: Block RThroughput: 0.3
+# SKLCLI-NEXT: Block RThroughput: 0.3
+# SKLSER-NEXT: Block RThroughput: 0.3
+# ZNVER1-NEXT: Block RThroughput: 0.3
+
+# ALL: Register File statistics:
+# ALL-NEXT: Total number of mappings created: 2
+# ALL-NEXT: Max number of mappings used: 2
+
+# BTVER2: * Register File #1 -- JFpuPRF:
+# BTVER2-NEXT: Number of physical registers: 72
+# BTVER2-NEXT: Total number of mappings created: 0
+# BTVER2-NEXT: Max number of mappings used: 0
+
+# ZNVER1: * Register File #1 -- ZnFpuPRF:
+# ZNVER1-NEXT: Number of physical registers: 160
+# ZNVER1-NEXT: Total number of mappings created: 0
+# ZNVER1-NEXT: Max number of mappings used: 0
+
+# BTVER2: * Register File #2 -- JIntegerPRF:
+# BTVER2-NEXT: Number of physical registers: 64
+# BTVER2-NEXT: Total number of mappings created: 2
+# BTVER2-NEXT: Max number of mappings used: 2
+
+# ZNVER1: * Register File #2 -- ZnIntegerPRF:
+# ZNVER1-NEXT: Number of physical registers: 168
+# ZNVER1-NEXT: Total number of mappings created: 1
+# ZNVER1-NEXT: Max number of mappings used: 1
Index: test/tools/llvm-mca/X86/scheduler-queue-usage.s
===================================================================
--- /dev/null
+++ test/tools/llvm-mca/X86/scheduler-queue-usage.s
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -scheduler-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -scheduler-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -scheduler-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -scheduler-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SANDYBRID
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -scheduler-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKLCLI
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -iterations=1 -scheduler-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKLSER
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -scheduler-stats -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
+
+xor %eax, %eax
+
+# ALL: Iterations: 1
+# ALL-NEXT: Instructions: 1
+# ALL-NEXT: Total Cycles: 4
+
+# BDWELL-NEXT: Dispatch Width: 4
+# BTVER2-NEXT: Dispatch Width: 2
+# HASWELL-NEXT: Dispatch Width: 4
+# SANDYBRID-NEXT: Dispatch Width: 4
+# SKLCLI-NEXT: Dispatch Width: 6
+# SKLSER-NEXT: Dispatch Width: 6
+# ZNVER1-NEXT: Dispatch Width: 4
+
+# ALL-NEXT: IPC: 0.25
+
+# BDWELL-NEXT: Block RThroughput: 0.3
+# BTVER2-NEXT: Block RThroughput: 0.5
+# HASWELL-NEXT: Block RThroughput: 0.3
+# SANDYBRID-NEXT: Block RThroughput: 0.3
+# SKLCLI-NEXT: Block RThroughput: 0.3
+# SKLSER-NEXT: Block RThroughput: 0.3
+# ZNVER1-NEXT: Block RThroughput: 0.3
+
+# ALL: Schedulers - number of cycles where we saw N instructions issued:
+# ALL-NEXT: [# issued], [# cycles]
+# ALL-NEXT: 0, 3 (75.0%)
+# ALL-NEXT: 1, 1 (25.0%)
+
+# BDWELL: Scheduler's queue usage:
+# BDWELL-NEXT: BWPortAny, 1/60
+
+# HASWELL: Scheduler's queue usage:
+# HASWELL-NEXT: HWPortAny, 1/60
+
+# BTVER2: Scheduler's queue usage:
+# BTVER2-NEXT: JALU01, 1/20
+# BTVER2-NEXT: JFPU01, 0/18
+# BTVER2-NEXT: JLSAGU, 0/12
+
+# SANDYBRID: Scheduler's queue usage:
+# SANDYBRID-NEXT: SBPortAny, 1/54
+
+# SKLCLI: Scheduler's queue usage:
+# SKLCLI-NEXT: SKLPortAny, 1/60
+
+# SKLSER: Scheduler's queue usage:
+# SKLSER-NEXT: SKXPortAny, 1/60
+
+# ZNVER1: Scheduler's queue usage:
+# ZNVER1-NEXT: ZnAGU, 0/28
+# ZNVER1-NEXT: ZnALU, 1/56
+# ZNVER1-NEXT: ZnFPU, 0/36