Index: docs/CommandGuide/llvm-exegesis.rst =================================================================== --- docs/CommandGuide/llvm-exegesis.rst +++ docs/CommandGuide/llvm-exegesis.rst @@ -214,10 +214,17 @@ If non-empty, write inconsistencies found during analysis to this file. `-` prints to stdout. By default, this analysis is not run. +.. option:: -analysis-clustering=[dbscan,naive] + + Specify the clustering algorithm to use. By default DBSCAN will be used. + Naive clustering algorithm is better for doing further work on the + `-analysis-inconsistencies-output-file=` output, it will create one cluster + per opcode, and check that the cluster is stable (all points are neighbours). + .. option:: -analysis-numpoints= Specify the numPoints parameters to be used for DBSCAN clustering - (`analysis` mode). + (`analysis` mode, DBSCAN only). .. option:: -analysis-clustering-epsilon= Index: test/tools/llvm-exegesis/X86/analysis-clustering-algorithms.test =================================================================== --- /dev/null +++ test/tools/llvm-exegesis/X86/analysis-clustering-algorithms.test @@ -0,0 +1,231 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-numpoints=1 -analysis-clustering=dbscan | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-DBSCAN-05 %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.49 -analysis-numpoints=1 -analysis-clustering=dbscan | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-DBSCAN-049 %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-NAIVE %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.49 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-NAIVE %s + +# CHECK-CLUSTERS-ALL: {{^}}cluster_id,opcode_name,config,sched_class,inverse_throughput{{$}} + +# By default with -analysis-clustering-epsilon=0.5 everything ends up in the +# same cluster, because each next point is a neighbour of the previous point. + +# CHECK-CLUSTERS-DBSCAN-05-NEXT: {{^}}0, +# CHECK-CLUSTERS-DBSCAN-05-SAME: ,1.00{{$}} +# CHECK-CLUSTERS-DBSCAN-05-NEXT: {{^}}0, +# CHECK-CLUSTERS-DBSCAN-05-SAME: ,1.50{{$}} +# CHECK-CLUSTERS-DBSCAN-05-NEXT: {{^}}0, +# CHECK-CLUSTERS-DBSCAN-05-SAME: ,2.00{{$}} +# CHECK-CLUSTERS-DBSCAN-05-NEXT: {{^}}0, +# CHECK-CLUSTERS-DBSCAN-05-SAME: ,2.50{{$}} + +# With -analysis-clustering-epsilon=0.49 every point goes into separate cluster. + +# CHECK-CLUSTERS-DBSCAN-049-NEXT: {{^}}0, +# CHECK-CLUSTERS-DBSCAN-049-SAME: ,1.00{{$}} +# CHECK-CLUSTERS-DBSCAN-049: {{^}}1, +# CHECK-CLUSTERS-DBSCAN-049-SAME: ,1.50{{$}} +# CHECK-CLUSTERS-DBSCAN-049: {{^}}2, +# CHECK-CLUSTERS-DBSCAN-049-SAME: ,2.00{{$}} +# CHECK-CLUSTERS-DBSCAN-049: {{^}}3, +# CHECK-CLUSTERS-DBSCAN-049-SAME: ,2.50{{$}} + +# And -analysis-clustering=naive every opcode goes into separate cluster. + +# CHECK-CLUSTERS-NAIVE-049-NEXT: {{^}}0, +# CHECK-CLUSTERS-NAIVE-049-SAME: ,1.50{{$}} +# CHECK-CLUSTERS-NAIVE-049: {{^}}1, +# CHECK-CLUSTERS-NAIVE-049-SAME: ,2.00{{$}} +# CHECK-CLUSTERS-NAIVE-049: {{^}}2, +# CHECK-CLUSTERS-NAIVE-049-SAME: ,2.50{{$}} +# CHECK-CLUSTERS-NAIVE-049: {{^}}3, +# CHECK-CLUSTERS-NAIVE-049-SAME: ,1.00{{$}} + +# The "value" is manually specified, not measured. + +--- +mode: inverse_throughput +key: + instructions: + - 'ROL8ri AH AH i_0x1' + - 'ROL8ri AL AL i_0x1' + - 'ROL8ri BH BH i_0x1' + - 'ROL8ri BL BL i_0x1' + - 'ROL8ri BPL BPL i_0x1' + - 'ROL8ri CH CH i_0x1' + - 'ROL8ri CL CL i_0x1' + - 'ROL8ri DH DH i_0x1' + - 'ROL8ri DIL DIL i_0x1' + - 'ROL8ri DL DL i_0x1' + - 'ROL8ri SIL SIL i_0x1' + - 'ROL8ri R8B R8B i_0x1' + - 'ROL8ri R9B R9B i_0x1' + - 'ROL8ri R10B R10B i_0x1' + - 'ROL8ri R11B R11B i_0x1' + - 'ROL8ri R12B R12B i_0x1' + - 'ROL8ri R13B R13B i_0x1' + - 'ROL8ri R14B R14B i_0x1' + - 'ROL8ri R15B R15B i_0x1' + config: '' + register_initial_values: + - 'AH=0x0' + - 'AL=0x0' + - 'BH=0x0' + - 'BL=0x0' + - 'BPL=0x0' + - 'CH=0x0' + - 'CL=0x0' + - 'DH=0x0' + - 'DIL=0x0' + - 'DL=0x0' + - 'SIL=0x0' + - 'R8B=0x0' + - 'R9B=0x0' + - 'R10B=0x0' + - 'R11B=0x0' + - 'R12B=0x0' + - 'R13B=0x0' + - 'R14B=0x0' + - 'R15B=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 1000000 +measurements: + - { key: inverse_throughput, value: 1.0000, per_snippet_value: 30.4026 } +error: '' +info: instruction has tied variables, using static renaming. +assembled_snippet: 55415741564155415453B400B000B700B30040B500B500B100B60040B700B20040B60041B00041B10041B20041B30041B40041B50041B60041B700C0C401C0C001C0C701C0C30140C0C501C0C501C0C101C0C60140C0C701C0C20140C0C60141C0C00141C0C10141C0C20141C0C30141C0C40141C0C50141C0C60141C0C7015B415C415D415E415F5DC3 +... +--- +mode: inverse_throughput +key: + instructions: + - 'ROL16ri AX AX i_0x1' + - 'ROL16ri BP BP i_0x1' + - 'ROL16ri BX BX i_0x1' + - 'ROL16ri CX CX i_0x1' + - 'ROL16ri DI DI i_0x1' + - 'ROL16ri DX DX i_0x1' + - 'ROL16ri SI SI i_0x1' + - 'ROL16ri R8W R8W i_0x1' + - 'ROL16ri R9W R9W i_0x1' + - 'ROL16ri R10W R10W i_0x1' + - 'ROL16ri R11W R11W i_0x1' + - 'ROL16ri R12W R12W i_0x1' + - 'ROL16ri R13W R13W i_0x1' + - 'ROL16ri R14W R14W i_0x1' + - 'ROL16ri R15W R15W i_0x1' + config: '' + register_initial_values: + - 'AX=0x0' + - 'BP=0x0' + - 'BX=0x0' + - 'CX=0x0' + - 'DI=0x0' + - 'DX=0x0' + - 'SI=0x0' + - 'R8W=0x0' + - 'R9W=0x0' + - 'R10W=0x0' + - 'R11W=0x0' + - 'R12W=0x0' + - 'R13W=0x0' + - 'R14W=0x0' + - 'R15W=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 1000000 +measurements: + - { key: inverse_throughput, value: 1.5000, per_snippet_value: 30.154 } +error: '' +info: instruction has tied variables, using static renaming. +assembled_snippet: 5541574156415541545366B8000066BD000066BB000066B9000066BF000066BA000066BE00006641B800006641B900006641BA00006641BB00006641BC00006641BD00006641BE00006641BF000066C1C00166C1C50166C1C30166C1C10166C1C70166C1C20166C1C6016641C1C0016641C1C1016641C1C2016641C1C3016641C1C4016641C1C5016641C1C6016641C1C70166C1C0015B415C415D415E415F5DC3 +... +--- +mode: inverse_throughput +key: + instructions: + - 'ROL32ri EAX EAX i_0x1' + - 'ROL32ri EBP EBP i_0x1' + - 'ROL32ri EBX EBX i_0x1' + - 'ROL32ri ECX ECX i_0x1' + - 'ROL32ri EDI EDI i_0x1' + - 'ROL32ri EDX EDX i_0x1' + - 'ROL32ri ESI ESI i_0x1' + - 'ROL32ri R8D R8D i_0x1' + - 'ROL32ri R9D R9D i_0x1' + - 'ROL32ri R10D R10D i_0x1' + - 'ROL32ri R11D R11D i_0x1' + - 'ROL32ri R12D R12D i_0x1' + - 'ROL32ri R13D R13D i_0x1' + - 'ROL32ri R14D R14D i_0x1' + - 'ROL32ri R15D R15D i_0x1' + config: '' + register_initial_values: + - 'EAX=0x0' + - 'EBP=0x0' + - 'EBX=0x0' + - 'ECX=0x0' + - 'EDI=0x0' + - 'EDX=0x0' + - 'ESI=0x0' + - 'R8D=0x0' + - 'R9D=0x0' + - 'R10D=0x0' + - 'R11D=0x0' + - 'R12D=0x0' + - 'R13D=0x0' + - 'R14D=0x0' + - 'R15D=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 1000000 +measurements: + - { key: inverse_throughput, value: 2.0000, per_snippet_value: 23.2762 } +error: '' +info: instruction has tied variables, using static renaming. +assembled_snippet: 55415741564155415453B800000000BD00000000BB00000000B900000000BF00000000BA00000000BE0000000041B80000000041B90000000041BA0000000041BB0000000041BC0000000041BD0000000041BE0000000041BF00000000C1C001C1C501C1C301C1C101C1C701C1C201C1C60141C1C00141C1C10141C1C20141C1C30141C1C40141C1C50141C1C60141C1C701C1C0015B415C415D415E415F5DC3 +... +--- +mode: inverse_throughput +key: + instructions: + - 'ROL64ri RAX RAX i_0x1' + - 'ROL64ri RBP RBP i_0x1' + - 'ROL64ri RBX RBX i_0x1' + - 'ROL64ri RCX RCX i_0x1' + - 'ROL64ri RDI RDI i_0x1' + - 'ROL64ri RDX RDX i_0x1' + - 'ROL64ri RSI RSI i_0x1' + - 'ROL64ri R8 R8 i_0x1' + - 'ROL64ri R9 R9 i_0x1' + - 'ROL64ri R10 R10 i_0x1' + - 'ROL64ri R11 R11 i_0x1' + - 'ROL64ri R12 R12 i_0x1' + - 'ROL64ri R13 R13 i_0x1' + - 'ROL64ri R14 R14 i_0x1' + - 'ROL64ri R15 R15 i_0x1' + config: '' + register_initial_values: + - 'RAX=0x0' + - 'RBP=0x0' + - 'RBX=0x0' + - 'RCX=0x0' + - 'RDI=0x0' + - 'RDX=0x0' + - 'RSI=0x0' + - 'R8=0x0' + - 'R9=0x0' + - 'R10=0x0' + - 'R11=0x0' + - 'R12=0x0' + - 'R13=0x0' + - 'R14=0x0' + - 'R15=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 1000000 +measurements: + - { key: inverse_throughput, value: 2.5000, per_snippet_value: 26.2268 } +error: '' +info: instruction has tied variables, using static renaming. +assembled_snippet: 5541574156415541545348B8000000000000000048BD000000000000000048BB000000000000000048B9000000000000000048BF000000000000000048BA000000000000000048BE000000000000000049B8000000000000000049B9000000000000000049BA000000000000000049BB000000000000000049BC000000000000000049BD000000000000000049BE000000000000000049BF000000000000000048C1C00148C1C50148C1C30148C1C10148C1C70148C1C20148C1C60149C1C00149C1C10149C1C20149C1C30149C1C40149C1C50149C1C60149C1C70148C1C0015B415C415D415E415F5DC3 +... Index: test/tools/llvm-exegesis/X86/analysis-naive-cluster-stabilization.test =================================================================== --- /dev/null +++ test/tools/llvm-exegesis/X86/analysis-naive-cluster-stabilization.test @@ -0,0 +1,63 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-05 %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-STABLE-05 %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-display-unstable-clusters -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-UNSTABLE-05 %s + +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.49 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-049 %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-clustering-epsilon=0.49 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-STABLE-049 %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-clustering-epsilon=0.49 -analysis-inconsistency-epsilon=0.5 -analysis-display-unstable-clusters -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-UNSTABLE-049 %s + +# CHECK-CLUSTERS-ALL: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} + +# CHECK-CLUSTERS-NEXT-05: {{^}}0, +# CHECK-CLUSTERS-SAME-05: ,90.00{{$}} +# CHECK-CLUSTERS-05: {{^}}0, +# CHECK-CLUSTERS-SAME-05: ,90.50{{$}} + +# CHECK-INCONSISTENCIES-STABLE-05: ADD32rr +# CHECK-INCONSISTENCIES-STABLE-05: ADD32rr +# CHECK-INCONSISTENCIES-STABLE-05-NOT: ADD32rr + +# CHECK-INCONSISTENCIES-UNSTABLE-05-NOT: ADD32rr + +# CHECK-INCONSISTENCIES-STABLE-049-NOT: ADD32rr + +# CHECK-INCONSISTENCIES-UNSTABLE-049: ADD32rr +# CHECK-INCONSISTENCIES-UNSTABLE-049: ADD32rr +# CHECK-INCONSISTENCIES-UNSTABLE-049-NOT: ADD32rr + +--- +mode: latency +key: + instructions: + - 'ADD32rr EDX EDX EAX' + config: '' + register_initial_values: + - 'EDX=0x0' + - 'EAX=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 90.0000, per_snippet_value: 90.0000 } +error: '' +info: Repeating a single implicitly serial instruction +assembled_snippet: BA00000000B80000000001C201C201C201C201C201C201C201C201C201C201C201C201C201C201C201C2C3 +--- +mode: latency +key: + instructions: + - 'ADD32rr EDX EDX EAX' + config: '' + register_initial_values: + - 'EDX=0x0' + - 'EAX=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 90.5000, per_snippet_value: 90.5000 } +error: '' +info: Repeating a single implicitly serial instruction +assembled_snippet: BA00000000B80000000001C201C201C201C201C201C201C201C201C201C201C201C201C201C201C201C2C3 +--- +... Index: test/tools/llvm-exegesis/X86/analysis-naive-clusterization.test =================================================================== --- /dev/null +++ test/tools/llvm-exegesis/X86/analysis-naive-clusterization.test @@ -0,0 +1,100 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=0.1 -analysis-inconsistency-epsilon=0.1 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-CLUSTERS %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-ALL,CHECK-INCONSISTENCIES-STABLE %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-clustering-epsilon=0.5 -analysis-inconsistency-epsilon=0.5 -analysis-display-unstable-clusters -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-ALL,CHECK-INCONSISTENCIES-UNSTABLE %s + +# We have two ADD32rr measurements, and two measurements for SQRTSSr. +# ADD32rr measurements are neighbours. +# But the measurements of SQRTSSr are not neighbours, +# so therefore that cluster is marked as unstable. + +# By default, we do not show such unstable clusters. +# If told to show, we *only* show such unstable clusters. + +# CHECK-CLUSTERS: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} +# CHECK-CLUSTERS-NEXT: {{^}}0, +# CHECK-CLUSTERS-SAME: ,90.00{{$}} +# CHECK-CLUSTERS-NEXT: {{^}}0, +# CHECK-CLUSTERS-SAME: ,90.11{{$}} +# CHECK-CLUSTERS: {{^}}1, +# CHECK-CLUSTERS-SAME: ,90.11{{$}} +# CHECK-CLUSTERS-NEXT: {{^}}1, +# CHECK-CLUSTERS-SAME: ,100.00{{$}} + +# CHECK-INCONSISTENCIES-STABLE: ADD32rr +# CHECK-INCONSISTENCIES-STABLE: ADD32rr +# CHECK-INCONSISTENCIES-STABLE-NOT: ADD32rr +# CHECK-INCONSISTENCIES-STABLE-NOT: SQRTSSr + +# CHECK-INCONSISTENCIES-UNSTABLE: SQRTSSr +# CHECK-INCONSISTENCIES-UNSTABLE: SQRTSSr +# CHECK-INCONSISTENCIES-UNSTABLE-NOT: SQRTSSr +# CHECK-INCONSISTENCIES-UNSTABLE-NOT: ADD32rr + +--- +mode: latency +key: + instructions: + - 'ADD32rr EDX EDX EAX' + config: '' + register_initial_values: + - 'EDX=0x0' + - 'EAX=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 90.0000, per_snippet_value: 90.0000 } +error: '' +info: Repeating a single implicitly serial instruction +assembled_snippet: BA00000000B80000000001C201C201C201C201C201C201C201C201C201C201C201C201C201C201C201C2C3 +--- +mode: latency +key: + instructions: + - 'ADD32rr EDX EDX EAX' + config: '' + register_initial_values: + - 'EDX=0x0' + - 'EAX=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 90.1100, per_snippet_value: 90.1100 } +error: '' +info: Repeating a single implicitly serial instruction +assembled_snippet: BA00000000B80000000001C201C201C201C201C201C201C201C201C201C201C201C201C201C201C201C2C3 +--- +mode: latency +key: + instructions: + - 'SQRTSSr XMM11 XMM11' + config: '' + register_initial_values: + - 'XMM11=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 90.1111, per_snippet_value: 90.1111 } +error: '' +info: Repeating a single explicitly serial instruction +assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C410F3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBC3 +... +--- +mode: latency +key: + instructions: + - 'SQRTSSr XMM11 XMM11' + config: '' + register_initial_values: + - 'XMM11=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 100, per_snippet_value: 100 } +error: '' +info: Repeating a single explicitly serial instruction +assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C410F3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBC3 +... Index: test/tools/llvm-exegesis/X86/analysis-same-cluster-for-ops-in-different-sched-clusters.test =================================================================== --- /dev/null +++ test/tools/llvm-exegesis/X86/analysis-same-cluster-for-ops-in-different-sched-clusters.test @@ -0,0 +1,54 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=10 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-DBSCAN %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=10 -analysis-numpoints=1 -analysis-clustering=dbscan | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-DBSCAN %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-clustering-epsilon=10 -analysis-numpoints=1 -analysis-clustering=naive | FileCheck -check-prefixes=CHECK-CLUSTERS-ALL,CHECK-CLUSTERS-NAIVE %s + +# Normally BSR32rr is in WriteBSR and BSF32rr is in WriteBSF sched classes. +# Here we check that if we have dbscan-clustered these two measurements into the +# same cluster, we don't split it per the sched classes into two. + +# CHECK-CLUSTERS-ALL: {{^}}cluster_id,opcode_name,config,sched_class,inverse_throughput{{$}} + +# CHECK-CLUSTERS-DBSCAN-NEXT: {{^}}0, +# CHECK-CLUSTERS-DBSCAN-SAME: ,4.03{{$}} +# CHECK-CLUSTERS-DBSCAN-NEXT: {{^}}0, +# CHECK-CLUSTERS-DBSCAN-SAME: ,3.02{{$}} + +# CHECK-CLUSTERS-NAIVE-NEXT: {{^}}0, +# CHECK-CLUSTERS-NAIVE-SAME: ,3.02{{$}} +# CHECK-CLUSTERS-NAIVE: {{^}}1, +# CHECK-CLUSTERS-NAIVE-SAME: ,4.03{{$}} + +--- +mode: inverse_throughput +key: + instructions: + - 'BSR32rr R11D EDI' + config: '' + register_initial_values: + - 'EDI=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 1000000 +measurements: + - { key: inverse_throughput, value: 4.03048, per_snippet_value: 4.03048 } +error: '' +info: instruction has no tied variables picking Uses different from defs +assembled_snippet: BF00000000440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDF440FBDDFC3 +... +--- +mode: inverse_throughput +key: + instructions: + - 'BSF32rr EAX R14D' + config: '' + register_initial_values: + - 'R14D=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 1000000 +measurements: + - { key: inverse_throughput, value: 3.02186, per_snippet_value: 3.02186 } +error: '' +info: instruction has no tied variables picking Uses different from defs +assembled_snippet: 415641BE00000000410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6410FBCC6415EC3 +... Index: tools/llvm-exegesis/lib/Analysis.h =================================================================== --- tools/llvm-exegesis/lib/Analysis.h +++ tools/llvm-exegesis/lib/Analysis.h @@ -65,19 +65,12 @@ }; // Represents the intersection of a sched class and a cluster. - class SchedClassCluster { + class SchedClassCluster : public BenchmarkGroup { public: const InstructionBenchmarkClustering::ClusterId &id() const { return ClusterId; } - const std::vector &getPointIds() const { return PointIds; } - - // Return the cluster centroid. - const std::vector &getRepresentative() const { - return Representative; - } - // Returns true if the cluster representative measurements match that of SC. bool measurementsMatch(const llvm::MCSubtargetInfo &STI, @@ -86,13 +79,10 @@ const double AnalysisInconsistencyEpsilonSquared_) const; void addPoint(size_t PointId, - const InstructionBenchmarkClustering &Clustering); + const InstructionBenchmarkClustering &Clustering) override; private: InstructionBenchmarkClustering::ClusterId ClusterId; - std::vector PointIds; - // Measurement stats for the points in the SchedClassCluster. - std::vector Representative; }; void printInstructionRowCsv(size_t PointId, llvm::raw_ostream &OS) const; Index: tools/llvm-exegesis/lib/Analysis.cpp =================================================================== --- tools/llvm-exegesis/lib/Analysis.cpp +++ tools/llvm-exegesis/lib/Analysis.cpp @@ -435,16 +435,11 @@ void Analysis::SchedClassCluster::addPoint( size_t PointId, const InstructionBenchmarkClustering &Clustering) { - PointIds.push_back(PointId); - const auto &Point = Clustering.getPoints()[PointId]; - if (ClusterId.isUndef()) { + if (ClusterId.isUndef()) ClusterId = Clustering.getClusterIdForPoint(PointId); - Representative.resize(Point.Measurements.size()); - } - for (size_t I = 0, E = Point.Measurements.size(); I < E; ++I) { - Representative[I].push(Point.Measurements[I]); - } assert(ClusterId == Clustering.getClusterIdForPoint(PointId)); + + BenchmarkGroup::addPoint(PointId, Clustering); } // Returns a ProxResIdx by id or name. Index: tools/llvm-exegesis/lib/Clustering.h =================================================================== --- tools/llvm-exegesis/lib/Clustering.h +++ tools/llvm-exegesis/lib/Clustering.h @@ -25,20 +25,24 @@ class InstructionBenchmarkClustering { public: + enum ModeE { Dbscan, Naive }; + // Clusters `Points` using DBSCAN with the given parameters. See the cc file // for more explanations on the algorithm. static llvm::Expected - create(const std::vector &Points, size_t MinPts, - double AnalysisClusteringEpsilon, + create(const std::vector &Points, ModeE Mode, + size_t DbscanMinPts, double AnalysisClusteringEpsilon, llvm::Optional NumOpcodes = llvm::None); class ClusterId { public: static ClusterId noise() { return ClusterId(kNoise); } static ClusterId error() { return ClusterId(kError); } - static ClusterId makeValid(size_t Id) { return ClusterId(Id); } + static ClusterId makeValid(size_t Id, bool IsUnstable = false) { + return ClusterId(Id, IsUnstable); + } static ClusterId makeValidUnstable(size_t Id) { - return ClusterId(Id, /*IsUnstable=*/true); + return makeValid(Id, /*IsUnstable=*/true); } ClusterId() : Id_(kUndef), IsUnstable_(false) {} @@ -120,12 +124,20 @@ double AnalysisClusteringEpsilonSquared); llvm::Error validateAndSetup(); - void dbScan(size_t MinPts); + + void clusterizeDbScan(size_t MinPts); + void clusterizeNaive(unsigned NumOpcodes); + + // Stabilization is only needed if dbscan was used to clusterize. void stabilize(unsigned NumOpcodes); + void rangeQuery(size_t Q, std::vector &Scratchpad) const; + bool areAllNeighbours(ArrayRef Pts) const; + const std::vector &Points_; const double AnalysisClusteringEpsilonSquared_; + int NumDimensions_ = 0; // ClusterForPoint_[P] is the cluster id for Points[P]. std::vector ClusterIdForPoint_; @@ -134,6 +146,26 @@ Cluster ErrorCluster_; }; +class BenchmarkGroup { +public: + const std::vector &getPointIds() const { return PointIds; } + + // Return the cluster centroid. + const std::vector &getRepresentative() const { + return Representative; + } + + std::vector getGroupCenterPoint() const; + + virtual void addPoint(size_t PointId, + const InstructionBenchmarkClustering &Clustering); + +protected: + std::vector PointIds; + // Measurement stats for the points in the SchedClassCluster. + std::vector Representative; +}; + } // namespace exegesis } // namespace llvm Index: tools/llvm-exegesis/lib/Clustering.cpp =================================================================== --- tools/llvm-exegesis/lib/Clustering.cpp +++ tools/llvm-exegesis/lib/Clustering.cpp @@ -53,6 +53,31 @@ } } +// Given a set of points, checks that all the points are neighbours +// up to AnalysisClusteringEpsilon. This is O(2*N). +bool InstructionBenchmarkClustering::areAllNeighbours( + ArrayRef Pts) const { + // First, get the centroid of this group of points. This is O(N). + BenchmarkGroup G; + llvm::for_each(Pts, [this, &G](size_t P) { G.addPoint(P, *this); }); + const std::vector &Centroid = G.getGroupCenterPoint(); + + // Since we will be comparing with the centroid, we need to halve the epsilon. + double AnalysisClusteringEpsilonHalvedSquared = + AnalysisClusteringEpsilonSquared_ / 4.0; + + // And now check that every point is a neighbour of the centroid. Also O(N). + return llvm::all_of( + Pts, [this, &Centroid, AnalysisClusteringEpsilonHalvedSquared](size_t P) { + assert(P < Points_.size()); + const auto &PMeasurements = Points_[P].Measurements; + if (PMeasurements.empty()) // Error point. + return true; // Pretend that error point is a neighbour. + return isNeighbour(PMeasurements, Centroid, + AnalysisClusteringEpsilonHalvedSquared); + }); +} + InstructionBenchmarkClustering::InstructionBenchmarkClustering( const std::vector &Points, const double AnalysisClusteringEpsilonSquared) @@ -95,7 +120,7 @@ return llvm::Error::success(); } -void InstructionBenchmarkClustering::dbScan(const size_t MinPts) { +void InstructionBenchmarkClustering::clusterizeDbScan(const size_t MinPts) { std::vector Neighbors; // Persistent buffer to avoid allocs. for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { if (!ClusterIdForPoint_[P].isUndef()) @@ -152,6 +177,48 @@ } } +void InstructionBenchmarkClustering::clusterizeNaive(unsigned NumOpcodes) { + // Given an instruction Opcode, which are the benchmarks of this instruction? + std::vector> OpcodeToPoints; + OpcodeToPoints.resize(NumOpcodes); + size_t NumOpcodesSeen = 0; + for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { + const InstructionBenchmark &Point = Points_[P]; + const unsigned Opcode = Point.keyInstruction().getOpcode(); + assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)"); + llvm::SmallVectorImpl &PointsOfOpcode = OpcodeToPoints[Opcode]; + if (PointsOfOpcode.empty()) // If we previously have not seen any points of + ++NumOpcodesSeen; // this opcode, then naturally this is the new opcode. + PointsOfOpcode.emplace_back(P); + } + assert(OpcodeToPoints.size() == NumOpcodes && "sanity check"); + assert(NumOpcodesSeen <= NumOpcodes && + "can't see more opcodes than there are total opcodes"); + assert(NumOpcodesSeen <= Points_.size() && + "can't see more opcodes than there are total points"); + + Clusters_.reserve(NumOpcodesSeen); // One cluster per opcode. + for (ArrayRef PointsOfOpcode : llvm::make_filter_range( + OpcodeToPoints, [](ArrayRef PointsOfOpcode) { + return !PointsOfOpcode.empty(); // Ignore opcodes with no points. + })) { + // Create a new cluster. + Clusters_.emplace_back(ClusterId::makeValid( + Clusters_.size(), /*IsUnstable=*/!areAllNeighbours(PointsOfOpcode))); + Cluster &CurrentCluster = Clusters_.back(); + // Mark points as belonging to the new cluster. + llvm::for_each(PointsOfOpcode, [this, &CurrentCluster](size_t P) { + ClusterIdForPoint_[P] = CurrentCluster.Id; + }); + // And add all the points of this opcode to the new cluster. + CurrentCluster.PointIndices.reserve(PointsOfOpcode.size()); + CurrentCluster.PointIndices.assign(PointsOfOpcode.begin(), + PointsOfOpcode.end()); + assert(CurrentCluster.PointIndices.size() == PointsOfOpcode.size()); + } + assert(Clusters_.size() == NumOpcodesSeen); +} + // Given an instruction Opcode, we can make benchmarks (measurements) of the // instruction characteristics/performance. Then, to facilitate further analysis // we group the benchmarks with *similar* characteristics into clusters. @@ -246,8 +313,8 @@ llvm::Expected InstructionBenchmarkClustering::create( - const std::vector &Points, const size_t MinPts, - const double AnalysisClusteringEpsilon, + const std::vector &Points, const ModeE Mode, + const size_t DbscanMinPts, const double AnalysisClusteringEpsilon, llvm::Optional NumOpcodes) { InstructionBenchmarkClustering Clustering( Points, AnalysisClusteringEpsilon * AnalysisClusteringEpsilon); @@ -258,13 +325,39 @@ return Clustering; // Nothing to cluster. } - Clustering.dbScan(MinPts); + if (Mode == ModeE::Dbscan) { + Clustering.clusterizeDbScan(DbscanMinPts); - if (NumOpcodes.hasValue()) - Clustering.stabilize(NumOpcodes.getValue()); + if (NumOpcodes.hasValue()) + Clustering.stabilize(NumOpcodes.getValue()); + } else /*if(Mode == ModeE::Naive)*/ { + if (!NumOpcodes.hasValue()) + llvm::report_fatal_error( + "'naive' clustering mode requires opcode count to be specified"); + Clustering.clusterizeNaive(NumOpcodes.getValue()); + } return Clustering; } +void BenchmarkGroup::addPoint( + size_t PointId, const InstructionBenchmarkClustering &Clustering) { + const auto &Point = Clustering.getPoints()[PointId]; + if (Point.Measurements.empty()) // Error point. + return; + PointIds.push_back(PointId); + Representative.resize(Point.Measurements.size()); + for (size_t I = 0, E = Point.Measurements.size(); I < E; ++I) { + Representative[I].push(Point.Measurements[I]); + } +} + +std::vector BenchmarkGroup::getGroupCenterPoint() const { + std::vector ClusterCenterPoint(Representative.size()); + for (const auto &I : llvm::zip(Representative, ClusterCenterPoint)) + std::get<1>(I).PerInstructionValue = std::get<0>(I).avg(); + return ClusterCenterPoint; +} + } // namespace exegesis } // namespace llvm Index: tools/llvm-exegesis/llvm-exegesis.cpp =================================================================== --- tools/llvm-exegesis/llvm-exegesis.cpp +++ tools/llvm-exegesis/llvm-exegesis.cpp @@ -66,7 +66,7 @@ cl::cat(Options), cl::init("")); static cl::opt BenchmarkMode( - "mode", cl::desc("the mode to run"), cl::cat(BenchmarkOptions), + "mode", cl::desc("the mode to run"), cl::cat(Options), cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency, "latency", "Instruction Latency"), clEnumValN(exegesis::InstructionBenchmark::InverseThroughput, @@ -89,14 +89,24 @@ cl::desc("ignore instructions that do not define a sched class"), cl::cat(BenchmarkOptions), cl::init(false)); -static cl::opt AnalysisNumPoints( +static cl::opt + AnalysisClusteringAlgorithm( + "analysis-clustering", cl::desc("the clustering algorithm to use"), + cl::cat(AnalysisOptions), + cl::values(clEnumValN(exegesis::InstructionBenchmarkClustering::Dbscan, + "dbscan", "use DBSCAN/OPTICS algorithm"), + clEnumValN(exegesis::InstructionBenchmarkClustering::Naive, + "naive", "one cluster per opcode")), + cl::init(exegesis::InstructionBenchmarkClustering::Dbscan)); + +static cl::opt AnalysisDbscanNumPoints( "analysis-numpoints", - cl::desc("minimum number of points in an analysis cluster"), + cl::desc("minimum number of points in an analysis cluster (dbscan only)"), cl::cat(AnalysisOptions), cl::init(3)); static cl::opt AnalysisClusteringEpsilon( "analysis-clustering-epsilon", - cl::desc("dbscan epsilon for benchmark point clustering"), + cl::desc("epsilon for benchmark point clustering"), cl::cat(AnalysisOptions), cl::init(0.1)); static cl::opt AnalysisInconsistencyEpsilon( @@ -460,8 +470,8 @@ std::unique_ptr InstrInfo(TheTarget->createMCInstrInfo()); const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create( - Points, AnalysisNumPoints, AnalysisClusteringEpsilon, - InstrInfo->getNumOpcodes())); + Points, AnalysisClusteringAlgorithm, AnalysisDbscanNumPoints, + AnalysisClusteringEpsilon, InstrInfo->getNumOpcodes())); const Analysis Analyzer(*TheTarget, std::move(InstrInfo), Clustering, AnalysisInconsistencyEpsilon, Index: unittests/tools/llvm-exegesis/ClusteringTest.cpp =================================================================== --- unittests/tools/llvm-exegesis/ClusteringTest.cpp +++ unittests/tools/llvm-exegesis/ClusteringTest.cpp @@ -46,7 +46,8 @@ // Error cluster: points {2} Points[2].Error = "oops"; - auto Clustering = InstructionBenchmarkClustering::create(Points, 2, 0.25); + auto Clustering = InstructionBenchmarkClustering::create( + Points, InstructionBenchmarkClustering::ModeE::Dbscan, 2, 0.25); ASSERT_TRUE((bool)Clustering); EXPECT_THAT(Clustering.get().getValidClusters(), UnorderedElementsAre(HasPoints({0, 3}), HasPoints({1, 4}))); @@ -73,7 +74,9 @@ {"x", 0.01, 0.0}, {"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; Points[1].Measurements = {{"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; auto Error = - InstructionBenchmarkClustering::create(Points, 2, 0.25).takeError(); + InstructionBenchmarkClustering::create( + Points, InstructionBenchmarkClustering::ModeE::Dbscan, 2, 0.25) + .takeError(); ASSERT_TRUE((bool)Error); consumeError(std::move(Error)); } @@ -83,7 +86,9 @@ Points[0].Measurements = {{"x", 0.01, 0.0}, {"y", 1.02, 0.0}}; Points[1].Measurements = {{"y", 1.02, 0.0}, {"x", 1.98, 0.0}}; auto Error = - InstructionBenchmarkClustering::create(Points, 2, 0.25).takeError(); + InstructionBenchmarkClustering::create( + Points, InstructionBenchmarkClustering::ModeE::Dbscan, 2, 0.25) + .takeError(); ASSERT_TRUE((bool)Error); consumeError(std::move(Error)); } @@ -112,7 +117,8 @@ Points[2].Measurements = { {"x", 2.0, 0.0}}; - auto Clustering = InstructionBenchmarkClustering::create(Points, 2, 1.1); + auto Clustering = InstructionBenchmarkClustering::create( + Points, InstructionBenchmarkClustering::ModeE::Dbscan, 2, 1.1); ASSERT_TRUE((bool)Clustering); EXPECT_THAT(Clustering.get().getValidClusters(), UnorderedElementsAre(HasPoints({0, 1, 2}))); @@ -128,7 +134,8 @@ Points[2].Measurements = { {"x", 1.0, 0.0}}; - auto Clustering = InstructionBenchmarkClustering::create(Points, 2, 1.1); + auto Clustering = InstructionBenchmarkClustering::create( + Points, InstructionBenchmarkClustering::ModeE::Dbscan, 2, 1.1); ASSERT_TRUE((bool)Clustering); EXPECT_THAT(Clustering.get().getValidClusters(), UnorderedElementsAre(HasPoints({0, 1, 2})));