Index: docs/CommandGuide/llvm-exegesis.rst =================================================================== --- docs/CommandGuide/llvm-exegesis.rst +++ docs/CommandGuide/llvm-exegesis.rst @@ -224,6 +224,13 @@ Specify the numPoints parameters to be used for DBSCAN clustering (`analysis` mode). +.. option:: -analysis-display-unstable-clusters + + If there is more than one benchmark for an opcode, said benchmarks may end up + not being clustered into the same cluster if the measured performance + characteristics are different. by default all such opcodes are filtered out. + This flag will instead show only such unstable opcodes. + .. option:: -ignore-invalid-sched-class=false If set, ignore instructions that do not have a sched class (class idx = 0). Index: test/tools/llvm-exegesis/X86/analysis-cluster-stabilization.test =================================================================== --- /dev/null +++ test/tools/llvm-exegesis/X86/analysis-cluster-stabilization.test @@ -0,0 +1,82 @@ +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-clusters-output-file=- -analysis-epsilon=0.1 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK-CLUSTERS %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-epsilon=0.5 -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-ALL,CHECK-INCONSISTENCIES-STABLE %s +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=- -analysis-epsilon=0.5 -analysis-display-unstable-clusters -analysis-numpoints=1 | FileCheck -check-prefixes=CHECK-INCONSISTENCIES-ALL,CHECK-INCONSISTENCIES-UNSTABLE %s + +# We have one ADD32rr measurement, and two measurements for SQRTSSr. +# The ADD32rr measurement and one of the SQRTSSr measurements are identical, +# and thus will be be in the same cluster. But the second SQRTSSr measurement +# is different from the first SQRTSSr measurement, and thus it will be in it's +# own cluster. We do reclusterization, and thus since there is more than one +# measurement from SQRTSSr, and they are not in the same cluster, we move +# all two SQRTSSr measurements into their own cluster, and mark it as unstable. +# By default, we do not show such unstable clusters. +# If told to show, we *only* show such unstable clusters. + +# CHECK-CLUSTERS: {{^}}cluster_id,opcode_name,config,sched_class,latency{{$}} +# CHECK-CLUSTERS-NEXT: {{^}}0, +# CHECK-CLUSTERS-SAME: ,90.00{{$}} +# CHECK-CLUSTERS: {{^}}3, +# CHECK-CLUSTERS-SAME: ,90.11{{$}} +# CHECK-CLUSTERS-NEXT: {{^}}3, +# CHECK-CLUSTERS-SAME: ,100.00{{$}} + +# CHECK-INCONSISTENCIES-STABLE: ADD32rr +# CHECK-INCONSISTENCIES-STABLE-NOT: ADD32rr +# CHECK-INCONSISTENCIES-STABLE-NOT: SQRTSSr + +# CHECK-INCONSISTENCIES-UNSTABLE: SQRTSSr +# CHECK-INCONSISTENCIES-UNSTABLE: SQRTSSr +# CHECK-INCONSISTENCIES-UNSTABLE-NOT: SQRTSSr +# CHECK-INCONSISTENCIES-UNSTABLE-NOT: ADD32rr + +--- +mode: latency +key: + instructions: + - 'ADD32rr EDX EDX EAX' + config: '' + register_initial_values: + - 'EDX=0x0' + - 'EAX=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 90.0000, per_snippet_value: 90.0000 } +error: '' +info: Repeating a single implicitly serial instruction +assembled_snippet: BA00000000B80000000001C201C201C201C201C201C201C201C201C201C201C201C201C201C201C201C2C3 +--- +mode: latency +key: + instructions: + - 'SQRTSSr XMM11 XMM11' + config: '' + register_initial_values: + - 'XMM11=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 90.1111, per_snippet_value: 90.1111 } +error: '' +info: Repeating a single explicitly serial instruction +assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C410F3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBC3 +... +--- +mode: latency +key: + instructions: + - 'SQRTSSr XMM11 XMM11' + config: '' + register_initial_values: + - 'XMM11=0x0' +cpu_name: bdver2 +llvm_triple: x86_64-unknown-linux-gnu +num_repetitions: 10000 +measurements: + - { key: latency, value: 100, per_snippet_value: 100 } +error: '' +info: Repeating a single explicitly serial instruction +assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C410F3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBF3450F51DBC3 +... Index: tools/llvm-exegesis/lib/Analysis.h =================================================================== --- tools/llvm-exegesis/lib/Analysis.h +++ tools/llvm-exegesis/lib/Analysis.h @@ -36,7 +36,9 @@ class Analysis { public: Analysis(const llvm::Target &Target, - const InstructionBenchmarkClustering &Clustering); + std::unique_ptr InstrInfo, + const InstructionBenchmarkClustering &Clustering, + bool AnalysisDisplayUnstableOpcodes); // Prints a csv of instructions for each cluster. struct PrintClusters {}; @@ -125,6 +127,7 @@ std::unique_ptr AsmInfo_; std::unique_ptr InstPrinter_; std::unique_ptr Disasm_; + const bool AnalysisDisplayUnstableOpcodes_; }; // Computes the idealized ProcRes Unit pressure. This is the expected Index: tools/llvm-exegesis/lib/Analysis.cpp =================================================================== --- tools/llvm-exegesis/lib/Analysis.cpp +++ tools/llvm-exegesis/lib/Analysis.cpp @@ -149,7 +149,7 @@ writeEscaped(OS, Point.Key.Config); OS << kCsvSep; assert(!Point.Key.Instructions.empty()); - const llvm::MCInst &MCI = Point.Key.Instructions[0]; + const llvm::MCInst &MCI = Point.keyInstruction(); const unsigned SchedClassId = resolveSchedClassId( *SubtargetInfo_, InstrInfo_->get(MCI.getOpcode()).getSchedClass(), MCI); @@ -168,13 +168,15 @@ } Analysis::Analysis(const llvm::Target &Target, - const InstructionBenchmarkClustering &Clustering) - : Clustering_(Clustering) { + std::unique_ptr InstrInfo, + const InstructionBenchmarkClustering &Clustering, + bool AnalysisDisplayUnstableOpcodes) + : Clustering_(Clustering), InstrInfo_(std::move(InstrInfo)), + AnalysisDisplayUnstableOpcodes_(AnalysisDisplayUnstableOpcodes) { if (Clustering.getPoints().empty()) return; const InstructionBenchmark &FirstPoint = Clustering.getPoints().front(); - InstrInfo_.reset(Target.createMCInstrInfo()); RegInfo_.reset(Target.createMCRegInfo(FirstPoint.LLVMTriple)); AsmInfo_.reset(Target.createMCAsmInfo(*RegInfo_, FirstPoint.LLVMTriple)); SubtargetInfo_.reset(Target.createMCSubtargetInfo(FirstPoint.LLVMTriple, @@ -233,7 +235,7 @@ assert(!Point.Key.Instructions.empty()); // FIXME: we should be using the tuple of classes for instructions in the // snippet as key. - const llvm::MCInst &MCI = Point.Key.Instructions[0]; + const llvm::MCInst &MCI = Point.keyInstruction(); unsigned SchedClassId = InstrInfo_->get(MCI.getOpcode()).getSchedClass(); const bool WasVariant = SchedClassId && SubtargetInfo_->getSchedModel() .getSchedClassDesc(SchedClassId) @@ -668,6 +670,8 @@ const auto &ClusterId = Clustering_.getClusterIdForPoint(PointId); if (!ClusterId.isValid()) continue; // Ignore noise and errors. FIXME: take noise into account ? + if (ClusterId.isUnstable() ^ AnalysisDisplayUnstableOpcodes_) + continue; // Either display stable or unstable clusters only. auto SchedClassClusterIt = std::find_if(SchedClassClusters.begin(), SchedClassClusters.end(), [ClusterId](const SchedClassCluster &C) { Index: tools/llvm-exegesis/lib/BenchmarkResult.h =================================================================== --- tools/llvm-exegesis/lib/BenchmarkResult.h +++ tools/llvm-exegesis/lib/BenchmarkResult.h @@ -61,6 +61,8 @@ ModeE Mode; std::string CpuName; std::string LLVMTriple; + // Which instruction is being benchmarked here? + const llvm::MCInst &keyInstruction() const { return Key.Instructions[0]; } // The number of instructions inside the repeated snippet. For example, if a // snippet of 3 instructions is repeated 4 times, this is 12. int NumRepetitions = 0; Index: tools/llvm-exegesis/lib/Clustering.h =================================================================== --- tools/llvm-exegesis/lib/Clustering.h +++ tools/llvm-exegesis/lib/Clustering.h @@ -15,7 +15,9 @@ #define LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H #include "BenchmarkResult.h" +#include "llvm/ADT/Optional.h" #include "llvm/Support/Error.h" +#include #include namespace llvm { @@ -27,21 +29,28 @@ // for more explanations on the algorithm. static llvm::Expected create(const std::vector &Points, size_t MinPts, - double Epsilon); + double Epsilon, llvm::Optional NumOpcodes = llvm::None); class ClusterId { public: static ClusterId noise() { return ClusterId(kNoise); } static ClusterId error() { return ClusterId(kError); } static ClusterId makeValid(size_t Id) { return ClusterId(Id); } - ClusterId() : Id_(kUndef) {} + static ClusterId makeValidUnstable(size_t Id) { + return ClusterId(Id, /*IsUnstable=*/true); + } + + ClusterId() : Id_(kUndef), IsUnstable_(false) {} + + // Compare id's, ignoring the 'unstability' bit. bool operator==(const ClusterId &O) const { return Id_ == O.Id_; } bool operator<(const ClusterId &O) const { return Id_ < O.Id_; } bool isValid() const { return Id_ <= kMaxValid; } - bool isUndef() const { return Id_ == kUndef; } + bool isUnstable() const { return IsUnstable_; } bool isNoise() const { return Id_ == kNoise; } bool isError() const { return Id_ == kError; } + bool isUndef() const { return Id_ == kUndef; } // Precondition: isValid(). size_t getId() const { @@ -50,14 +59,19 @@ } private: - explicit ClusterId(size_t Id) : Id_(Id) {} + ClusterId(size_t Id, bool IsUnstable = false) + : Id_(Id), IsUnstable_(IsUnstable) {} + static constexpr const size_t kMaxValid = - std::numeric_limits::max() - 4; + (std::numeric_limits::max() >> 1) - 4; static constexpr const size_t kNoise = kMaxValid + 1; static constexpr const size_t kError = kMaxValid + 2; static constexpr const size_t kUndef = kMaxValid + 3; - size_t Id_; + + size_t Id_ : (std::numeric_limits::digits - 1); + size_t IsUnstable_ : 1; }; + static_assert(sizeof(ClusterId) == sizeof(size_t), "should be a bit field."); struct Cluster { Cluster() = delete; @@ -101,8 +115,10 @@ private: InstructionBenchmarkClustering( const std::vector &Points, double EpsilonSquared); + llvm::Error validateAndSetup(); void dbScan(size_t MinPts); + void stabilize(unsigned NumOpcodes); void rangeQuery(size_t Q, std::vector &Scratchpad) const; const std::vector &Points_; Index: tools/llvm-exegesis/lib/Clustering.cpp =================================================================== --- tools/llvm-exegesis/lib/Clustering.cpp +++ tools/llvm-exegesis/lib/Clustering.cpp @@ -8,8 +8,11 @@ #include "Clustering.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include #include +#include namespace llvm { namespace exegesis { @@ -147,10 +150,102 @@ } } +// Given an instruction Opcode, we can make benchmarks (measurements) of the +// instruction characteristics/performance. Then, to facilitate further analysis +// we group the benchmarks with *similar* characteristics into clusters. +// Now, this is all not entirely deterministic. Some instructions have variable +// characteristics, depending on their arguments. And thus, if we do several +// benchmarks of the same instruction Opcode, we may end up with *different* +// performance characteristics measurements. And when we then do clustering, +// these several benchmarks of the same instruction Opcode may end up being +// clustered into *different* clusters. This is not great for further analysis. +// We shall find every opcode with benchmarks not in just one cluster, and move +// *all* the benchmarks of said Opcode into one new unstable cluster per Opcode. +void InstructionBenchmarkClustering::stabilize(unsigned NumOpcodes) { + // Given an instruction Opcode, in which clusters do benchmarks of this + // instruction lie? Normally, they all should be in the same cluster. + std::vector> OpcodeToClusterIDs; + OpcodeToClusterIDs.resize(NumOpcodes); + // The list of opcodes that have more than one cluster. + llvm::SetVector UnstableOpcodes; + // Populate OpcodeToClusterIDs and UnstableOpcodes data structures. + assert(ClusterIdForPoint_.size() == Points_.size() && "size mismatch"); + for (const auto &Point : zip(Points_, ClusterIdForPoint_)) { + const ClusterId &ClusterIdOfPoint = std::get<1>(Point); + if (!ClusterIdOfPoint.isValid()) + continue; // Only process fully valid clusters. + const unsigned Opcode = std::get<0>(Point).keyInstruction().getOpcode(); + assert(Opcode < NumOpcodes && "NumOpcodes is incorrect (too small)"); + llvm::SmallSet &ClusterIDsOfOpcode = + OpcodeToClusterIDs[Opcode]; + ClusterIDsOfOpcode.insert(ClusterIdOfPoint); + // Is there more than one ClusterID for this opcode?. + if (ClusterIDsOfOpcode.size() < 2) + continue; // If not, then at this moment this Opcode is stable. + // Else let's record this unstable opcode for future use. + UnstableOpcodes.insert(Opcode); + } + assert(OpcodeToClusterIDs.size() == NumOpcodes && "sanity check"); + + // We know with how many [new] clusters we will end up with. + const auto NewTotalClusterCount = Clusters_.size() + UnstableOpcodes.size(); + Clusters_.reserve(NewTotalClusterCount); + for (const size_t UnstableOpcode : UnstableOpcodes.getArrayRef()) { + const llvm::SmallSet &ClusterIDs = + OpcodeToClusterIDs[UnstableOpcode]; + assert(ClusterIDs.size() > 1 && + "Should only have Opcodes with more than one cluster."); + + // Create a new unstable cluster, one per Opcode. + Clusters_.emplace_back(ClusterId::makeValidUnstable(Clusters_.size())); + Cluster &UnstableCluster = Clusters_.back(); + // We will find *at least* one point in each of these clusters. + UnstableCluster.PointIndices.reserve(ClusterIDs.size()); + + // Go through every cluster which we recorded as containing benchmarks + // of this UnstableOpcode. NOTE: we only recorded valid clusters. + for (const ClusterId &CID : ClusterIDs) { + assert(CID.isValid() && + "We only recorded valid clusters, not noise/error clusters."); + Cluster &OldCluster = Clusters_[CID.getId()]; // Valid clusters storage. + // Within each cluster, go through each point, and either move it to the + // new unstable cluster, or 'keep' it. + // In this case, we'll reshuffle OldCluster.PointIndices vector + // so that all the points that are *not* for UnstableOpcode are first, + // and the rest of the points is for the UnstableOpcode. + const auto it = std::stable_partition( + OldCluster.PointIndices.begin(), OldCluster.PointIndices.end(), + [this, UnstableOpcode](size_t P) { + return Points_[P].keyInstruction().getOpcode() != UnstableOpcode; + }); + assert(std::distance(it, OldCluster.PointIndices.end()) > 0 && + "Should have found at least one bad point"); + // Mark to-be-moved points as belonging to the new cluster. + std::for_each(it, OldCluster.PointIndices.end(), + [this, &UnstableCluster](size_t P) { + ClusterIdForPoint_[P] = UnstableCluster.Id; + }); + // Actually append to-be-moved points to the new cluster. + UnstableCluster.PointIndices.insert(UnstableCluster.PointIndices.cend(), + it, OldCluster.PointIndices.end()); + // And finally, remove "to-be-moved" points form the old cluster. + OldCluster.PointIndices.erase(it, OldCluster.PointIndices.cend()); + // Now, the old cluster may end up being empty, but let's just keep it + // in whatever state it ended up. Purging empty clusters isn't worth it. + }; + assert(UnstableCluster.PointIndices.size() > 1 && + "New unstable cluster should end up with more than one point."); + assert(UnstableCluster.PointIndices.size() >= ClusterIDs.size() && + "New unstable cluster should end up with no less points than there " + "was clusters"); + } + assert(Clusters_.size() == NewTotalClusterCount && "sanity check"); +} + llvm::Expected InstructionBenchmarkClustering::create( const std::vector &Points, const size_t MinPts, - const double Epsilon) { + const double Epsilon, llvm::Optional NumOpcodes) { InstructionBenchmarkClustering Clustering(Points, Epsilon * Epsilon); if (auto Error = Clustering.validateAndSetup()) { return std::move(Error); @@ -160,6 +255,10 @@ } Clustering.dbScan(MinPts); + + if (NumOpcodes.hasValue()) + Clustering.stabilize(NumOpcodes.getValue()); + return Clustering; } Index: tools/llvm-exegesis/llvm-exegesis.cpp =================================================================== --- tools/llvm-exegesis/llvm-exegesis.cpp +++ tools/llvm-exegesis/llvm-exegesis.cpp @@ -96,13 +96,21 @@ AnalysisInconsistenciesOutputFile("analysis-inconsistencies-output-file", cl::desc(""), cl::init("")); +static cl::opt AnalysisDisplayUnstableOpcodes( + "analysis-display-unstable-clusters", + cl::desc("if there is more than one benchmark for an opcode, said " + "benchmarks may end up not being clustered into the same cluster " + "if the measured performance characteristics are different. by " + "default all such opcodes are filtered out. this flag will " + "instead show only such unstable opcodes"), + cl::init(false)); + static cl::opt CpuName("mcpu", cl::desc( "cpu name to use for pfm counters, leave empty to autodetect"), cl::init("")); - static ExitOnError ExitOnErr; #ifdef LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET @@ -432,10 +440,14 @@ llvm::errs() << "unknown target '" << Points[0].LLVMTriple << "'\n"; return; } - const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create( - Points, AnalysisNumPoints, AnalysisEpsilon)); - const Analysis Analyzer(*TheTarget, Clustering); + std::unique_ptr InstrInfo(TheTarget->createMCInstrInfo()); + + const auto Clustering = ExitOnErr(InstructionBenchmarkClustering::create( + Points, AnalysisNumPoints, AnalysisEpsilon, InstrInfo->getNumOpcodes())); + + const Analysis Analyzer(*TheTarget, std::move(InstrInfo), Clustering, + AnalysisDisplayUnstableOpcodes); maybeRunAnalysis(Analyzer, "analysis clusters", AnalysisClustersOutputFile);