Index: llvm/trunk/tools/llvm-exegesis/lib/CMakeLists.txt =================================================================== --- llvm/trunk/tools/llvm-exegesis/lib/CMakeLists.txt +++ llvm/trunk/tools/llvm-exegesis/lib/CMakeLists.txt @@ -2,6 +2,7 @@ STATIC BenchmarkResult.cpp BenchmarkRunner.cpp + Clustering.cpp InMemoryAssembler.cpp InstructionSnippetGenerator.cpp Latency.cpp Index: llvm/trunk/tools/llvm-exegesis/lib/Clustering.h =================================================================== --- llvm/trunk/tools/llvm-exegesis/lib/Clustering.h +++ llvm/trunk/tools/llvm-exegesis/lib/Clustering.h @@ -0,0 +1,103 @@ +//===-- Clustering.h --------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Utilities to compute benchmark result clusters. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H +#define LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H + +#include "BenchmarkResult.h" +#include "llvm/Support/Error.h" +#include + +namespace exegesis { + +class InstructionBenchmarkClustering { +public: + // Clusters `Points` using DBSCAN with the given parameters. See the cc file + // for more explanations on the algorithm. + static llvm::Expected + create(const std::vector &Points, size_t MinPts, + double Epsilon); + + class ClusterId { + public: + static ClusterId noise() { return ClusterId(kNoise); } + static ClusterId error() { return ClusterId(kError); } + static ClusterId makeValid(int Id) { + assert(Id >= 0); + return ClusterId(Id); + } + ClusterId() : Id_(kUndef) {} + bool operator==(const ClusterId &O) const { return Id_ == O.Id_; } + + bool isValid() const { return Id_ >= 0; } + bool isUndef() const { return Id_ == kUndef; } + bool isNoise() const { return Id_ == kNoise; } + bool isError() const { return Id_ == kError; } + + // Precondition: isValid(). + size_t getId() const { + assert(isValid()); + return static_cast(Id_); + } + + private: + explicit ClusterId(int Id) : Id_(Id) {} + static constexpr const int kUndef = -1; + static constexpr const int kNoise = -2; + static constexpr const int kError = -3; + int Id_; + }; + + struct Cluster { + Cluster() = delete; + explicit Cluster(const ClusterId &Id) : Id(Id) {} + + const ClusterId Id; + // Indices of benchmarks within the cluster. + std::vector PointIndices; + }; + + ClusterId getClusterIdForPoint(size_t P) const { + return ClusterIdForPoint_[P]; + } + + const Cluster &getCluster(ClusterId Id) const { + assert(!Id.isUndef() && "unlabeled cluster"); + if (Id.isNoise()) { + return NoiseCluster_; + } + if (Id.isError()) { + return ErrorCluster_; + } + return Clusters_[Id.getId()]; + } + + const std::vector &getValidClusters() const { return Clusters_; } + +private: + InstructionBenchmarkClustering(); + llvm::Error validateAndSetup(const std::vector &Points); + void dbScan(const std::vector &Points, size_t MinPts, + double EpsilonSquared); + int NumDimensions_ = 0; + // ClusterForPoint_[P] is the cluster id for Points[P]. + std::vector ClusterIdForPoint_; + std::vector Clusters_; + Cluster NoiseCluster_; + Cluster ErrorCluster_; +}; + +} // namespace exegesis + +#endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H Index: llvm/trunk/tools/llvm-exegesis/lib/Clustering.cpp =================================================================== --- llvm/trunk/tools/llvm-exegesis/lib/Clustering.cpp +++ llvm/trunk/tools/llvm-exegesis/lib/Clustering.cpp @@ -0,0 +1,170 @@ +//===-- Clustering.cpp ------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Clustering.h" +#include +#include + +namespace exegesis { + +// The clustering problem has the following characteristics: +// (A) - Low dimension (dimensions are typically proc resource units, +// typically < 10). +// (B) - Number of points : ~thousands (points are measurements of an MCInst) +// (C) - Number of clusters: ~tens. +// (D) - The number of clusters is not known /a priory/. +// (E) - The amount of noise is relatively small. +// The problem is rather small. In terms of algorithms, (D) disqualifies +// k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. +// +// We've used DBSCAN here because it's simple to implement. This is a pretty +// straightforward and inefficient implementation of the pseudocode in [2]. +// +// [1] https://en.wikipedia.org/wiki/DBSCAN +// [2] https://en.wikipedia.org/wiki/OPTICS_algorithm + +namespace { + +// Finds the points at distance less than sqrt(EpsilonSquared) of Q (not +// including Q). +std::vector rangeQuery(const std::vector &Points, + const size_t Q, const double EpsilonSquared) { + std::vector Neighbors; + const auto &QMeasurements = Points[Q].Measurements; + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + if (P == Q) + continue; + const auto &PMeasurements = Points[P].Measurements; + if (PMeasurements.empty()) // Error point. + continue; + double DistanceSquared = 0; + for (size_t I = 0, E = QMeasurements.size(); I < E; ++I) { + const auto Diff = PMeasurements[I].Value - QMeasurements[I].Value; + DistanceSquared += Diff * Diff; + } + if (DistanceSquared <= EpsilonSquared) { + Neighbors.push_back(P); + } + } + return Neighbors; +} + +} // namespace + +InstructionBenchmarkClustering::InstructionBenchmarkClustering() + : NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} + +llvm::Error InstructionBenchmarkClustering::validateAndSetup( + const std::vector &Points) { + ClusterIdForPoint_.resize(Points.size()); + // Mark erroneous measurements out. + // All points must have the same number of dimensions, in the same order. + const std::vector *LastMeasurement = nullptr; + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + const auto &Point = Points[P]; + if (!Point.Error.empty()) { + ClusterIdForPoint_[P] = ClusterId::error(); + ErrorCluster_.PointIndices.push_back(P); + continue; + } + const auto *CurMeasurement = &Point.Measurements; + if (LastMeasurement) { + if (LastMeasurement->size() != CurMeasurement->size()) { + return llvm::make_error( + "inconsistent measurement dimensions", + llvm::inconvertibleErrorCode()); + } + for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { + if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) { + return llvm::make_error( + "inconsistent measurement dimensions keys", + llvm::inconvertibleErrorCode()); + } + } + } + LastMeasurement = CurMeasurement; + } + if (LastMeasurement) { + NumDimensions_ = LastMeasurement->size(); + } + return llvm::Error::success(); +} + +void InstructionBenchmarkClustering::dbScan( + const std::vector &Points, const size_t MinPts, + const double EpsilonSquared) { + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + if (!ClusterIdForPoint_[P].isUndef()) + continue; // Previously processed in inner loop. + const auto Neighbors = rangeQuery(Points, P, EpsilonSquared); + if (Neighbors.size() + 1 < MinPts) { // Density check. + // The region around P is not dense enough to create a new cluster, mark + // as noise for now. + ClusterIdForPoint_[P] = ClusterId::noise(); + continue; + } + + // Create a new cluster, add P. + Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size())); + Cluster &CurrentCluster = Clusters_.back(); + ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ + CurrentCluster.PointIndices.push_back(P); + + // Process P's neighbors. + std::unordered_set ToProcess(Neighbors.begin(), Neighbors.end()); + while (!ToProcess.empty()) { + // Retrieve a point from the set. + const size_t Q = *ToProcess.begin(); + ToProcess.erase(Q); + + if (ClusterIdForPoint_[Q].isNoise()) { + // Change noise point to border point. + ClusterIdForPoint_[Q] = CurrentCluster.Id; + CurrentCluster.PointIndices.push_back(Q); + continue; + } + if (!ClusterIdForPoint_[Q].isUndef()) { + continue; // Previously processed. + } + // Add Q to the current custer. + ClusterIdForPoint_[Q] = CurrentCluster.Id; + CurrentCluster.PointIndices.push_back(Q); + // And extend to the neighbors of Q if the region is dense enough. + const auto Neighbors = rangeQuery(Points, Q, EpsilonSquared); + if (Neighbors.size() + 1 >= MinPts) { + ToProcess.insert(Neighbors.begin(), Neighbors.end()); + } + } + } + + // Add noisy points to noise cluster. + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + if (ClusterIdForPoint_[P].isNoise()) { + NoiseCluster_.PointIndices.push_back(P); + } + } +} + +llvm::Expected +InstructionBenchmarkClustering::create( + const std::vector &Points, const size_t MinPts, + const double Epsilon) { + InstructionBenchmarkClustering Clustering; + if (auto Error = Clustering.validateAndSetup(Points)) { + return Error; + } + if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { + return Clustering; // Nothing to cluster. + } + + Clustering.dbScan(Points, MinPts, Epsilon * Epsilon); + return Clustering; +} + +} // namespace exegesis Index: llvm/trunk/unittests/tools/llvm-exegesis/CMakeLists.txt =================================================================== --- llvm/trunk/unittests/tools/llvm-exegesis/CMakeLists.txt +++ llvm/trunk/unittests/tools/llvm-exegesis/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_unittest(LLVMExegesisTests BenchmarkResultTest.cpp + ClusteringTest.cpp OperandGraphTest.cpp PerfHelperTest.cpp ) Index: llvm/trunk/unittests/tools/llvm-exegesis/ClusteringTest.cpp =================================================================== --- llvm/trunk/unittests/tools/llvm-exegesis/ClusteringTest.cpp +++ llvm/trunk/unittests/tools/llvm-exegesis/ClusteringTest.cpp @@ -0,0 +1,86 @@ +//===-- ClusteringTest.cpp --------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Clustering.h" +#include "BenchmarkResult.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace exegesis { + +namespace { + +using testing::Field; +using testing::UnorderedElementsAre; +using testing::UnorderedElementsAreArray; + +TEST(ClusteringTest, Clusters3D) { + std::vector Points(6); + + // Cluster around (x=0, y=1, z=2): points {0, 3}. + Points[0].Measurements = {{"x", 0.01, ""}, {"y", 1.02, ""}, {"z", 1.98, "A"}}; + Points[3].Measurements = {{"x", -0.01, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}}; + // Cluster around (x=1, y=1, z=2): points {1, 4}. + Points[1].Measurements = {{"x", 1.01, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}}; + Points[4].Measurements = {{"x", 0.99, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}}; + // Cluster around (x=0, y=0, z=0): points {5}, marked as noise. + Points[5].Measurements = {{"x", 0.0, ""}, {"y", 0.01, ""}, {"z", -0.02, ""}}; + // Error cluster: points {2} + Points[2].Error = "oops"; + + auto HasPoints = [](const std::vector &Indices) { + return Field(&InstructionBenchmarkClustering::Cluster::PointIndices, + UnorderedElementsAreArray(Indices)); + }; + + auto Clustering = InstructionBenchmarkClustering::create(Points, 2, 0.25); + ASSERT_TRUE((bool)Clustering); + EXPECT_THAT(Clustering.get().getValidClusters(), + UnorderedElementsAre(HasPoints({0, 3}), HasPoints({1, 4}))); + EXPECT_THAT(Clustering.get().getCluster( + InstructionBenchmarkClustering::ClusterId::noise()), + HasPoints({5})); + EXPECT_THAT(Clustering.get().getCluster( + InstructionBenchmarkClustering::ClusterId::error()), + HasPoints({2})); + + EXPECT_EQ(Clustering.get().getClusterIdForPoint(2), + InstructionBenchmarkClustering::ClusterId::error()); + EXPECT_EQ(Clustering.get().getClusterIdForPoint(5), + InstructionBenchmarkClustering::ClusterId::noise()); + EXPECT_EQ(Clustering.get().getClusterIdForPoint(0), + Clustering.get().getClusterIdForPoint(3)); + EXPECT_EQ(Clustering.get().getClusterIdForPoint(1), + Clustering.get().getClusterIdForPoint(4)); +} + +TEST(ClusteringTest, Clusters3D_InvalidSize) { + std::vector Points(6); + Points[0].Measurements = {{"x", 0.01, ""}, {"y", 1.02, ""}, {"z", 1.98, ""}}; + Points[1].Measurements = {{"y", 1.02, ""}, {"z", 1.98, ""}}; + auto Error = + InstructionBenchmarkClustering::create(Points, 2, 0.25).takeError(); + ASSERT_TRUE((bool)Error); + consumeError(std::move(Error)); +} + +TEST(ClusteringTest, Clusters3D_InvalidOrder) { + std::vector Points(6); + Points[0].Measurements = {{"x", 0.01, ""}, {"y", 1.02, ""}}; + Points[1].Measurements = {{"y", 1.02, ""}, {"x", 1.98, ""}}; + auto Error = + InstructionBenchmarkClustering::create(Points, 2, 0.25).takeError(); + ASSERT_TRUE((bool)Error); + consumeError(std::move(Error)); +} + +} // namespace +} // namespace exegesis