diff options
Diffstat (limited to 'llvm/tools/llvm-exegesis/lib/Clustering.cpp')
-rw-r--r-- | llvm/tools/llvm-exegesis/lib/Clustering.cpp | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/llvm/tools/llvm-exegesis/lib/Clustering.cpp b/llvm/tools/llvm-exegesis/lib/Clustering.cpp new file mode 100644 index 00000000000..c8646c7c399 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/Clustering.cpp @@ -0,0 +1,170 @@ +//===-- Clustering.cpp ------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Clustering.h" +#include <string> +#include <unordered_set> + +namespace exegesis { + +// The clustering problem has the following characteristics: +// (A) - Low dimension (dimensions are typically proc resource units, +// typically < 10). +// (B) - Number of points : ~thousands (points are measurements of an MCInst) +// (C) - Number of clusters: ~tens. +// (D) - The number of clusters is not known /a priory/. +// (E) - The amount of noise is relatively small. +// The problem is rather small. In terms of algorithms, (D) disqualifies +// k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. +// +// We've used DBSCAN here because it's simple to implement. This is a pretty +// straightforward and inefficient implementation of the pseudocode in [2]. +// +// [1] https://en.wikipedia.org/wiki/DBSCAN +// [2] https://en.wikipedia.org/wiki/OPTICS_algorithm + +namespace { + +// Finds the points at distance less than sqrt(EpsilonSquared) of Q (not +// including Q). +std::vector<size_t> rangeQuery(const std::vector<InstructionBenchmark> &Points, + const size_t Q, const double EpsilonSquared) { + std::vector<size_t> Neighbors; + const auto &QMeasurements = Points[Q].Measurements; + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + if (P == Q) + continue; + const auto &PMeasurements = Points[P].Measurements; + if (PMeasurements.empty()) // Error point. + continue; + double DistanceSquared = 0; + for (size_t I = 0, E = QMeasurements.size(); I < E; ++I) { + const auto Diff = PMeasurements[I].Value - QMeasurements[I].Value; + DistanceSquared += Diff * Diff; + } + if (DistanceSquared <= EpsilonSquared) { + Neighbors.push_back(P); + } + } + return Neighbors; +} + +} // namespace + +InstructionBenchmarkClustering::InstructionBenchmarkClustering() + : NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} + +llvm::Error InstructionBenchmarkClustering::validateAndSetup( + const std::vector<InstructionBenchmark> &Points) { + ClusterIdForPoint_.resize(Points.size()); + // Mark erroneous measurements out. + // All points must have the same number of dimensions, in the same order. + const std::vector<BenchmarkMeasure> *LastMeasurement = nullptr; + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + const auto &Point = Points[P]; + if (!Point.Error.empty()) { + ClusterIdForPoint_[P] = ClusterId::error(); + ErrorCluster_.PointIndices.push_back(P); + continue; + } + const auto *CurMeasurement = &Point.Measurements; + if (LastMeasurement) { + if (LastMeasurement->size() != CurMeasurement->size()) { + return llvm::make_error<llvm::StringError>( + "inconsistent measurement dimensions", + llvm::inconvertibleErrorCode()); + } + for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { + if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) { + return llvm::make_error<llvm::StringError>( + "inconsistent measurement dimensions keys", + llvm::inconvertibleErrorCode()); + } + } + } + LastMeasurement = CurMeasurement; + } + if (LastMeasurement) { + NumDimensions_ = LastMeasurement->size(); + } + return llvm::Error::success(); +} + +void InstructionBenchmarkClustering::dbScan( + const std::vector<InstructionBenchmark> &Points, const size_t MinPts, + const double EpsilonSquared) { + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + if (!ClusterIdForPoint_[P].isUndef()) + continue; // Previously processed in inner loop. + const auto Neighbors = rangeQuery(Points, P, EpsilonSquared); + if (Neighbors.size() + 1 < MinPts) { // Density check. + // The region around P is not dense enough to create a new cluster, mark + // as noise for now. + ClusterIdForPoint_[P] = ClusterId::noise(); + continue; + } + + // Create a new cluster, add P. + Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size())); + Cluster &CurrentCluster = Clusters_.back(); + ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ + CurrentCluster.PointIndices.push_back(P); + + // Process P's neighbors. + std::unordered_set<size_t> ToProcess(Neighbors.begin(), Neighbors.end()); + while (!ToProcess.empty()) { + // Retrieve a point from the set. + const size_t Q = *ToProcess.begin(); + ToProcess.erase(Q); + + if (ClusterIdForPoint_[Q].isNoise()) { + // Change noise point to border point. + ClusterIdForPoint_[Q] = CurrentCluster.Id; + CurrentCluster.PointIndices.push_back(Q); + continue; + } + if (!ClusterIdForPoint_[Q].isUndef()) { + continue; // Previously processed. + } + // Add Q to the current custer. + ClusterIdForPoint_[Q] = CurrentCluster.Id; + CurrentCluster.PointIndices.push_back(Q); + // And extend to the neighbors of Q if the region is dense enough. + const auto Neighbors = rangeQuery(Points, Q, EpsilonSquared); + if (Neighbors.size() + 1 >= MinPts) { + ToProcess.insert(Neighbors.begin(), Neighbors.end()); + } + } + } + + // Add noisy points to noise cluster. + for (size_t P = 0, NumPoints = Points.size(); P < NumPoints; ++P) { + if (ClusterIdForPoint_[P].isNoise()) { + NoiseCluster_.PointIndices.push_back(P); + } + } +} + +llvm::Expected<InstructionBenchmarkClustering> +InstructionBenchmarkClustering::create( + const std::vector<InstructionBenchmark> &Points, const size_t MinPts, + const double Epsilon) { + InstructionBenchmarkClustering Clustering; + if (auto Error = Clustering.validateAndSetup(Points)) { + return std::move(Error); + } + if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { + return Clustering; // Nothing to cluster. + } + + Clustering.dbScan(Points, MinPts, Epsilon * Epsilon); + return Clustering; +} + +} // namespace exegesis |