This is an archive of the discontinued LLVM Phabricator instance.

Differential D51689

[clangd] Dense posting lists proof-of-concept
Needs ReviewPublic

Authored by sammccall on Sep 5 2018, 9:31 AM.

Download Raw Diff

This revision needs review, but all reviewers have resigned.

Details

Reviewers

kbobyrev

Summary

This uses a bitmap representation instead of a list if the density of
the list is high enough (at least 1 in 32, which is the breakeven point
sizewise).

Experimenting with the LLVM index, this saves about 3% of total posting
list size, which isn't worth the complexity.

However it should also improve iterator performance somewhat:

advance is within a constant factor (find next set bit, average step is bounded)
advanceTo is constant time instead of log(n) with random accesses

If the posting lists that are dense are also commonly used in queries
(seems likely for common trigrams) then this may be worth doing for
latency reasons.
I'm uploading this so Kirill can experiment with benchmarks.

Diff Detail

Repository

rCTE Clang Tools Extra

Build Status

Buildable 22310
Build 22310: arc lint + arc unit

Event Timeline

sammccall created this revision.Sep 5 2018, 9:31 AM

Herald added subscribers: cfe-commits, kadircet, arphaman and 4 others. · View Herald TranscriptSep 5 2018, 9:31 AM

Harbormaster completed remote builds in B22271: Diff 164056.Sep 5 2018, 9:31 AM

[Tooling] JSONCompilationDatabasePlugin infers compile commands for missing files

See the existing InterpolatingCompilationDatabase for details on how this works.
We've been using this in clangd for a while, the heuristics seem to work well.

Harbormaster completed remote builds in B22310: Diff 164191.Sep 6 2018, 5:34 AM

Uh, please ignore the last comment, arc/I got confused.

For anyone interested in the direction of posting list compression, an implementation of Variable length Byte compression (VByte) has landed: D52300.

Revision Contents

Path

Size

clangd/

index/

dex/

DexIndex.cpp

16 lines

Iterator.h

47 lines

Iterator.cpp

132 lines

Diff 164191

clangd/index/dex/DexIndex.cpp

Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	void DexIndex::buildIndex() {
// Symbols are sorted by symbol qualities so that items in the posting lists		// Symbols are sorted by symbol qualities so that items in the posting lists
// are stored in the descending order of symbol quality.		// are stored in the descending order of symbol quality.
std::sort(begin(Symbols), end(Symbols),		std::sort(begin(Symbols), end(Symbols),
[&](const Symbol LHS, const Symbol RHS) {		[&](const Symbol LHS, const Symbol RHS) {
return SymbolQuality[LHS] > SymbolQuality[RHS];		return SymbolQuality[LHS] > SymbolQuality[RHS];
});		});

// Populate TempInvertedIndex with posting lists for index symbols.		// Populate TempInvertedIndex with posting lists for index symbols.
		llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {		for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
const auto *Sym = Symbols[SymbolRank];		const auto *Sym = Symbols[SymbolRank];
for (const auto &Token : generateSearchTokens(*Sym))		for (const auto &Token : generateSearchTokens(*Sym))
InvertedIndex[Token].push_back(SymbolRank);		TempInvertedIndex[Token].push_back(SymbolRank);
}		}
		InvertedIndex.reserve(InvertedIndex.size());
		for (auto &Pair : TempInvertedIndex)
		InvertedIndex.try_emplace(Pair.first, std::move(Pair.second));

vlog("Built DexIndex with estimated memory usage {0} bytes.",		vlog("Built DexIndex with estimated memory usage {0} bytes.",
estimateMemoryUsage());		estimateMemoryUsage());
}		}

/// Constructs iterators over tokens extracted from the query and exhausts it		/// Constructs iterators over tokens extracted from the query and exhausts it
/// while applying Callback to each symbol in the order of decreasing quality		/// while applying Callback to each symbol in the order of decreasing quality
/// of the matched symbols.		/// of the matched symbols.
Show All 9 Lines	bool DexIndex::fuzzyFind(
const auto TrigramTokens = generateIdentifierTrigrams(Req.Query);		const auto TrigramTokens = generateIdentifierTrigrams(Req.Query);

// Generate query trigrams and construct AND iterator over all query		// Generate query trigrams and construct AND iterator over all query
// trigrams.		// trigrams.
std::vector<std::unique_ptr<Iterator>> TrigramIterators;		std::vector<std::unique_ptr<Iterator>> TrigramIterators;
for (const auto &Trigram : TrigramTokens) {		for (const auto &Trigram : TrigramTokens) {
const auto It = InvertedIndex.find(Trigram);		const auto It = InvertedIndex.find(Trigram);
if (It != InvertedIndex.end())		if (It != InvertedIndex.end())
TrigramIterators.push_back(create(It->second));		TrigramIterators.push_back(It->second.iterator());
}		}
if (!TrigramIterators.empty())		if (!TrigramIterators.empty())
TopLevelChildren.push_back(createAnd(move(TrigramIterators)));		TopLevelChildren.push_back(createAnd(move(TrigramIterators)));

// Generate scope tokens for search query.		// Generate scope tokens for search query.
std::vector<std::unique_ptr<Iterator>> ScopeIterators;		std::vector<std::unique_ptr<Iterator>> ScopeIterators;
for (const auto &Scope : Req.Scopes) {		for (const auto &Scope : Req.Scopes) {
const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope));		const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope));
if (It != InvertedIndex.end())		if (It != InvertedIndex.end())
ScopeIterators.push_back(create(It->second));		ScopeIterators.push_back(It->second.iterator());
}		}
// Add OR iterator for scopes if there are any Scope Iterators.		// Add OR iterator for scopes if there are any Scope Iterators.
if (!ScopeIterators.empty())		if (!ScopeIterators.empty())
TopLevelChildren.push_back(createOr(move(ScopeIterators)));		TopLevelChildren.push_back(createOr(move(ScopeIterators)));

// Use TRUE iterator if both trigrams and scopes from the query are not		// Use TRUE iterator if both trigrams and scopes from the query are not
// present in the symbol index.		// present in the symbol index.
auto QueryIterator = TopLevelChildren.empty()		auto QueryIterator = TopLevelChildren.empty()
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	void DexIndex::refs(const RefsRequest &Req,
log("refs is not implemented.");		log("refs is not implemented.");
}		}

size_t DexIndex::estimateMemoryUsage() const {		size_t DexIndex::estimateMemoryUsage() const {
size_t Bytes =		size_t Bytes =
LookupTable.size() * sizeof(std::pair<SymbolID, const Symbol *>);		LookupTable.size() * sizeof(std::pair<SymbolID, const Symbol *>);
Bytes += SymbolQuality.size() * sizeof(std::pair<const Symbol *, float>);		Bytes += SymbolQuality.size() * sizeof(std::pair<const Symbol *, float>);
Bytes += InvertedIndex.size() * sizeof(Token);		Bytes += InvertedIndex.size() * sizeof(Token);
		for (const auto &P : InvertedIndex)
for (const auto &P : InvertedIndex) {		Bytes += P.second.bytes();
Bytes += P.second.size() * sizeof(DocID);
}
return Bytes;		return Bytes;
}		}

} // namespace dex		} // namespace dex
} // namespace clangd		} // namespace clangd
} // namespace clang		} // namespace clang

clangd/index/dex/Iterator.h

	Show All 26 Lines
	// computationally effective if search request is not very restrictive).			// computationally effective if search request is not very restrictive).
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H			#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
	#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H			#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H

	#include "llvm/ADT/ArrayRef.h"			#include "llvm/ADT/ArrayRef.h"
				#include "llvm/ADT/BitVector.h"
	#include "llvm/Support/raw_ostream.h"			#include "llvm/Support/raw_ostream.h"
	#include <algorithm>			#include <algorithm>
	#include <memory>			#include <memory>
	#include <vector>			#include <vector>

	namespace clang {			namespace clang {
	namespace clangd {			namespace clangd {
	namespace dex {			namespace dex {

	/// Symbol position in the list of all index symbols sorted by a pre-computed			/// Symbol position in the list of all index symbols sorted by a pre-computed
	/// symbol quality.			/// symbol quality.
	using DocID = uint32_t;			using DocID = uint32_t;
	/// Contains sorted sequence of DocIDs all of which belong to symbols matching
	/// certain criteria, i.e. containing a Search Token. PostingLists are values
	/// for the inverted index.
	// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are
	// likely to be very dense and hence require special attention so that the index
	// doesn't use too much memory. Possible solution would be to construct
	// compressed posting lists which consist of ranges of DocIDs instead of
	// distinct DocIDs. A special case would be the empty query: for that case
	// TrueIterator should be implemented - an iterator which doesn't actually store
	// any PostingList within itself, but "contains" all DocIDs in range
	// [0, IndexSize).
	using PostingList = std::vector<DocID>;
	/// Immutable reference to PostingList object.
	using PostingListRef = llvm::ArrayRef<DocID>;

	/// Iterator is the interface for Query Tree node. The simplest type of Iterator			/// Iterator is the interface for Query Tree node. The simplest type of Iterator
	/// is DocumentIterator which is simply a wrapper around PostingList iterator			/// is DocumentIterator which is simply a wrapper around PostingList iterator
	/// and serves as the Query Tree leaf. More sophisticated examples of iterators			/// and serves as the Query Tree leaf. More sophisticated examples of iterators
	/// can manage intersection, union of the elements produced by other iterators			/// can manage intersection, union of the elements produced by other iterators
	/// (their children) to form a multi-level Query Tree. The interface is designed			/// (their children) to form a multi-level Query Tree. The interface is designed
	/// to be extensible in order to support multiple types of iterators.			/// to be extensible in order to support multiple types of iterators.
	class Iterator {			class Iterator {
	▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines
	///			///
	/// Boosting can be seen as a compromise between retrieving too many items and			/// Boosting can be seen as a compromise between retrieving too many items and
	/// calculating the final score for each of them (which might be very expensive)			/// calculating the final score for each of them (which might be very expensive)
	/// and not retrieving enough items so that items with very high final score			/// and not retrieving enough items so that items with very high final score
	/// would not be processed. Boosting score is a computationally efficient way			/// would not be processed. Boosting score is a computationally efficient way
	/// to acquire preliminary scores of requested items.			/// to acquire preliminary scores of requested items.
	std::vector<std::pair<DocID, float>> consume(Iterator &It);			std::vector<std::pair<DocID, float>> consume(Iterator &It);

	/// Returns a document iterator over given PostingList.
	///
	/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item.
	std::unique_ptr<Iterator> create(PostingListRef Documents);

	/// Returns AND Iterator which performs the intersection of the PostingLists of			/// Returns AND Iterator which performs the intersection of the PostingLists of
	/// its children.			/// its children.
	///			///
	/// consume(): AND Iterator returns the product of Childrens' boosting scores			/// consume(): AND Iterator returns the product of Childrens' boosting scores
	/// when not exhausted and DEFAULT_BOOST_SCORE otherwise.			/// when not exhausted and DEFAULT_BOOST_SCORE otherwise.
	std::unique_ptr<Iterator>			std::unique_ptr<Iterator>
	createAnd(std::vector<std::unique_ptr<Iterator>> Children);			createAnd(std::vector<std::unique_ptr<Iterator>> Children);

	▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines
	}			}

	template <typename HeadT>			template <typename HeadT>
	void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,			void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,
	HeadT &Head) {			HeadT &Head) {
	Children.push_back(move(Head));			Children.push_back(move(Head));
	}			}

				/// Contains sorted sequence of DocIDs all of which belong to symbols matching
				/// certain criteria, i.e. containing a Search Token. PostingLists are values
				/// for the inverted index.
				///
				/// The documents are held in one of two forms, chosen at construction time:
				/// a plain vector of DocIDs (Sparse) or a bitmap with one bit per possible
				/// DocID (Dense). A default-constructed or moved-from list is Null and holds
				/// neither.
				///
				/// The class is move-only: declaring the move operations suppresses the
				/// implicitly generated copy operations.
				class PostingList {
				public:
				/// Creates a Null list that holds no documents.
				PostingList() : Representation(Null) {}
				/// Takes ownership of Docs and stores it in whichever form is smaller.
				/// NOTE(review): the implementation uses Docs.back() as the largest DocID, so
				/// Docs is presumably expected to be sorted ascending - confirm with callers.
				PostingList(std::vector<DocID> Docs);
				/// Returns a document iterator over given PostingList.
				/// The iterator refers to storage owned by this list, so the list has to
				/// outlive it. Calling this on a Null list is a programming error (asserts).
				std::unique_ptr<Iterator> iterator() const;

				/// Move operations transfer the stored documents and leave the source Null.
				PostingList(PostingList&&);
				PostingList &operator=(PostingList&&);
				~PostingList();

				/// Approximate number of bytes used by the stored documents (0 for Null).
				size_t bytes() const;

				private:
				// Tag that records which member of the union below is currently alive.
				enum Rep { Null, Dense, Sparse } Representation;
				// Manually managed storage: the active member is created with placement new
				// by the constructors and has to be destroyed explicitly.
				union {
				struct {
				// Bit D is set iff document D is in the list.
				llvm::BitVector Bitmap;
				// Number of documents in the list (reported by the iterator's estimateSize()).
				size_t Count;
				} DenseRep;
				std::vector<DocID> SparseRep;
				};
				};

	} // namespace dex			} // namespace dex
	} // namespace clangd			} // namespace clangd
	} // namespace clang			} // namespace clang

	#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H			#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H

clangd/index/dex/Iterator.cpp

Show All 12 Lines
#include <numeric>		#include <numeric>

namespace clang {		namespace clang {
namespace clangd {		namespace clangd {
namespace dex {		namespace dex {

namespace {		namespace {

/// Implements Iterator over a PostingList. DocumentIterator is the most basic		/// Implements Iterator over a sparse PostingList.
/// iterator: it doesn't have any children (hence it is the leaf of iterator		/// This is a leaf iterator which simply wraps a list of DocIDs.
/// tree) and is simply a wrapper around PostingList::const_iterator.		class SparseIterator : public Iterator {
class DocumentIterator : public Iterator {
public:		public:
DocumentIterator(PostingListRef Documents)		SparseIterator(llvm::ArrayRef<DocID> Documents)
: Documents(Documents), Index(std::begin(Documents)) {}		: Documents(Documents), Index(std::begin(Documents)) {}

bool reachedEnd() const override { return Index == std::end(Documents); }		bool reachedEnd() const override { return Index == std::end(Documents); }

/// Advances cursor to the next item.		/// Advances cursor to the next item.
void advance() override {		void advance() override {
assert(!reachedEnd() && "DOCUMENT iterator can't advance() at the end.");		assert(!reachedEnd() && "DOCUMENT iterator can't advance() at the end.");
++Index;		++Index;
Show All 34 Lines	llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
if (Index == std::end(Documents))		if (Index == std::end(Documents))
OS << "{END}";		OS << "{END}";
else		else
OS << "END";		OS << "END";
OS << ']';		OS << ']';
return OS;		return OS;
}		}

PostingListRef Documents;		llvm::ArrayRef<DocID> Documents;
PostingListRef::const_iterator Index;		llvm::ArrayRef<DocID>::iterator Index;
		};

		/// Implements Iterator over a dense PostingList.
		/// This is a leaf iterator over a BitVector with one bit per possible DocID.
		class DenseIterator : public Iterator {
		public:
		/// Creates an iterator over the set bits of Bits, positioned on the first set
		/// bit (or already at the end when no bit is set).
		/// Only a reference to Bits is stored, so the bitmap has to stay alive for as
		/// long as the iterator is used. Count is the value reported by
		/// estimateSize().
		DenseIterator(const llvm::BitVector &Bits, size_t Count)
		: Bits(Bits), Index(Bits.find_first()), Count(Count) {}

		/// Returns true once no set bit is left; -1 is the end marker kept in Index.
		bool reachedEnd() const override { return Index == -1; }

		/// Moves the cursor to the next set bit after the current one. When no further
		/// bit is set the cursor becomes -1, i.e. the iterator reaches its end.
		void advance() override {
		  assert(!reachedEnd() && "DENSE iterator can't advance() at the end.");
		  const int NextSetBit = Bits.find_next(Index);
		  Index = NextSetBit;
		}

		/// Positions the cursor on the first document whose DocID is equal to or
		/// higher than the given one, or at the end if there is no such document.
		///
		/// BitVector::find_next(Prev) only looks at bits strictly after Prev, so the
		/// bit at ID itself has to be tested separately; otherwise a document with
		/// exactly this ID would be skipped. IDs beyond the bitmap exhaust the
		/// iterator instead of being passed to find_next() out of range.
		void advanceTo(DocID ID) override {
		  assert(!reachedEnd() && "DENSE iterator can't advance() at the end.");
		  if (ID >= Bits.size())
		    Index = -1;
		  else if (Bits.test(ID))
		    Index = static_cast<int>(ID);
		  else
		    Index = Bits.find_next(ID);
		}

		/// Returns the DocID under the cursor, which is the position of the current
		/// set bit. Must not be called once the iterator has reached its end.
		DocID peek() const override {
		  assert(!reachedEnd() && "DENSE iterator can't peek() at the end.");
		  return static_cast<DocID>(Index);
		}

		/// Returns the boosting score of the current document, which is always
		/// DEFAULT_BOOST_SCORE for this iterator. Must not be called at the end.
		float consume() override {
		assert(!reachedEnd() && "DENSE iterator can't consume() at the end.");
		return DEFAULT_BOOST_SCORE;
		}

		/// Returns the Count given at construction; PostingList passes the number of
		/// documents stored in the list.
		size_t estimateSize() const override { return Count; }

		private:
		/// Writes a fixed marker for this iterator to OS and returns the stream; the
		/// contents of the bitmap are not printed.
		llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
		  OS << "(dense)\n";
		  return OS;
		}

		const llvm::BitVector &Bits;
		int Index; // Invariant: Index == -1 \|\| Bits[Index]
		size_t Count;
};		};

/// Implements Iterator over the intersection of other iterators.		/// Implements Iterator over the intersection of other iterators.
///		///
/// AndIterator iterates through common items among all children. It becomes		/// AndIterator iterates through common items among all children. It becomes
/// exhausted as soon as any child becomes exhausted. After each mutation, the		/// exhausted as soon as any child becomes exhausted. After each mutation, the
/// iterator restores the invariant: all children must point to the same item.		/// iterator restores the invariant: all children must point to the same item.
class AndIterator : public Iterator {		class AndIterator : public Iterator {
▲ Show 20 Lines • Show All 304 Lines • ▼ Show 20 Lines

std::vector<std::pair<DocID, float>> consume(Iterator &It) {		std::vector<std::pair<DocID, float>> consume(Iterator &It) {
std::vector<std::pair<DocID, float>> Result;		std::vector<std::pair<DocID, float>> Result;
for (; !It.reachedEnd(); It.advance())		for (; !It.reachedEnd(); It.advance())
Result.emplace_back(It.peek(), It.consume());		Result.emplace_back(It.peek(), It.consume());
return Result;		return Result;
}		}

std::unique_ptr<Iterator> create(PostingListRef Documents) {
return llvm::make_unique<DocumentIterator>(Documents);
}

std::unique_ptr<Iterator>		std::unique_ptr<Iterator>
createAnd(std::vector<std::unique_ptr<Iterator>> Children) {		createAnd(std::vector<std::unique_ptr<Iterator>> Children) {
return llvm::make_unique<AndIterator>(move(Children));		return llvm::make_unique<AndIterator>(move(Children));
}		}

std::unique_ptr<Iterator>		std::unique_ptr<Iterator>
createOr(std::vector<std::unique_ptr<Iterator>> Children) {		createOr(std::vector<std::unique_ptr<Iterator>> Children) {
return llvm::make_unique<OrIterator>(move(Children));		return llvm::make_unique<OrIterator>(move(Children));
}		}

std::unique_ptr<Iterator> createTrue(DocID Size) {		std::unique_ptr<Iterator> createTrue(DocID Size) {
return llvm::make_unique<TrueIterator>(Size);		return llvm::make_unique<TrueIterator>(Size);
}		}

std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,		std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
float Factor) {		float Factor) {
return llvm::make_unique<BoostIterator>(move(Child), Factor);		return llvm::make_unique<BoostIterator>(move(Child), Factor);
}		}

std::unique_ptr<Iterator> createLimit(std::unique_ptr<Iterator> Child,		std::unique_ptr<Iterator> createLimit(std::unique_ptr<Iterator> Child,
size_t Limit) {		size_t Limit) {
return llvm::make_unique<LimitIterator>(move(Child), Limit);		return llvm::make_unique<LimitIterator>(move(Child), Limit);
}		}

		/// Returns a leaf iterator over the documents of this posting list.
		///
		/// The iterator type follows the active representation. The iterator refers to
		/// storage owned by this list. Asking a Null list for an iterator is a
		/// programming error: it asserts, and in builds without assertions yields an
		/// iterator over an empty list.
		std::unique_ptr<Iterator> PostingList::iterator() const {
		  switch (Representation) {
		  case Dense:
		    return llvm::make_unique<DenseIterator>(DenseRep.Bitmap, DenseRep.Count);
		  case Sparse:
		    return llvm::make_unique<SparseIterator>(SparseRep);
		  case Null:
		    break;
		  }
		  // Reached for Null (and for any unexpected tag value), so that control never
		  // flows off the end of this non-void function.
		  assert(false && "iterator() on null posting list");
		  return llvm::make_unique<SparseIterator>(llvm::ArrayRef<DocID>{});
		}

		/// Builds a posting list from Docs, picking the smaller representation.
		///
		/// A bitmap covering DocIDs up to Docs.back() needs about
		/// Docs.back() / CHAR_BIT bytes, while the plain list needs sizeof(DocID) bytes
		/// per document. The bitmap is used unless the list is empty or strictly
		/// smaller than the bitmap would be.
		/// NOTE(review): Docs.back() is treated as the largest DocID, so Docs is
		/// presumably sorted ascending - confirm with callers.
		PostingList::PostingList(std::vector<DocID> Docs) {
		  const bool UseBitmap =
		      !Docs.empty() && sizeof(DocID) * Docs.size() >= Docs.back() / CHAR_BIT;
		  if (!UseBitmap) {
		    Representation = Sparse;
		    new (&SparseRep) decltype(SparseRep)(std::move(Docs));
		    return;
		  }
		  Representation = Dense;
		  new (&DenseRep) decltype(DenseRep);
		  DenseRep.Count = Docs.size();
		  // One bit per DocID in [0, Docs.back()].
		  DenseRep.Bitmap.resize(Docs.back() + 1);
		  for (DocID Doc : Docs)
		    DenseRep.Bitmap.set(Doc);
		}

		/// Destroys whichever union member is currently active.
		///
		/// The members live inside this object's own storage (the constructors create
		/// them with placement new), so they are torn down with explicit destructor
		/// calls. Using `delete` on them would also hand their address to
		/// operator delete, although that memory was never obtained from `new`.
		PostingList::~PostingList() {
		  switch (Representation) {
		  case Sparse: {
		    using SparseT = decltype(SparseRep);
		    SparseRep.~SparseT();
		    break;
		  }
		  case Dense: {
		    // The dense struct has no name, so the destructor is reached via an alias.
		    using DenseT = decltype(DenseRep);
		    DenseRep.~DenseT();
		    break;
		  }
		  case Null:
		    break;
		  }
		}

		/// Move-constructs from Other, leaving Other as a Null list.
		///
		/// The active member of Other is moved into the matching member of this
		/// object. Because Other is marked Null afterwards, its destructor will no
		/// longer touch that member, so the moved-from member is destroyed here.
		PostingList::PostingList(PostingList &&Other) {
		  Representation = Other.Representation;
		  switch (Representation) {
		  case Sparse: {
		    using SparseT = decltype(SparseRep);
		    new (&SparseRep) SparseT(std::move(Other.SparseRep));
		    Other.SparseRep.~SparseT();
		    break;
		  }
		  case Dense: {
		    using DenseT = decltype(DenseRep);
		    new (&DenseRep) DenseT(std::move(Other.DenseRep));
		    Other.DenseRep.~DenseT();
		    break;
		  }
		  case Null:
		    break;
		  }
		  Other.Representation = Null;
		}

		/// Move-assigns from Other by destroying the current contents and
		/// move-constructing in place; Other is left as a Null list.
		///
		/// Self-assignment is a no-op: without the guard the object would be destroyed
		/// and then move-constructed from its own destroyed state.
		PostingList &PostingList::operator=(PostingList &&Other) {
		  if (this != &Other) {
		    this->~PostingList();
		    new (this) PostingList(std::move(Other));
		  }
		  return *this;
		}

		/// Returns the approximate number of bytes used to store the documents.
		///
		/// Sparse lists count sizeof(DocID) per stored document, dense lists report
		/// what BitVector::getMemorySize() returns, and a Null list uses 0 bytes.
		size_t PostingList::bytes() const {
		  switch (Representation) {
		  case Sparse:
		    return SparseRep.size() * sizeof(DocID);
		  case Dense:
		    return DenseRep.Bitmap.getMemorySize();
		  case Null:
		    break;
		  }
		  // Null (or an unexpected tag value): nothing is stored. Returning here keeps
		  // control from flowing off the end of this non-void function.
		  return 0;
		}

} // namespace dex		} // namespace dex
} // namespace clangd		} // namespace clangd
} // namespace clang		} // namespace clang