diff --git a/clang-tools-extra/clang-tidy/GlobList.h b/clang-tools-extra/clang-tidy/GlobList.h --- a/clang-tools-extra/clang-tidy/GlobList.h +++ b/clang-tools-extra/clang-tidy/GlobList.h @@ -10,10 +10,10 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GLOBLIST_H #include "clang/Basic/LLVM.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Regex.h" +#include +#include namespace clang { namespace tidy { @@ -25,7 +25,9 @@ /// them in the order of appearance in the list. class GlobList { public: - virtual ~GlobList() = default; + virtual ~GlobList(); + + GlobList(const GlobList &) = delete; /// \p Globs is a comma-separated list of globs (only the '*' metacharacter is /// supported) with an optional '-' prefix to denote exclusion. @@ -41,12 +43,33 @@ /// matching glob's Positive flag. virtual bool contains(StringRef S) const; + bool isEquivalentTo(const GlobList &Other); + private: - struct GlobListItem { - bool IsPositive; - llvm::Regex Regex; + static constexpr size_t ItemBits = 15; + // Pretty arbitrary choice here. + static constexpr size_t NumInlineElements = 24; + struct GlobStartInfo { + uint16_t Items : ItemBits; + uint16_t IsPositive : 1; + }; + + union DataItem { + GlobStartInfo StartInfo; + uint16_t Size; + }; + + /// Tirst item we encounter is a \c GlobStartInfo object. The next \c + /// StartInfo.Items items in here will be a raw uint16_t. After that, if there + /// are more items, the next will be another \c GlobStartInfo object. + std::vector Data; + union { + char Inline[NumInlineElements]; + char *Ptr; }; - SmallVector Items; + size_t RawStringSize : sizeof(size_t) * CHAR_BIT - 1; + // The fallback value if we don't match anything. + size_t UnmatchedResult : 1; }; /// A \p GlobList that caches search results, so that search is performed only diff --git a/clang-tools-extra/clang-tidy/GlobList.cpp b/clang-tools-extra/clang-tidy/GlobList.cpp --- a/clang-tools-extra/clang-tidy/GlobList.cpp +++ b/clang-tools-extra/clang-tidy/GlobList.cpp @@ -7,61 +7,174 @@ //===----------------------------------------------------------------------===// #include "GlobList.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include namespace clang { namespace tidy { -// Returns true if GlobList starts with the negative indicator ('-'), removes it -// from the GlobList. -static bool consumeNegativeIndicator(StringRef &GlobList) { - GlobList = GlobList.trim(); - if (GlobList.startswith("-")) { - GlobList = GlobList.substr(1); - return true; +static size_t getMaxAllocSize(StringRef Str) { + size_t Count = 0; + for (char C : Str) { + switch (C) { + case '\n': // Seperator character. + case ',': // Seperator character. + case '*': // Wildcards aren't stored. + continue; + default: + ++Count; + } } - return false; + return Count; } -// Converts first glob from the comma-separated list of globs to Regex and -// removes it and the trailing comma from the GlobList. -static llvm::Regex consumeGlob(StringRef &GlobList) { - StringRef UntrimmedGlob = GlobList.substr(0, GlobList.find_first_of(",\n")); - StringRef Glob = UntrimmedGlob.trim(); - GlobList = GlobList.substr(UntrimmedGlob.size() + 1); - SmallString<128> RegexText("^"); - StringRef MetaChars("()^$|*+?.[]\\{}"); - for (char C : Glob) { - if (C == '*') - RegexText.push_back('.'); - else if (MetaChars.contains(C)) - RegexText.push_back('\\'); - RegexText.push_back(C); +GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */) + : RawStringSize(0), UnmatchedResult(false) { + // FIXME: trimming the newline character here is suspicious as its recognised + // as a seperator. + Globs = Globs.trim(" \t\r\n\f\v"); + + // Upperbound for the allocated size. + size_t MaxAllocSize = getMaxAllocSize(Globs); + char *Storage = Inline; + + while (true) { + GlobStartInfo S; + size_t GlobStart = Globs.find_last_of(",\n") + 1; + StringRef Trimmed = Globs.substr(GlobStart).ltrim(" \t\v\f\r"); + S.IsPositive = !Trimmed.consume_front("-"); + if (S.IsPositive || KeepNegativeGlobs) { + // The match-all glob means anything before it won't ever be read. + + if (Trimmed == "*") { + // Don't actually stored this glob, We'll stop checking once we reach + // this item, so instead, just track whether it was a match or not. + UnmatchedResult = S.IsPositive; + return; + } + auto StartIndex = Data.size(); + Data.emplace_back(); + if (!Trimmed.empty()) { + // FIXME: This wouldn't support escaping the wildcard (\*), but that's + // not a known usecase for GlobList in clang-tidy. + for (auto Item : llvm::split(Trimmed, "*")) { + if (Item.empty()) { + // Eat ** in a glob. + if (Data.size() - StartIndex > 1 && Data.back().Size == 0) + continue; + } else { + assert(Item.size() < (1 << ItemBits) && "Index out of bounds"); + // Lazily allocate out of line storage if needed. + if (RawStringSize <= NumInlineElements && + RawStringSize + Item.size() > NumInlineElements) { + char *NewPtr = new char[MaxAllocSize]; + ::memcpy(NewPtr, Inline, RawStringSize); + Ptr = NewPtr; + Storage = Ptr; + } + assert(RawStringSize + Item.size() <= MaxAllocSize); + llvm::copy(Item, &Storage[RawStringSize]); + RawStringSize += Item.size(); + } + Data.emplace_back(); + Data.back().Size = Item.size(); + } + } + auto ItemCount = Data.size() - StartIndex - 1; + assert(ItemCount < (1 << ItemBits) && "Too many items"); + S.Items = ItemCount; + // The starting node goes at the end of the items, because we traverse the + // glob list in reverse order. + Data[StartIndex].StartInfo = S; + } + if (GlobStart == 0) + return; + // Bit of a hack this, Basically let ',\n' not be parsed as an empty string. + // This behaviour needs defining. + auto NextOffset = Globs.find_last_not_of( + Globs[GlobStart - 1] == ',' ? " \t\r\n\f\v" : " \t\r\n\f\v,", + GlobStart - 1); + if (NextOffset == StringRef::npos) + return; + Globs = Globs.substr(0, NextOffset + 1); } - RegexText.push_back('$'); - return llvm::Regex(RegexText); } -GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */) { - Items.reserve(Globs.count(',') + Globs.count('\n') + 1); - do { - GlobListItem Item; - Item.IsPositive = !consumeNegativeIndicator(Globs); - Item.Regex = consumeGlob(Globs); - if (Item.IsPositive || KeepNegativeGlobs) - Items.push_back(std::move(Item)); - } while (!Globs.empty()); +GlobList::~GlobList() { + if (RawStringSize > NumInlineElements) + delete[] Ptr; +} + +static bool matchGlobList(StringRef Source, llvm::ArrayRef Offsets, + StringRef Match) { + assert(!Offsets.empty()); + // If we only had one item originally, then we are working with an exact match + // with no wild cards. + if (Offsets.size() == 1) { + assert(Source.size() == Offsets.front()); + return Match == Source; + } + + if (!Match.consume_front(Source.take_front(Offsets.front()))) + return false; + Source = Source.drop_front(Offsets.front()); + + for (auto Offset : Offsets.drop_front().drop_back()) { + auto SearchItem = Source.take_front(Offset); + Source = Source.drop_front(Offset); + auto Idx = Match.find(SearchItem); + if (Idx == StringRef::npos) + return false; + Match = Match.drop_front(Idx + Offset); + } + + assert(Source.size() == Offsets.back()); + // For the last item, we know previously there was a wildcard, so we only need + // to check for a suffix match. + return Match.endswith(Source); } bool GlobList::contains(StringRef S) const { - // Iterating the container backwards as the last match determins if S is in - // the list. - for (const GlobListItem &Item : llvm::reverse(Items)) { - if (Item.Regex.match(S)) - return Item.IsPositive; + StringRef Source(((RawStringSize > NumInlineElements) + ? static_cast(Ptr) + : Inline), + RawStringSize); + auto Items = llvm::makeArrayRef(Data); + while (!Items.empty()) { + GlobStartInfo StartNode = Items.front().StartInfo; + Items = Items.drop_front(); + assert(Items.size() >= StartNode.Items); + if (StartNode.Items == 0) { + if (S.empty()) + return StartNode.IsPositive; + continue; + } + auto GlobItems = Items.take_front(StartNode.Items); + Items = Items.drop_front(StartNode.Items); + auto Offsets = llvm::makeArrayRef(&GlobItems.front().Size, StartNode.Items); + auto Size = std::accumulate(Offsets.begin(), Offsets.end(), 0U); + if (matchGlobList(Source.take_front(Size), Offsets, S)) + return StartNode.IsPositive; + Source = Source.drop_front(Size); } - return false; + return UnmatchedResult; +} + +bool GlobList::isEquivalentTo(const GlobList &Other) { + // Check the cheap to determine lengths first. + if (RawStringSize != Other.RawStringSize) + return false; + if (Data.size() != Other.Data.size()) + return false; + // Then check the elements. + if (memcmp(Data.data(), Other.Data.data(), Data.size() * sizeof(Data[0])) != + 0) + return false; + if (RawStringSize > NumInlineElements) + return strncmp(Ptr, Other.Ptr, RawStringSize) == 0; + return strncmp(Inline, Other.Inline, RawStringSize) == 0; } bool CachedGlobList::contains(StringRef S) const { diff --git a/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp b/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp --- a/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp @@ -16,6 +16,13 @@ EXPECT_FALSE(Filter.contains("aaa")); } +TYPED_TEST(GlobListTest, NotEmpty) { + TypeParam Filter("*,-"); + + EXPECT_FALSE(Filter.contains("")); + EXPECT_TRUE(Filter.contains("aaa")); +} + TYPED_TEST(GlobListTest, Nothing) { TypeParam Filter("-*"); @@ -57,6 +64,42 @@ EXPECT_FALSE(Filter.contains("bbbb")); } +TYPED_TEST(GlobListTest, PreAndPostFixPattern) { + TypeParam Filter("a*b"); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_FALSE(Filter.contains("aba")); + EXPECT_FALSE(Filter.contains("bab")); +} + +TYPED_TEST(GlobListTest, DoubleWildcard) { + { + TypeParam Filter("a**b"); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_FALSE(Filter.contains("aba")); + EXPECT_FALSE(Filter.contains("bab")); + } + { + TypeParam Filter("a**"); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_TRUE(Filter.contains("aba")); + EXPECT_FALSE(Filter.contains("bab")); + } + { + TypeParam Filter("**b"); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_FALSE(Filter.contains("aba")); + EXPECT_TRUE(Filter.contains("bab")); + } +} + TYPED_TEST(GlobListTest, PatternPriority) { // The last glob that matches the string decides whether that string is // included or excluded. @@ -107,6 +150,9 @@ TYPED_TEST(GlobListTest, NewlineCharactersAsSeparators) { TypeParam Filter("a* \n b,\n-c*,dd"); + // FIXME: Define this behaviour, There are 2 seperator characters next to each + // other (,\n), Technically that should be represented as containing an empty + // item. EXPECT_FALSE(Filter.contains("")); EXPECT_TRUE(Filter.contains("aaa")); EXPECT_TRUE(Filter.contains("b")); @@ -117,5 +163,14 @@ EXPECT_FALSE(Filter.contains("ddd")); } +TYPED_TEST(GlobListTest, IgnoreNegativeMatches) { + TypeParam Filter("a,-a,b*,-b*,-*", false); + + EXPECT_FALSE(Filter.contains("")); + EXPECT_TRUE(Filter.contains("a")); + EXPECT_TRUE(Filter.contains("b")); + EXPECT_TRUE(Filter.contains("bar")); +} + } // namespace tidy } // namespace clang