diff --git a/clang-tools-extra/clang-tidy/GlobList.h b/clang-tools-extra/clang-tidy/GlobList.h --- a/clang-tools-extra/clang-tidy/GlobList.h +++ b/clang-tools-extra/clang-tidy/GlobList.h @@ -10,10 +10,10 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GLOBLIST_H #include "clang/Basic/LLVM.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Regex.h" +#include +#include namespace clang { namespace tidy { @@ -25,7 +25,9 @@ /// them in the order of appearance in the list. class GlobList { public: - virtual ~GlobList() = default; + virtual ~GlobList(); + + GlobList(const GlobList &) = delete; /// \p Globs is a comma-separated list of globs (only the '*' metacharacter is /// supported) with an optional '-' prefix to denote exclusion. @@ -41,12 +43,46 @@ /// matching glob's Positive flag. virtual bool contains(StringRef S) const; + bool isEquivalentTo(const GlobList &Other) const; + + bool isEquivalentTo(StringRef S, bool KeepNegativeGlobs = true) const; + + void print(raw_ostream &OS) const; + private: - struct GlobListItem { - bool IsPositive; - llvm::Regex Regex; + static constexpr size_t ItemBits = 15; + // Pretty arbitrary choice here. + static constexpr size_t NumInlineElements = 24; + struct GlobStartInfo { + uint16_t Items : ItemBits; + uint16_t IsPositive : 1; + }; + + bool isInlineStorage() const; + + char *getStorage(); + + const char *getStorage() const; + + union DataItem { + GlobStartInfo StartInfo; + uint16_t Size; + }; + + static std::pair> + consumeGlobData(llvm::ArrayRef &Data); + + /// First item we encounter is a \c GlobStartInfo object. The next \c + /// StartInfo.Items items in here will be a raw uint16_t. After that, if there + /// are more items, the next will be another \c GlobStartInfo object. + std::vector Data; + union { + char Inline[NumInlineElements]; + char *Ptr; }; - SmallVector Items; + size_t RawStringSize : sizeof(size_t) * CHAR_BIT - 1; + // The fallback value if we don't match anything. + size_t UnmatchedResult : 1; }; /// A \p GlobList that caches search results, so that search is performed only diff --git a/clang-tools-extra/clang-tidy/GlobList.cpp b/clang-tools-extra/clang-tidy/GlobList.cpp --- a/clang-tools-extra/clang-tidy/GlobList.cpp +++ b/clang-tools-extra/clang-tidy/GlobList.cpp @@ -7,63 +7,318 @@ //===----------------------------------------------------------------------===// #include "GlobList.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include namespace clang { namespace tidy { -// Returns true if GlobList starts with the negative indicator ('-'), removes it -// from the GlobList. -static bool consumeNegativeIndicator(StringRef &GlobList) { - GlobList = GlobList.trim(); - if (GlobList.startswith("-")) { - GlobList = GlobList.substr(1); - return true; +static size_t getMaxAllocSize(StringRef Str) { + size_t Count = 0; + for (char C : Str) { + switch (C) { + case '\n': // Seperator character. + case ',': // Seperator character. + case '*': // Wildcards aren't stored. + continue; + default: + ++Count; + } } - return false; + return Count; } -// Converts first glob from the comma-separated list of globs to Regex and -// removes it and the trailing comma from the GlobList. -static llvm::Regex consumeGlob(StringRef &GlobList) { - StringRef UntrimmedGlob = GlobList.substr(0, GlobList.find_first_of(",\n")); - StringRef Glob = UntrimmedGlob.trim(); - GlobList = GlobList.substr(UntrimmedGlob.size() + 1); - SmallString<128> RegexText("^"); - StringRef MetaChars("()^$|*+?.[]\\{}"); - for (char C : Glob) { - if (C == '*') - RegexText.push_back('.'); - else if (MetaChars.contains(C)) - RegexText.push_back('\\'); - RegexText.push_back(C); +static std::pair splitGlob(StringRef &Glob) { + size_t GlobStart = Glob.find_last_of(",\n") + 1; + StringRef Trimmed = Glob.substr(GlobStart).ltrim(" \t\v\f\r"); + if (GlobStart != 0) { + // Bit of a hack this, Basically let ',\n' not be parsed as an empty string. + // This behaviour needs defining. + size_t NextOffset = Glob.find_last_not_of( + Glob[GlobStart - 1] == ',' ? " \t\r\n\f\v" : " \t\r\n\f\v,", + GlobStart - 1); + + if (NextOffset != StringRef::npos) { + Glob = Glob.substr(0, NextOffset + 1); + return {Trimmed, true}; + } } - RegexText.push_back('$'); - return llvm::Regex(RegexText); + return {Trimmed, false}; +} + +static bool isEscapeChar(char C) { return C == '-' || C == '\\'; } + +static bool isEscapedStart(StringRef Trimmed) { + return Trimmed.size() >= 2 && Trimmed[0] == '\\' && isEscapeChar(Trimmed[1]); } -GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */) { - Items.reserve(Globs.count(',') + Globs.count('\n') + 1); +static bool isAllWildcards(StringRef Glob) { + return !Glob.empty() && Glob.find_first_not_of('*') == StringRef::npos; +} + +static void initialGlobTrim(StringRef &Globs) { + // FIXME: trimming the newline character here is suspicious as its recognised + // as a seperator. + Globs = Globs.trim(" \t\r\n\f\v"); +} + +GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */) + : RawStringSize(0), UnmatchedResult(false) { + initialGlobTrim(Globs); + // Upperbound for the allocated size. + size_t MaxAllocSize = getMaxAllocSize(Globs); + char *Storage = Inline; + auto AddSizedItem = [this](size_t Size) { + assert(Size < std::numeric_limits::max() && "Chunk too large"); + Data.emplace_back(); + Data.back().Size = Size; + }; + bool HasMoreItems; + StringRef Trimmed; do { - GlobListItem Item; - Item.IsPositive = !consumeNegativeIndicator(Globs); - Item.Regex = consumeGlob(Globs); - if (Item.IsPositive || KeepNegativeGlobs) - Items.push_back(std::move(Item)); - } while (!Globs.empty()); + GlobStartInfo S; + std::tie(Trimmed, HasMoreItems) = splitGlob(Globs); + S.IsPositive = !Trimmed.consume_front("-"); + // Let '\\' be used to escape the '-' used to denote a negative match. + if (S.IsPositive && isEscapedStart(Trimmed)) + Trimmed = Trimmed.drop_front(); + if (S.IsPositive || KeepNegativeGlobs) { + // The match-all glob means anything before it won't ever be read. + // Therefore we can stop building the glob if we reach this item. + // We also don't need to actually store this glob as we can just update + // the default result if no items match. + if (isAllWildcards(Trimmed)) { + UnmatchedResult = S.IsPositive; + return; + } + size_t StartIndex = Data.size(); + Data.emplace_back(); + if (Trimmed.consume_front("*")) + AddSizedItem(0); + const bool LastEmpty = Trimmed.consume_back("*"); + // FIXME: This wouldn't support escaping the wildcard (\*), but that's + // not a known usecase for GlobList in clang-tidy. + for (StringRef Item : llvm::split(Trimmed, "*")) { + // Don't store empty strings here as they can only exist with sucessive + // wildcards(**) + if (Item.empty()) + continue; + assert(RawStringSize + Item.size() <= MaxAllocSize); + // Lazily allocate out of line storage if needed. + if (isInlineStorage() && + RawStringSize + Item.size() > NumInlineElements) { + // We cant store the Pointer directly in Ptr as it would clobber the + // Inline storage. + char *NewPtr = new char[MaxAllocSize]; + ::memcpy(NewPtr, Inline, RawStringSize); + Ptr = NewPtr; + Storage = Ptr; + } + llvm::copy(Item, &Storage[RawStringSize]); + RawStringSize += Item.size(); + AddSizedItem(Item.size()); + } + if (LastEmpty) + AddSizedItem(0); + size_t ItemCount = Data.size() - StartIndex - 1; + assert(ItemCount < (1 << ItemBits) && "Too many items"); + S.Items = ItemCount; + // The starting node goes at the end of the items, because we traverse + // the glob list in reverse order. + Data[StartIndex].StartInfo = S; + } + } while (HasMoreItems); } -bool GlobList::contains(StringRef S) const { - // Iterating the container backwards as the last match determins if S is in - // the list. - for (const GlobListItem &Item : llvm::reverse(Items)) { - if (Item.Regex.match(S)) - return Item.IsPositive; +bool GlobList::isEquivalentTo(StringRef Globs, bool KeepNegativeGlobs) const { + initialGlobTrim(Globs); + bool HasMoreItems; + StringRef Trimmed; + auto Begin = Data.begin(); + auto End = Data.end(); + size_t CurSize = 0; + bool Error = false; + const char *Storage = getStorage(); + do { + GlobStartInfo S; + std::tie(Trimmed, HasMoreItems) = splitGlob(Globs); + S.IsPositive = !Trimmed.consume_front("-"); + if (S.IsPositive && isEscapedStart(Trimmed)) + Trimmed = Trimmed.drop_front(); + if (S.IsPositive || KeepNegativeGlobs) { + // The match-all glob means anything before it won't ever be read. + if (isAllWildcards(Trimmed)) { + Error = UnmatchedResult != S.IsPositive; + break; + } + if (Begin == End || S.IsPositive != Begin->StartInfo.IsPositive) + return false; + size_t Items = 0; + size_t ExpectedItems = Begin->StartInfo.Items; + ++Begin; + auto EatItem = [&Items, ExpectedItems, &Begin](size_t Size = 0) { + return ++Items <= ExpectedItems && (Begin++)->Size == Size; + }; + + if (Trimmed.consume_front("*") && !EatItem()) + return false; + + const bool LastEmpty = Trimmed.consume_back("*"); + for (llvm::StringRef Item : llvm::split(Trimmed, "*")) { + if (Item.empty()) + continue; + assert(Item.size() + CurSize <= RawStringSize); + if (!EatItem(Item.size()) || + memcmp(Item.data(), &Storage[CurSize], Item.size()) != 0) + return false; + CurSize += Item.size(); + } + if ((LastEmpty && !EatItem()) || Items != ExpectedItems) + return false; + } + } while (HasMoreItems); + if (!Error && Begin == End) { + assert(CurSize == RawStringSize); + return true; } return false; } +GlobList::~GlobList() { + if (!isInlineStorage()) + delete[] Ptr; +} + +std::pair> +GlobList::consumeGlobData(llvm::ArrayRef &Data) { + const GlobStartInfo &Start = Data.front().StartInfo; + assert(Data.size() >= 1 + Start.Items); + llvm::ArrayRef Offsets(&Data.data()[1].Size, Start.Items); + Data = Data.drop_front(1 + Start.Items); + return {Start.IsPositive, Offsets}; +} + +static bool matchGlobList(StringRef Source, llvm::ArrayRef Offsets, + StringRef Match) { + assert(!Offsets.empty()); + // If we only had one item originally, then we are working with an exact match + // with no wild cards. + if (Offsets.size() == 1) { + assert(Source.size() == Offsets.front()); + return Match == Source; + } + + if (!Match.consume_front(Source.take_front(Offsets.front()))) + return false; + Source = Source.drop_front(Offsets.front()); + + for (uint16_t Offset : Offsets.drop_front().drop_back()) { + StringRef SearchItem = Source.take_front(Offset); + Source = Source.drop_front(Offset); + size_t Idx = Match.find(SearchItem); + if (Idx == StringRef::npos) + return false; + Match = Match.drop_front(Idx + Offset); + } + + assert(Source.size() == Offsets.back()); + // For the last item, we know previously there was a wildcard, so we only need + // to check for a suffix match. + return Match.endswith(Source); +} + +bool GlobList::contains(StringRef S) const { + StringRef Source(getStorage(), RawStringSize); + ArrayRef Items(Data); + bool IsPositive; + ArrayRef Offsets; + while (!Items.empty()) { + std::tie(IsPositive, Offsets) = consumeGlobData(Items); + if (Offsets.empty()) { + if (S.empty()) + return IsPositive; + continue; + } + size_t Size = std::accumulate(Offsets.begin(), Offsets.end(), size_t(0U)); + if (matchGlobList(Source.take_front(Size), Offsets, S)) + return IsPositive; + Source = Source.drop_front(Size); + } + return UnmatchedResult; +} + +bool GlobList::isEquivalentTo(const GlobList &Other) const { + // Check the cheap to determine lengths first. + if (std::make_tuple(RawStringSize, UnmatchedResult, Data.size()) != + std::make_tuple(Other.RawStringSize, Other.UnmatchedResult, + Other.Data.size())) + return false; + // Then check the elements, These are POD types so memcmp is safe for this. + if (memcmp(Data.data(), Other.Data.data(), Data.size() * sizeof(Data[0])) != + 0) + return false; + return strncmp(getStorage(), Other.getStorage(), RawStringSize) == 0; +} + +void GlobList::print(raw_ostream &OS) const { + StringRef Source(getStorage(), RawStringSize); + // For ease of matching, we store the globs in reverse order to what they are + // written in. Therefore when we print the glob, we need to reverse the order + // they are stored in. + llvm::SmallVector> RevPrint; + ArrayRef Items(Data); + size_t Cur = 0; + bool IsPositive; + ArrayRef Offsets; + while (!Items.empty()) { + RevPrint.emplace_back(Items.begin(), Cur); + std::tie(IsPositive, Offsets) = consumeGlobData(Items); + Cur += std::accumulate(Offsets.begin(), Offsets.end(), 0U); + } + if (UnmatchedResult) + OS << "*"; + else if (RevPrint.empty()) + OS << "-*"; + bool Sep = UnmatchedResult; + for (const auto &Item : llvm::reverse(RevPrint)) { + if (Sep) + OS << ','; + Sep = true; + if (!Item.first->StartInfo.IsPositive) + OS << '-'; + ArrayRef Offsets(&Item.first[1].Size, + Item.first->StartInfo.Items); + if (Offsets.empty()) + continue; + StringRef First = Source.substr(Item.second, Offsets.front()); + if (Item.first->StartInfo.IsPositive && !First.empty() && + isEscapeChar(First.front())) + OS << '\\'; + OS << First; + size_t NextOff = Item.second + Offsets.front(); + for (uint16_t Item : Offsets.drop_front()) { + OS << '*' << Source.substr(NextOff, Item); + NextOff += Item; + } + } +} + +bool GlobList::isInlineStorage() const { + return RawStringSize <= NumInlineElements; +} + +char *GlobList::getStorage() { return isInlineStorage() ? Inline : Ptr; } + +const char *GlobList::getStorage() const { + return const_cast(this)->getStorage(); +} + bool CachedGlobList::contains(StringRef S) const { auto Entry = Cache.try_emplace(S); bool &Value = Entry.first->getValue(); diff --git a/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp b/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp --- a/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp @@ -16,6 +16,13 @@ EXPECT_FALSE(Filter.contains("aaa")); } +TYPED_TEST(GlobListTest, NotEmpty) { + TypeParam Filter("*,-"); + + EXPECT_FALSE(Filter.contains("")); + EXPECT_TRUE(Filter.contains("aaa")); +} + TYPED_TEST(GlobListTest, Nothing) { TypeParam Filter("-*"); @@ -57,6 +64,51 @@ EXPECT_FALSE(Filter.contains("bbbb")); } +TYPED_TEST(GlobListTest, PreAndPostFixPattern) { + TypeParam Filter("a*b"); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_FALSE(Filter.contains("aba")); + EXPECT_FALSE(Filter.contains("bab")); +} + +TYPED_TEST(GlobListTest, DoubleWildcard) { + { + TypeParam Filter("a**b"); + + EXPECT_TRUE(Filter.isEquivalentTo("a*b")); + EXPECT_TRUE(Filter.isEquivalentTo("a***b")); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_FALSE(Filter.contains("aba")); + EXPECT_FALSE(Filter.contains("bab")); + } + { + TypeParam Filter("a**"); + + EXPECT_TRUE(Filter.isEquivalentTo("a*")); + EXPECT_TRUE(Filter.isEquivalentTo("a***")); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_TRUE(Filter.contains("aba")); + EXPECT_FALSE(Filter.contains("bab")); + } + { + TypeParam Filter("**b"); + + EXPECT_TRUE(Filter.isEquivalentTo("*b")); + EXPECT_TRUE(Filter.isEquivalentTo("***b")); + + EXPECT_TRUE(Filter.contains("ab")); + EXPECT_TRUE(Filter.contains("aMiddleb")); + EXPECT_FALSE(Filter.contains("aba")); + EXPECT_TRUE(Filter.contains("bab")); + } +} + TYPED_TEST(GlobListTest, PatternPriority) { // The last glob that matches the string decides whether that string is // included or excluded. @@ -107,6 +159,9 @@ TYPED_TEST(GlobListTest, NewlineCharactersAsSeparators) { TypeParam Filter("a* \n b,\n-c*,dd"); + // FIXME: Define this behaviour, There are 2 seperator characters next to each + // other (,\n), Technically that should be represented as containing an empty + // item. EXPECT_FALSE(Filter.contains("")); EXPECT_TRUE(Filter.contains("aaa")); EXPECT_TRUE(Filter.contains("b")); @@ -117,5 +172,63 @@ EXPECT_FALSE(Filter.contains("ddd")); } +TYPED_TEST(GlobListTest, IgnoreNegativeMatches) { + TypeParam Filter("a,-a,b*,-b*,-*", false); + + EXPECT_FALSE(Filter.contains("")); + EXPECT_TRUE(Filter.contains("a")); + EXPECT_TRUE(Filter.contains("b")); + EXPECT_TRUE(Filter.contains("bar")); +} + +TYPED_TEST(GlobListTest, EscapeNegatives) { + TypeParam Filter("\\-a,-a"); + + std::string Canonicalized; + llvm::raw_string_ostream Str(Canonicalized); + Filter.print(Str); + Str.flush(); + + // Ensure negatives are printed correctly. + EXPECT_EQ("\\-a,-a", Canonicalized); + EXPECT_TRUE(Filter.isEquivalentTo("\\-a,-a")); + + EXPECT_TRUE(Filter.contains("-a")); + EXPECT_FALSE(Filter.contains("a")); +} + +void checkCanonicalize(StringRef Glob, StringRef Expected) { + GlobList Orig(Glob); + std::string Canonicalized; + llvm::raw_string_ostream Str(Canonicalized); + Orig.print(Str); + Str.flush(); + EXPECT_EQ(Canonicalized, Expected) + << "While canonicalizing glob: \"" << Glob << "\""; + EXPECT_TRUE(Orig.isEquivalentTo(Expected)) + << "While canonicalizing glob: \"" << Glob << "\""; + GlobList Canonical(Canonicalized); + EXPECT_TRUE(Orig.isEquivalentTo(Canonical)) + << "While canonicalizing glob: \"" << Glob << "\""; + EXPECT_TRUE(Canonical.isEquivalentTo(Glob)) + << "While canonicalizing glob: \"" << Glob << "\""; +} + +TEST(GlobListTest, Printing) { + // A positive wildcard is always printed with any previous items removed. + // checkCanonicalize(" *", "*"); + // checkCanonicalize(" * , -AAA ", "*,-AAA"); + checkCanonicalize("Unused , *, -A*", "*,-A*"); + // A negative wildcard is removed if there is nothing else after it. + checkCanonicalize("-* ", "-*"); + checkCanonicalize("-*, AAA ", "AAA"); + // The canonical representation uses commas as the seperators instead of + // newlines. + checkCanonicalize("Unused\n-*\nRest\n*Others", "Rest,*Others"); + // Consecutive wildcards are treated as being one wildcard. + checkCanonicalize("Unused,**", "*"); + checkCanonicalize("Unused,-**,Used", "Used"); +} + } // namespace tidy } // namespace clang