diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h --- a/llvm/include/llvm/Support/GlobPattern.h +++ b/llvm/include/llvm/Support/GlobPattern.h @@ -6,8 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements a glob pattern matcher. The glob pattern is the -// rule used by the shell. +// This file implements a glob pattern matcher. // //===----------------------------------------------------------------------===// @@ -17,22 +16,42 @@ #include "llvm/ADT/BitVector.h" #include "llvm/Support/Error.h" #include -#include -// This class represents a glob pattern. Supported metacharacters -// are "*", "?", "\", "[]", "[^]", and "[!]". namespace llvm { template class ArrayRef; class StringRef; +/// This class implements a glob pattern matcher similar to the one found in +/// bash, but with some key differences. Namely, that \p "*" matches all +/// characters and does not exclude path separators. +/// +/// * \p "?" matches a single character. +/// * \p "*" matches zero or more characters. +/// * \p "[]" matches one character in the bracket. Character ranges, +/// e.g., \p "[a-z]", and negative sets via \p "[^ab]" or \p "[!ab]" are also +/// supported. +/// * \p "{,...}" matches one of the globs in the list. Nested brace +/// expansions are not supported. +/// * \p "\" escapes the next character so it is treated as a literal. +/// +/// Some known edge cases are: +/// * \p "]" is allowed as the first character in a character class, i.e., +/// \p "[]]" is valid. +/// * The empty character class, i.e., \p "[]", is invalid. +/// * \p "}" and \p "," that are not inside a brace expansion are taken as +/// literals, e.g., \p ",}" is valid but \p "{" is not. +/// +/// For example, \p "*[/\\]foo.{c,cpp}" will match (unix or windows) paths to +/// all files named \p "foo.c" or \p "foo.cpp". class GlobPattern { public: static Expected create(StringRef Pat); + /// \returns \p true if \p S matches this glob pattern bool match(StringRef S) const; - // Returns true for glob pattern "*". Can be used to avoid expensive - // preparation/acquisition of the input for match(). + /// \returns \p true for the glob pattern \p "*". Can be used to avoid + /// expensive preparation/acquisition of the input for \p match(). bool isTrivialMatchAll() const { if (Prefix && Prefix->empty()) { assert(!Suffix); @@ -44,8 +63,8 @@ private: bool matchOne(ArrayRef Pat, StringRef S) const; - // Parsed glob pattern. - std::vector Tokens; + /// A list of parsed glob patterns + SmallVector> TokenGroups; // The following members are for optimization. std::optional Exact; diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp --- a/llvm/lib/Support/GlobPattern.cpp +++ b/llvm/lib/Support/GlobPattern.cpp @@ -18,7 +18,7 @@ using namespace llvm; static bool hasWildcard(StringRef S) { - return S.find_first_of("?*[\\") != StringRef::npos; + return S.find_first_of("?*[{\\") != StringRef::npos; } // Expands character ranges and returns a bitmap. @@ -58,50 +58,112 @@ return BV; } -// This is a scanner for the glob pattern. -// A glob pattern token is one of "*", "?", "\", "[]", "[^]" -// (which is a negative form of "[]"), "[!]" (which is -// equivalent to "[^]"), or a non-meta character. -// This function returns the first token in S. -static Expected scan(StringRef &S, StringRef Original) { - switch (S[0]) { - case '*': - S = S.substr(1); - // '*' is represented by an empty bitvector. - // All other bitvectors are 256-bit long. - return BitVector(); - case '?': - S = S.substr(1); - return BitVector(256, true); - case '[': { - // ']' is allowed as the first character of a character class. '[]' is - // invalid. So, just skip the first character. - size_t End = S.find(']', 2); - if (End == StringRef::npos) - return make_error("invalid glob pattern: " + Original, - errc::invalid_argument); +// Scan the pattern and return GlobPattern::TokenGroups. +// A token is one of "*", "?", "\", "[]", "[^]", "[!]", +// "{,...}", or a literal. +static Expected>> scan(StringRef &S) { + StringRef Original = S; + SmallVector> TokenGroups = {{}}; + SmallVector> BraceExpansionTerms; + + auto PushToken = [&](BitVector BV) { + if (!BraceExpansionTerms.empty()) { + // We are currently in a brace expansion, do not push this token yet + BraceExpansionTerms.back().push_back(BV); + return; + } + for (auto &Tokens : TokenGroups) + Tokens.push_back(BV); + }; + auto PushLiteralToken = [&](char C) { + BitVector BV(256, false); + BV[(uint8_t)C] = true; + PushToken(BV); + }; - StringRef Chars = S.substr(1, End - 1); - S = S.substr(End + 1); - if (Chars.startswith("^") || Chars.startswith("!")) { - Expected BV = expand(Chars.substr(1), Original); + while (!S.empty()) { + switch (S.front()) { + case '*': + // '*' is represented by an empty bitvector. + // All other bitvectors are 256-bit long. + PushToken(BitVector()); + S = S.drop_front(); + break; + case '?': + PushToken(BitVector(256, true)); + S = S.drop_front(); + break; + case '[': { + // ']' is allowed as the first character of a character class. '[]' is + // invalid. So, just skip the first character. + size_t End = S.find(']', 2); + if (End == StringRef::npos) + return make_error("invalid glob pattern: " + Original, + errc::invalid_argument); + + StringRef Chars = S.substr(1, End - 1); + S = S.drop_front(End + 1); + bool IsNegated = (Chars.startswith("^") || Chars.startswith("!")); + if (IsNegated) + Chars = Chars.drop_front(); + auto BV = expand(Chars, Original); if (!BV) return BV.takeError(); - return BV->flip(); + PushToken(IsNegated ? BV->flip() : *BV); + break; + } + case '{': + if (!BraceExpansionTerms.empty()) + return make_error( + "nested brace expansions are not supported: " + Original, + errc::invalid_argument); + BraceExpansionTerms.emplace_back(); + S = S.drop_front(); + break; + case ',': + if (!BraceExpansionTerms.empty()) { + BraceExpansionTerms.emplace_back(); + } else { + // A ',' that is not in a brace expansion is takens as a literal + PushLiteralToken(S.front()); + } + S = S.drop_front(); + break; + case '}': + if (!BraceExpansionTerms.empty()) { + SmallVector> OriginalTokenGroups; + // Save and clear TokenGroups + std::swap(TokenGroups, OriginalTokenGroups); + for (const auto &Term : BraceExpansionTerms) { + for (auto NewTokenGroup : OriginalTokenGroups) { + // Copy this token group and append this term + NewTokenGroup.append(Term); + TokenGroups.push_back(NewTokenGroup); + } + } + BraceExpansionTerms.clear(); + } else { + // A '}' that is not in a brace expansion is takens as a literal + PushLiteralToken(S.front()); + } + S = S.drop_front(); + break; + case '\\': + // Eat this character and treat it like a non-meta character. + S = S.drop_front(); + PushLiteralToken(S.front()); + S = S.drop_front(); + break; + default: + PushLiteralToken(S.front()); + S = S.drop_front(); } - return expand(Chars, Original); - } - case '\\': - // Eat this character and fall through below to treat it like a non-meta - // character. - S = S.substr(1); - [[fallthrough]]; - default: - BitVector BV(256, false); - BV[(uint8_t)S[0]] = true; - S = S.substr(1); - return BV; } + + if (!BraceExpansionTerms.empty()) + return make_error("incomplete brace expansion: " + Original, + errc::invalid_argument); + return TokenGroups; } Expected GlobPattern::create(StringRef S) { @@ -129,13 +191,8 @@ // Otherwise, we need to do real glob pattern matching. // Parse the pattern now. - StringRef Original = S; - while (!S.empty()) { - Expected BV = scan(S, Original); - if (!BV) - return BV.takeError(); - Pat.Tokens.push_back(*BV); - } + if (Error Err = scan(S).moveInto(Pat.TokenGroups)) + return Err; return Pat; } @@ -146,7 +203,7 @@ return S.startswith(*Prefix); if (Suffix) return S.endswith(*Suffix); - return matchOne(Tokens, S); + return any_of(TokenGroups, [&](auto &Tokens) { return matchOne(Tokens, S); }); } // Runs glob pattern Pats against string S. @@ -158,12 +215,12 @@ // If Pats[0] is '*', try to match Pats[1..] against all possible // tail strings of S to see at least one pattern succeeds. if (Pats[0].size() == 0) { - Pats = Pats.slice(1); + Pats = Pats.drop_front(); if (Pats.empty()) // Fast path. If a pattern is '*', it matches anything. return true; for (size_t I = 0, E = S.size(); I < E; ++I) - if (matchOne(Pats, S.substr(I))) + if (matchOne(Pats, S.drop_front(I))) return true; return false; } @@ -171,7 +228,7 @@ // If Pats[0] is not '*', it must consume one character. if (S.empty() || !Pats[0][(uint8_t)S[0]]) return false; - Pats = Pats.slice(1); - S = S.substr(1); + Pats = Pats.drop_front(); + S = S.drop_front(); } } diff --git a/llvm/unittests/Support/GlobPatternTest.cpp b/llvm/unittests/Support/GlobPatternTest.cpp --- a/llvm/unittests/Support/GlobPatternTest.cpp +++ b/llvm/unittests/Support/GlobPatternTest.cpp @@ -51,6 +51,22 @@ EXPECT_TRUE(Pat2->match("ax?c")); EXPECT_FALSE(Pat2->match("axxc")); EXPECT_FALSE(Pat2->match("")); + + Expected Pat3 = GlobPattern::create("\\{"); + EXPECT_TRUE((bool)Pat3); + EXPECT_TRUE(Pat3->match("{")); + EXPECT_FALSE(Pat3->match("\\{")); + EXPECT_FALSE(Pat3->match("")); + + Expected Pat4 = GlobPattern::create("\\\\"); + EXPECT_TRUE((bool)Pat4); + EXPECT_TRUE(Pat4->match("\\")); + EXPECT_FALSE(Pat4->match("")); + + Expected Pat5 = GlobPattern::create("\\a"); + EXPECT_TRUE((bool)Pat5); + EXPECT_TRUE(Pat5->match("a")); + EXPECT_FALSE(Pat5->match("\\a")); } TEST_F(GlobPatternTest, BasicCharacterClass) { @@ -106,23 +122,55 @@ } TEST_F(GlobPatternTest, SpecialCharsInCharacterClass) { - Expected Pat1 = GlobPattern::create("[*?^]"); + Expected Pat1 = GlobPattern::create("[*?^{},]"); EXPECT_TRUE((bool)Pat1); EXPECT_TRUE(Pat1->match("*")); EXPECT_TRUE(Pat1->match("?")); EXPECT_TRUE(Pat1->match("^")); - EXPECT_FALSE(Pat1->match("*?^")); + EXPECT_TRUE(Pat1->match("{")); + EXPECT_TRUE(Pat1->match("}")); + EXPECT_TRUE(Pat1->match(",")); + EXPECT_FALSE(Pat1->match("*?^{},")); EXPECT_FALSE(Pat1->match("")); } -TEST_F(GlobPatternTest, Invalid) { - Expected Pat1 = GlobPattern::create("["); - EXPECT_FALSE((bool)Pat1); - handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {}); +TEST_F(GlobPatternTest, BraceExpansion) { + auto Pat1 = GlobPattern::create("{a,b}{1,2}"); + EXPECT_TRUE((bool)Pat1); + EXPECT_TRUE(Pat1->match("a1")); + EXPECT_TRUE(Pat1->match("a2")); + EXPECT_TRUE(Pat1->match("b1")); + EXPECT_TRUE(Pat1->match("b2")); + EXPECT_FALSE(Pat1->match("ab")); - Expected Pat2 = GlobPattern::create("[]"); - EXPECT_FALSE((bool)Pat2); - handleAllErrors(Pat2.takeError(), [&](ErrorInfoBase &EIB) {}); + auto Pat2 = GlobPattern::create(",}{foo,\\{,z*}"); + EXPECT_TRUE((bool)Pat2); + EXPECT_TRUE(Pat2->match(",}foo")); + EXPECT_TRUE(Pat2->match(",}{")); + EXPECT_TRUE(Pat2->match(",}z")); + EXPECT_TRUE(Pat2->match(",}zoo")); + EXPECT_FALSE(Pat2->match(",}fooz")); + EXPECT_FALSE(Pat2->match("foo")); + EXPECT_FALSE(Pat2->match("")); +} + +TEST_F(GlobPatternTest, BraceExpansionCharacterClass) { + // Matches mangled names of C++ standard library functions + auto Pat = GlobPattern::create("_Z{N,NK,}S[tabsiod]*"); + EXPECT_TRUE((bool)Pat); + EXPECT_TRUE(Pat->match("_ZNSt6vectorIiSaIiEE9push_backEOi")); + EXPECT_TRUE(Pat->match("_ZNKStfoo")); + EXPECT_TRUE(Pat->match("_ZNSafoo")); + EXPECT_TRUE(Pat->match("_ZStfoo")); + EXPECT_FALSE(Pat->match("_Zfoo")); +} + +TEST_F(GlobPatternTest, Invalid) { + for (const auto &InvalidPattern : {"[", "[]", "{", "{{"}) { + Expected Pat1 = GlobPattern::create(InvalidPattern); + EXPECT_FALSE((bool)Pat1) << "Expected invalid pattern: " << InvalidPattern; + handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {}); + } } TEST_F(GlobPatternTest, ExtSym) {