diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h --- a/llvm/include/llvm/Support/GlobPattern.h +++ b/llvm/include/llvm/Support/GlobPattern.h @@ -6,8 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements a glob pattern matcher. The glob pattern is the -// rule used by the shell. +// This file implements a glob pattern matcher. // //===----------------------------------------------------------------------===// @@ -20,30 +19,72 @@ #include "llvm/Support/Error.h" #include -// This class represents a glob pattern. Supported metacharacters -// are "*", "?", "\", "[]", "[^]", and "[!]". namespace llvm { +/// This class implements a glob pattern matcher similar to the one found in +/// bash, but with some key differences. Namely, that \p "*" matches all +/// characters and does not exclude path separators. +/// +/// * \p "?" matches a single character. +/// * \p "*" matches zero or more characters. +/// * \p "[]" matches one character in the bracket. Character ranges, +/// e.g., \p "[a-z]", and negative sets via \p "[^ab]" or \p "[!ab]" are also +/// supported. +/// * \p "{,...}" matches one of the globs in the list. Nested brace +/// expansions are not supported. If \p MaxSubPatterns is empty then +/// characters \p "{,}" are treated as literals. +/// * \p "\" escapes the next character so it is treated as a literal. +/// +/// Some known edge cases are: +/// * \p "]" is allowed as the first character in a character class, i.e., +/// \p "[]]" is valid and matches the literal \p "]". +/// * The empty character class, i.e., \p "[]", is invalid. +/// * Empty or singleton brace expansions, e.g., \p "{}", \p "{a}", are invalid. +/// * \p "}" and \p "," that are not inside a brace expansion are taken as +/// literals, e.g., \p ",}" is valid but \p "{" is not. +/// +/// For example, \p "*[/\\]foo.{c,cpp}" will match (unix or windows) paths to +/// all files named \p "foo.c" or \p "foo.cpp". class GlobPattern { public: - static Expected create(StringRef Pat); + /// \param Pat the pattern to match against + /// \param MaxSubPatterns if provided limit the number of allowed subpatterns + /// created from expanding braces otherwise disable + /// brace expansion + static Expected + create(StringRef Pat, std::optional MaxSubPatterns = {}); + /// \returns \p true if \p S matches this glob pattern bool match(StringRef S) const; // Returns true for glob pattern "*". Can be used to avoid expensive // preparation/acquisition of the input for match(). - bool isTrivialMatchAll() const { return Prefix.empty() && Pat == "*"; } + bool isTrivialMatchAll() const { + if (!Prefix.empty()) + return false; + if (SubGlobs.size() != 1) + return false; + return SubGlobs[0].getPat() == "*"; + } private: - bool matchOne(StringRef Str) const; + StringRef Prefix; - // Brackets with their end position and matched bytes. - struct Bracket { - const char *Next; - BitVector Bytes; - }; - SmallVector Brackets; + struct SubGlobPattern { + /// \param Pat the pattern to match against + static Expected create(StringRef Pat); + /// \returns \p true if \p S matches this glob pattern + bool match(StringRef S) const; + StringRef getPat() const { return StringRef(Pat.data(), Pat.size()); } - StringRef Prefix, Pat; + // Brackets with their end position and matched bytes. + struct Bracket { + size_t NextOffset; + BitVector Bytes; + }; + SmallVector Brackets; + SmallVector Pat; + }; + SmallVector SubGlobs; }; } diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp --- a/llvm/lib/Support/GlobPattern.cpp +++ b/llvm/lib/Support/GlobPattern.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/GlobPattern.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Errc.h" @@ -54,18 +53,115 @@ return BV; } -Expected GlobPattern::create(StringRef S) { +// Identify brace expansions in S and return the list of patterns they expand +// into. +static Expected> +parseBraceExpansions(StringRef S, std::optional MaxSubPatterns) { + SmallVector SubPatterns = {S.str()}; + if (!MaxSubPatterns || !S.contains('{')) + return SubPatterns; + + struct BraceExpansion { + size_t Start; + size_t Length; + SmallVector Terms; + }; + SmallVector BraceExpansions; + + BraceExpansion *CurrentBE = nullptr; + size_t TermBegin; + for (size_t I = 0, E = S.size(); I != E; ++I) { + if (S[I] == '[') { + I = S.find(']', I + 2); + if (I == std::string::npos) + return make_error("invalid glob pattern, unmatched '['", + errc::invalid_argument); + } else if (S[I] == '{') { + if (CurrentBE) + return make_error( + "nested brace expansions are not supported", + errc::invalid_argument); + CurrentBE = &BraceExpansions.emplace_back(); + CurrentBE->Start = I; + TermBegin = I + 1; + } else if (S[I] == ',') { + if (!CurrentBE) + continue; + CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); + TermBegin = I + 1; + } else if (S[I] == '}') { + if (!CurrentBE) + continue; + if (CurrentBE->Terms.empty()) + return make_error( + "empty or singleton brace expansions are not supported", + errc::invalid_argument); + CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); + CurrentBE->Length = I - CurrentBE->Start + 1; + CurrentBE = nullptr; + } else if (S[I] == '\\') { + if (++I == E) + return make_error("invalid glob pattern, stray '\\'", + errc::invalid_argument); + } + } + if (CurrentBE) + return make_error("incomplete brace expansion", + errc::invalid_argument); + + size_t NumSubPatterns = 1; + for (auto &BE : BraceExpansions) { + if (NumSubPatterns > std::numeric_limits::max() / BE.Terms.size()) { + NumSubPatterns = std::numeric_limits::max(); + break; + } + NumSubPatterns *= BE.Terms.size(); + } + if (NumSubPatterns > *MaxSubPatterns) + return make_error("too many brace expansions", + errc::invalid_argument); + // Replace brace expansions in reverse order so that we don't invalidate + // earlier start indices + for (auto &BE : reverse(BraceExpansions)) { + SmallVector OrigSubPatterns; + std::swap(SubPatterns, OrigSubPatterns); + for (StringRef Term : BE.Terms) + for (StringRef Orig : OrigSubPatterns) + SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term); + } + return SubPatterns; +} + +Expected +GlobPattern::create(StringRef S, std::optional MaxSubPatterns) { GlobPattern Pat; // Store the prefix that does not contain any metacharacter. - size_t PrefixSize = S.find_first_of("?*[\\"); + size_t PrefixSize = S.find_first_of("?*[{\\"); Pat.Prefix = S.substr(0, PrefixSize); if (PrefixSize == std::string::npos) return Pat; S = S.substr(PrefixSize); + SmallVector SubPats; + if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats)) + return Err; + for (StringRef SubPat : SubPats) { + auto SubGlobOrErr = SubGlobPattern::create(SubPat); + if (!SubGlobOrErr) + return SubGlobOrErr.takeError(); + Pat.SubGlobs.push_back(*SubGlobOrErr); + } + + return Pat; +} + +Expected +GlobPattern::SubGlobPattern::create(StringRef S) { + SubGlobPattern Pat; + // Parse brackets. - Pat.Pat = S; + Pat.Pat.assign(S.begin(), S.end()); for (size_t I = 0, E = S.size(); I != E; ++I) { if (S[I] == '[') { // ']' is allowed as the first character of a character class. '[]' is @@ -83,7 +179,7 @@ return BV.takeError(); if (Invert) BV->flip(); - Pat.Brackets.push_back(Bracket{S.data() + J + 1, std::move(*BV)}); + Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)}); I = J; } else if (S[I] == '\\') { if (++I == E) @@ -95,13 +191,20 @@ } bool GlobPattern::match(StringRef S) const { - return S.consume_front(Prefix) && matchOne(S); + if (!S.consume_front(Prefix)) + return false; + if (SubGlobs.empty() && S.empty()) + return true; + for (auto &Glob : SubGlobs) + if (Glob.match(S)) + return true; + return false; } // Factor the pattern into segments split by '*'. The segment is matched // sequentianlly by finding the first occurrence past the end of the previous // match. -bool GlobPattern::matchOne(StringRef Str) const { +bool GlobPattern::SubGlobPattern::match(StringRef Str) const { const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(), *SavedS = S; const char *const PEnd = P + Pat.size(), *const End = S + Str.size(); @@ -118,7 +221,7 @@ continue; } else if (*P == '[') { if (Brackets[B].Bytes[uint8_t(*S)]) { - P = Brackets[B++].Next; + P = Pat.data() + Brackets[B++].NextOffset; ++S; continue; } @@ -143,5 +246,5 @@ } // All bytes in Str have been matched. Return true if the rest part of Pat is // empty or contains only '*'. - return Pat.find_first_not_of('*', P - Pat.data()) == std::string::npos; + return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos; } diff --git a/llvm/unittests/Support/GlobPatternTest.cpp b/llvm/unittests/Support/GlobPatternTest.cpp --- a/llvm/unittests/Support/GlobPatternTest.cpp +++ b/llvm/unittests/Support/GlobPatternTest.cpp @@ -52,6 +52,17 @@ EXPECT_FALSE(Pat2->match("axxc")); EXPECT_FALSE(Pat2->match("")); + auto Pat3 = GlobPattern::create("\\{"); + ASSERT_TRUE((bool)Pat3); + EXPECT_TRUE(Pat3->match("{")); + EXPECT_FALSE(Pat3->match("\\{")); + EXPECT_FALSE(Pat3->match("")); + + auto Pat4 = GlobPattern::create("\\a"); + ASSERT_TRUE((bool)Pat4); + EXPECT_TRUE(Pat4->match("a")); + EXPECT_FALSE(Pat4->match("\\a")); + for (size_t I = 0; I != 4; ++I) { std::string S(I, '\\'); Expected Pat = GlobPattern::create(S); @@ -122,12 +133,15 @@ } TEST_F(GlobPatternTest, SpecialCharsInCharacterClass) { - Expected Pat1 = GlobPattern::create("[*?^]"); - EXPECT_TRUE((bool)Pat1); + auto Pat1 = GlobPattern::create("[*?^{},]"); + ASSERT_TRUE((bool)Pat1); EXPECT_TRUE(Pat1->match("*")); EXPECT_TRUE(Pat1->match("?")); EXPECT_TRUE(Pat1->match("^")); - EXPECT_FALSE(Pat1->match("*?^")); + EXPECT_TRUE(Pat1->match("{")); + EXPECT_TRUE(Pat1->match("}")); + EXPECT_TRUE(Pat1->match(",")); + EXPECT_FALSE(Pat1->match("*?^{},")); EXPECT_FALSE(Pat1->match("")); Expected Pat2 = GlobPattern::create("[*]"); @@ -137,13 +151,73 @@ } TEST_F(GlobPatternTest, Invalid) { - Expected Pat1 = GlobPattern::create("["); + for (const auto &InvalidPattern : {"[", "[]"}) { + auto Pat1 = GlobPattern::create(InvalidPattern); + EXPECT_FALSE((bool)Pat1) << "Expected invalid pattern: " << InvalidPattern; + handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {}); + } +} + +TEST_F(GlobPatternTest, InvalidBraceExpansion) { + for (const auto &InvalidPattern : + {"{", "{{", "{\\", "{\\}", "{}", "{a}", "[{}"}) { + auto Pat1 = GlobPattern::create(InvalidPattern, /*MaxSubPatterns=*/1024); + EXPECT_FALSE((bool)Pat1) << "Expected invalid pattern: " << InvalidPattern; + handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {}); + } + auto Pat1 = GlobPattern::create("{a,b}{c,d}{e,f}", /*MaxSubPatterns=*/7); EXPECT_FALSE((bool)Pat1); handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {}); +} - Expected Pat2 = GlobPattern::create("[]"); - EXPECT_FALSE((bool)Pat2); - handleAllErrors(Pat2.takeError(), [&](ErrorInfoBase &EIB) {}); +TEST_F(GlobPatternTest, BraceExpansion) { + auto Pat1 = GlobPattern::create("{a,b}{1,2}", /*MaxSubPatterns=*/1024); + ASSERT_TRUE((bool)Pat1); + EXPECT_TRUE(Pat1->match("a1")); + EXPECT_TRUE(Pat1->match("a2")); + EXPECT_TRUE(Pat1->match("b1")); + EXPECT_TRUE(Pat1->match("b2")); + EXPECT_FALSE(Pat1->match("ab")); + + auto Pat2 = GlobPattern::create(",}{foo,\\,\\},z*}", /*MaxSubPatterns=*/1024); + ASSERT_TRUE((bool)Pat2); + EXPECT_TRUE(Pat2->match(",}foo")); + EXPECT_TRUE(Pat2->match(",},}")); + EXPECT_TRUE(Pat2->match(",}z")); + EXPECT_TRUE(Pat2->match(",}zoo")); + EXPECT_FALSE(Pat2->match(",}fooz")); + EXPECT_FALSE(Pat2->match("foo")); + EXPECT_FALSE(Pat2->match("")); + + // This test breaks if we store terms separately and attempt to match them one + // by one instead of using subglobs + auto Pat3 = GlobPattern::create("{a,ab}b", /*MaxSubPatterns=*/1024); + ASSERT_TRUE((bool)Pat3); + EXPECT_TRUE(Pat3->match("ab")); + EXPECT_TRUE(Pat3->match("abb")); +} + +TEST_F(GlobPatternTest, NoBraceExpansion) { + auto Pat1 = GlobPattern::create("{a,b}{1,2}"); + ASSERT_TRUE((bool)Pat1); + EXPECT_TRUE(Pat1->match("{a,b}{1,2}")); + EXPECT_FALSE(Pat1->match("a1")); + + auto Pat2 = GlobPattern::create("{{"); + ASSERT_TRUE((bool)Pat2); + EXPECT_TRUE(Pat2->match("{{")); +} + +TEST_F(GlobPatternTest, BraceExpansionCharacterClass) { + // Matches mangled names of C++ standard library functions + auto Pat = + GlobPattern::create("_Z{N,NK,}S[tabsiod]*", /*MaxSubPatterns=*/1024); + ASSERT_TRUE((bool)Pat); + EXPECT_TRUE(Pat->match("_ZNSt6vectorIiSaIiEE9push_backEOi")); + EXPECT_TRUE(Pat->match("_ZNKStfoo")); + EXPECT_TRUE(Pat->match("_ZNSafoo")); + EXPECT_TRUE(Pat->match("_ZStfoo")); + EXPECT_FALSE(Pat->match("_Zfoo")); } TEST_F(GlobPatternTest, ExtSym) { @@ -169,7 +243,7 @@ } TEST_F(GlobPatternTest, NUL) { - for (char C : "?*{") { + for (char C : "?*") { std::string S(1, C); Expected Pat = GlobPattern::create(S); ASSERT_TRUE((bool)Pat); @@ -185,13 +259,14 @@ TEST_F(GlobPatternTest, Pathological) { std::string P, S(40, 'a'); + StringRef Pieces[] = {"a*", "[ba]*", "{b*,a*}*"}; for (int I = 0; I != 30; ++I) - P += I % 2 ? "a*" : "[ba]*"; - Expected Pat = GlobPattern::create(P); + P += Pieces[I % 3]; + Expected Pat = GlobPattern::create(P, /*MaxSubPatterns=*/1024); ASSERT_TRUE((bool)Pat); EXPECT_TRUE(Pat->match(S)); P += 'b'; - Pat = GlobPattern::create(P); + Pat = GlobPattern::create(P, /*MaxSubPatterns=*/1024); ASSERT_TRUE((bool)Pat); EXPECT_FALSE(Pat->match(S)); EXPECT_TRUE(Pat->match(S + 'b'));