diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h @@ -19,6 +19,20 @@ // production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either // nonterminal or terminal, identified by a SymbolID. // +// The grammar supports extensions, which have the syntax form of +// [key=value;key=value]. Extensions are associated with a grammar symbol ( +// on the right-hand side of the symbol) or a grammar rule (at the end of the +// rule body). +// +// Extensions provide a way to inject custom code into the general GLR +// parser. For example, we have a rule in the C++ grammar: +// +// contextual-override := IDENTIFIER [guard=Override] +// +// This rule is guarded -- the reduction of this rule will be conducted by the +// GLR parser only if the IDENTIFIER content is `override` (see detail +// implementationin CXX.h). +// // Notions about the BNF grammar: // - "_" is the start symbol of the augmented grammar; // - single-line comment is supported, starting with a # @@ -69,6 +83,11 @@ return TokenFlag | static_cast(TK); } +// An extension uniquely identifies an extension in a grammar. +// It is the index into a table of extension values. +// NOTE: value among extensions must be unique even within different keys! +using ExtensionID = uint16_t; + // A RuleID uniquely identifies a production rule in a grammar. // It is an index into a table of rules. using RuleID = uint16_t; @@ -96,11 +115,16 @@ uint8_t Size : SizeBits; // Size of the Sequence SymbolID Sequence[MaxElements]; + // A guard extension controls whether a reduction of a rule will be conducted + // by the GLR parser. + // 0 is sentinel extension ID, indicating no extensions. + ExtensionID Guard = 0; + llvm::ArrayRef seq() const { return llvm::ArrayRef(Sequence, Size); } friend bool operator==(const Rule &L, const Rule &R) { - return L.Target == R.Target && L.seq() == R.seq(); + return L.Target == R.Target && L.seq() == R.seq() && L.Guard == R.Guard; } }; @@ -186,6 +210,9 @@ // A table of nonterminals, sorted by name. // SymbolID is the index of the table. std::vector Nonterminals; + // A table of extensions values, sorted by name. + // ExtensionID is the index of the table. + std::vector Extensions; }; } // namespace pseudo diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp --- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp @@ -61,6 +61,8 @@ OS << symbolName(R.Target) << " :="; for (SymbolID SID : R.seq()) OS << " " << symbolName(SID); + if (R.Guard) + OS << " [guard=" << T->Extensions[R.Guard] << "]"; return Result; } diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp --- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp +++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp @@ -47,6 +47,9 @@ // Assemble the name->ID and ID->nonterminal name maps. llvm::DenseSet UniqueNonterminals; llvm::DenseMap SymbolIds; + + llvm::DenseSet UniqueExtensionValues; + for (uint16_t I = 0; I < NumTerminals; ++I) SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I))); auto Consider = [&](llvm::StringRef Name) { @@ -55,8 +58,11 @@ }; for (const auto &Spec : Specs) { Consider(Spec.Target); - for (const RuleSpec::Element &Elt : Spec.Sequence) + for (const RuleSpec::Element &Elt : Spec.Sequence) { Consider(Elt.Symbol); + for (const auto& KV : Elt.Extensions) + UniqueExtensionValues.insert(KV.second); + } } llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) { T->Nonterminals.emplace_back(); @@ -68,6 +74,15 @@ const GrammarTable::Nonterminal &R) { return L.Name < R.Name; }); + // Add an empty string for the corresponding sentinel none extension. + T->Extensions.push_back(""); + llvm::for_each(UniqueExtensionValues, [&T](llvm::StringRef Name) { + T->Extensions.emplace_back(); + T->Extensions.back() = Name.str(); + }); + llvm::sort(T->Extensions); + assert(T->Extensions.front() == ""); + // Build name -> ID maps for nonterminals. for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID); @@ -87,6 +102,8 @@ Symbols.push_back(Lookup(Elt.Symbol)); T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols)); } + applyExtension(Specs, *T); + assert(T->Rules.size() < (1 << RuleBits) && "Too many rules to fit in RuleID bits!"); const auto &SymbolOrder = getTopologicalOrder(T.get()); @@ -164,6 +181,9 @@ llvm::StringRef Target; struct Element { llvm::StringRef Symbol; // Name of the symbol + // Extensions that are associated to the sequence symbol or rule. + std::vector> + Extensions; }; std::vector Sequence; @@ -204,11 +224,58 @@ Chunk = Chunk.trim(); if (Chunk.empty()) continue; // skip empty + if (Chunk.startswith("[") && Chunk.endswith("]")) { + if (Out.Sequence.empty()) + continue; + parseExtension(Chunk, Out.Sequence.back().Extensions); + continue; + } Out.Sequence.push_back({Chunk}); } return true; - }; + } + + bool parseExtension( + llvm::StringRef Content, + std::vector> &Out) { + assert(Content.startswith("[") && Content.endswith("]")); + for (llvm::StringRef ExtText : + llvm::split(Content.drop_front().drop_back(), ';')) { + auto KV = ExtText.split('='); + if (KV.first == ExtText) { // no separator in Line + Diagnostics.push_back( + llvm::formatv("Failed to parse extension '{0}': no separator =", + ExtText) + .str()); + return false; + } + Out.push_back({KV.first, KV.second.trim()}); + } + return true; + } + // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule. + void applyExtension(llvm::ArrayRef Specs, GrammarTable &T) { + assert(T.Rules.size() == Specs.size()); + assert(llvm::is_sorted(T.Extensions)); + auto LookupExtensionID = [&T](llvm::StringRef Name) { + const auto It = llvm::partition_point( + T.Extensions, [&](llvm::StringRef X) { return X < Name; }); + assert(It != T.Extensions.end() && *It == Name && + "Didn't find the symbol in AttrValues!"); + return It - T.Extensions.begin(); + }; + for (unsigned I = 0; I < Specs.size(); ++I) { + for (const auto &KV : Specs[I].Sequence.back().Extensions) { + if (KV.first == "guard") { + T.Rules[I].Guard = LookupExtensionID(KV.second); + continue; + } + Diagnostics.push_back( + llvm::formatv("Unknown extension key '{0}'", KV.first).str()); + } + } + } // Inlines all _opt symbols. // For example, a rule E := id +_opt id, after elimination, we have two diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp --- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp @@ -99,6 +99,14 @@ EXPECT_LT(ruleFor("x"), ruleFor("_")); } +TEST_F(GrammarTest, Annotation) { + build(R"bnf( + _ := IDENTIFIER [guard=override] + )bnf"); + ASSERT_TRUE(Diags.empty()); + EXPECT_TRUE(G->lookupRule(ruleFor("_")).Guard); +} + TEST_F(GrammarTest, Diagnostics) { build(R"cpp( _ := ,_opt @@ -110,6 +118,8 @@ # cycle a := b b := a + + _ := IDENTIFIER [guard=override;unknown=value] )cpp"); EXPECT_EQ(G->underscore(), id("_")); @@ -120,7 +130,8 @@ "Failed to parse 'invalid': no separator :=", "Token-like name IDENFIFIE is used as a nonterminal", "No rules for nonterminal: IDENFIFIE", - "The grammar contains a cycle involving symbol a")); + "The grammar contains a cycle involving symbol a", + "Unknown extension key 'unknown'")); } TEST_F(GrammarTest, FirstAndFollowSets) {