diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp --- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp +++ b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp @@ -25,6 +25,7 @@ #include "clang-pseudo/Forest.h" #include "clang-pseudo/GLR.h" #include "clang-pseudo/Token.h" +#include "clang-pseudo/cli/CLI.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" #include "clang/Basic/LangOptions.h" @@ -39,9 +40,6 @@ using llvm::cl::opt; using llvm::cl::Required; -static opt GrammarFile("grammar", - desc("Parse and check a BNF grammar file."), - Required); static opt Source("source", desc("Source file"), Required); namespace clang { @@ -49,11 +47,9 @@ namespace bench { namespace { -const std::string *GrammarText = nullptr; const std::string *SourceText = nullptr; -const Grammar *G = nullptr; -void setupGrammarAndSource() { +void setupSource() { auto ReadFile = [](llvm::StringRef FilePath) -> std::string { llvm::ErrorOr> GrammarText = llvm::MemoryBuffer::getFile(FilePath); @@ -64,22 +60,12 @@ } return GrammarText.get()->getBuffer().str(); }; - GrammarText = new std::string(ReadFile(GrammarFile)); SourceText = new std::string(ReadFile(Source)); - std::vector Diags; - G = Grammar::parseBNF(*GrammarText, Diags).release(); } -static void parseBNF(benchmark::State &State) { - std::vector Diags; - for (auto _ : State) - Grammar::parseBNF(*GrammarText, Diags); -} -BENCHMARK(parseBNF); - static void buildSLR(benchmark::State &State) { for (auto _ : State) - LRTable::buildSLR(*G); + LRTable::buildSLR(cli::getLanguage().G); } BENCHMARK(buildSLR); @@ -129,13 +115,14 @@ BENCHMARK(preprocess); static void glrParse(benchmark::State &State) { - LRTable Table = clang::pseudo::LRTable::buildSLR(*G); - SymbolID StartSymbol = *G->findNonterminal("translation-unit"); + SymbolID StartSymbol = + *cli::getLanguage().G.findNonterminal("translation-unit"); TokenStream Stream = lexAndPreprocess(); 
for (auto _ : State) { pseudo::ForestArena Forest; pseudo::GSS GSS; - pseudo::glrParse(Stream, ParseParams{*G, Table, Forest, GSS}, StartSymbol); + pseudo::glrParse(Stream, ParseParams{cli::getLanguage(), Forest, GSS}, + StartSymbol); } State.SetBytesProcessed(static_cast(State.iterations()) * SourceText->size()); @@ -143,14 +130,14 @@ BENCHMARK(glrParse); static void full(benchmark::State &State) { - LRTable Table = clang::pseudo::LRTable::buildSLR(*G); - SymbolID StartSymbol = *G->findNonterminal("translation-unit"); + SymbolID StartSymbol = + *cli::getLanguage().G.findNonterminal("translation-unit"); for (auto _ : State) { TokenStream Stream = lexAndPreprocess(); pseudo::ForestArena Forest; pseudo::GSS GSS; - pseudo::glrParse(lexAndPreprocess(), ParseParams{*G, Table, Forest, GSS}, - StartSymbol); + pseudo::glrParse(lexAndPreprocess(), + ParseParams{cli::getLanguage(), Forest, GSS}, StartSymbol); } State.SetBytesProcessed(static_cast(State.iterations()) * SourceText->size()); @@ -165,7 +152,7 @@ int main(int argc, char *argv[]) { benchmark::Initialize(&argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv); - clang::pseudo::bench::setupGrammarAndSource(); + clang::pseudo::bench::setupSource(); benchmark::RunSpecifiedBenchmarks(); return 0; } diff --git a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt b/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt --- a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt +++ b/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt @@ -3,6 +3,7 @@ target_link_libraries(ClangPseudoBenchmark PRIVATE clangPseudo + clangPseudoCLI clangPseudoGrammar LLVMSupport ) diff --git a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt b/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt --- a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt +++ b/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt @@ -11,5 +11,6 @@ target_link_libraries(clang-pseudo-fuzzer PRIVATE clangPseudo + clangPseudoCLI clangPseudoGrammar ) diff --git 
a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp --- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp +++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp @@ -10,6 +10,7 @@ #include "clang-pseudo/Forest.h" #include "clang-pseudo/GLR.h" #include "clang-pseudo/Token.h" +#include "clang-pseudo/cli/CLI.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" #include "clang/Basic/LangOptions.h" @@ -24,28 +25,11 @@ class Fuzzer { clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); - std::unique_ptr G; LRTable T; bool Print; public: - Fuzzer(llvm::StringRef GrammarPath, bool Print) : Print(Print) { - llvm::ErrorOr> GrammarText = - llvm::MemoryBuffer::getFile(GrammarPath); - if (std::error_code EC = GrammarText.getError()) { - llvm::errs() << "Error: can't read grammar file '" << GrammarPath - << "': " << EC.message() << "\n"; - std::exit(1); - } - std::vector Diags; - G = Grammar::parseBNF(GrammarText->get()->getBuffer(), Diags); - if (!Diags.empty()) { - for (const auto &Diag : Diags) - llvm::errs() << Diag << "\n"; - std::exit(1); - } - T = LRTable::buildSLR(*G); - } + Fuzzer(bool Print) : Print(Print) {} void operator()(llvm::StringRef Code) { std::string CodeStr = Code.str(); // Must be null-terminated. 
@@ -59,10 +43,11 @@ clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; auto &Root = - glrParse(ParseableStream, clang::pseudo::ParseParams{*G, T, Arena, GSS}, - *G->findNonterminal("translation-unit")); + glrParse(ParseableStream, + clang::pseudo::ParseParams{cli::getLanguage(), Arena, GSS}, + *cli::getLanguage().G.findNonterminal("translation-unit")); if (Print) - llvm::outs() << Root.dumpRecursive(*G); + llvm::outs() << Root.dumpRecursive(cli::getLanguage().G); } }; @@ -75,16 +60,11 @@ extern "C" { // Set up the fuzzer from command line flags: -// -grammar= (required) - path to cxx.bnf // -print - used for testing the fuzzer int LLVMFuzzerInitialize(int *Argc, char ***Argv) { - llvm::StringRef GrammarFile; bool PrintForest = false; auto ConsumeArg = [&](llvm::StringRef Arg) -> bool { - if (Arg.consume_front("-grammar=")) { - GrammarFile = Arg; - return true; - } else if (Arg == "-print") { + if (Arg == "-print") { PrintForest = true; return true; } @@ -92,11 +72,7 @@ }; *Argc = std::remove_if(*Argv + 1, *Argv + *Argc, ConsumeArg) - *Argv; - if (GrammarFile.empty()) { - fprintf(stderr, "Fuzzer needs -grammar=/path/to/cxx.bnf\n"); - exit(1); - } - clang::pseudo::Fuzz = new clang::pseudo::Fuzzer(GrammarFile, PrintForest); + clang::pseudo::Fuzz = new clang::pseudo::Fuzzer(PrintForest); return 0; } diff --git a/clang-tools-extra/pseudo/gen/Main.cpp b/clang-tools-extra/pseudo/gen/Main.cpp --- a/clang-tools-extra/pseudo/gen/Main.cpp +++ b/clang-tools-extra/pseudo/gen/Main.cpp @@ -79,6 +79,14 @@ switch (Emit) { case EmitSymbolList: + Out.os() << R"cpp( +#ifndef NONTERMINAL +#define NONTERMINAL(X, Y) +#endif +#ifndef EXTENSION +#define EXTENSION(X, Y) +#endif + )cpp"; for (clang::pseudo::SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID) { std::string Name = G->symbolName(ID).str(); @@ -86,6 +94,16 @@ std::replace(Name.begin(), Name.end(), '-', '_'); Out.os() << llvm::formatv("NONTERMINAL({0}, {1})\n", Name, ID); } + for (clang::pseudo::ExtensionID AID = 1 
/*skip the sentinel 0 value*/; + AID < G->table().AttributeValues.size(); ++AID) { + llvm::StringRef Name = G->table().AttributeValues[AID]; + assert(!Name.empty()); + Out.os() << llvm::formatv("EXTENSION({0}, {1})\n", Name, AID); + } + Out.os() << R"cpp( +#undef NONTERMINAL +#undef EXTENSION + )cpp"; break; case EmitGrammarContent: for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) { diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -30,6 +30,7 @@ #define CLANG_PSEUDO_GLR_H #include "clang-pseudo/Forest.h" +#include "clang-pseudo/Language.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" #include "llvm/Support/Allocator.h" @@ -113,12 +114,7 @@ // Parameters for the GLR parsing. struct ParseParams { - // The grammar of the language we're going to parse. - const Grammar &G; - // The LR table which GLR uses to parse the input, should correspond to the - // Grammar G. - const LRTable &Table; - + const ParseLang &Lang; // Arena for data structure used by the GLR algorithm. ForestArena &Forest; // Storage for the output forest. GSS &GSStack; // Storage for parsing stacks. @@ -159,7 +155,7 @@ // // Exposed for testing only. void glrReduce(std::vector &PendingReduce, const ParseParams &Params, - NewHeadCallback NewHeadCB); + const TokenStream &Tokens, NewHeadCallback NewHeadCB); } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h @@ -0,0 +1,47 @@ +//===--- Language.h -------------------------------------------- -*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_PSEUDO_LANGUAGE_H +#define CLANG_PSEUDO_LANGUAGE_H + +#include "clang-pseudo/grammar/Grammar.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLFunctionalExtras.h" + +namespace clang { +namespace pseudo { +class ForestNode; +class TokenStream; +class LRTable; + +// Interface for implementing the grammar "guard" attribute. +// +// It is used by the GLR parser to determine whether a reduction of a rule will +// be conducted during the reduce time. +// +// Returns false if the reduction is not conducted (this parsing branch in GLR +// will die). +using Guard = llvm::function_ref RHS, + const TokenStream &)>; + +// Specify a language that can be parsed by the pseudoparser. +// Manifest generated from a bnf grammar file. +struct ParseLang { + const Grammar &G; + const LRTable &Table; + // Binding "guard" extension id to a piece of C++ code. + const llvm::DenseMap &Guards; + + // FIXME: add clang::LangOptions. + // FIXME: add default start symbols. +}; + +} // namespace pseudo +} // namespace clang + +#endif // CLANG_PSEUDO_LANGUAGE_H \ No newline at end of file diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h b/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h @@ -0,0 +1,31 @@ +//===--- CLI.h - Get grammar from variant sources ----------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A library shared among different pseudoparser-based tools. 
It provides a +// uniform way to get basic pieces of the parser (Grammar, LRTable etc) from +// variant grammar sources. +// It defines a `--grammar` CLI flag, which supports 1) using a grammar from a +// file (--grammar=/path/to/lang.bnf) or using the prebuilt cxx language +// (--grammar=cxx). +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_PSEUDO_CLI_CLI_H +#define CLANG_PSEUDO_CLI_CLI_H + +namespace clang { +namespace pseudo { +struct ParseLang; +namespace cli { +// Returns the corresponding language from the '--grammar' command-line flag. +const ParseLang &getLanguage(); +} // namespace cli +} // namespace pseudo +} // namespace clang + +#endif // CLANG_PSEUDO_CLI_CLI_H diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h b/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h @@ -28,7 +28,7 @@ namespace clang { namespace pseudo { class LRTable; - +class ParseLang; namespace cxx { // Symbol represents nonterminal symbols in the C++ grammar. // It provides a simple uniform way to access a particular nonterminal. @@ -38,10 +38,13 @@ #undef NONTERMINAL }; -// Returns the C++ grammar. -const Grammar &getGrammar(); -// Returns the corresponding LRTable for the C++ grammar. 
-const LRTable &getLRTable(); +enum class Extension : ExtensionID { +#define EXTENSION(X, Y) X = Y, +#include "CXXSymbols.inc" +#undef EXTENSION +}; + +const ParseLang &getLanguage(); } // namespace cxx diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(cli) add_subdirectory(cxx) add_subdirectory(grammar) diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -41,14 +41,14 @@ SymbolID StartSymbol) { assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal"); llvm::ArrayRef Terminals = Params.Forest.createTerminals(Tokens); - auto &G = Params.G; + auto &G = Params.Lang.G; (void)G; auto &GSS = Params.GSStack; // Lists of active shift, reduce actions. std::vector PendingShift, PendingReduce; auto AddSteps = [&](const GSS::Node *Head, SymbolID NextTok) { - for (const auto &Action : Params.Table.getActions(Head->State, NextTok)) { + for (const auto &Action : Params.Lang.Table.getActions(Head->State, NextTok)) { switch (Action.kind()) { case LRTable::Action::Shift: PendingShift.push_back({Head, Action}); @@ -61,7 +61,7 @@ } } }; - StateID StartState = Params.Table.getStartState(StartSymbol); + StateID StartState = Params.Lang.Table.getStartState(StartSymbol); std::vector NewHeads = { GSS.addNode(/*State=*/StartState, /*ForestNode=*/nullptr, {})}; @@ -84,7 +84,7 @@ for (const auto *Head : NewHeads) AddSteps(Head, Terminal.symbol()); NewHeads.clear(); - glrReduce(PendingReduce, Params, + glrReduce(PendingReduce, Params, Tokens, [&](const GSS::Node * NewHead) { // A reduce will enable more steps. 
AddSteps(NewHead, Terminal.symbol()); @@ -98,10 +98,10 @@ for (const auto *Heads : NewHeads) AddSteps(Heads, tokenSymbol(tok::eof)); - StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol); + StateID AcceptState = Params.Lang.Table.getGoToState(StartState, StartSymbol); // Collect new heads created from the final reduce. std::vector Heads; - glrReduce(PendingReduce, Params, [&](const GSS::Node *NewHead) { + glrReduce(PendingReduce, Params, Tokens, [&](const GSS::Node *NewHead) { Heads.push_back(NewHead); // A reduce will enable more steps. AddSteps(NewHead, tokenSymbol(tok::eof)); @@ -147,7 +147,7 @@ }) && "Pending shift actions must be shift actions"); LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n", - Params.G.symbolName(NewTok.symbol()), + Params.Lang.G.symbolName(NewTok.symbol()), PendingShift.size())); // We group pending shifts by their target state so we can merge them. @@ -232,6 +232,7 @@ // 2 by`enum-name := class-name STAR`: // 0--5(pointer) // 5 is goto(0, pointer) void glrReduce(std::vector &PendingReduce, const ParseParams &Params, + const TokenStream& Tokens, NewHeadCallback NewHeadCB) { // There are two interacting complications: // 1. Performing one reduce can unlock new reduces on the newly-created head. @@ -294,12 +295,18 @@ // Pop walks up the parent chain(s) for a reduction from Head by to Rule. // Once we reach the end, record the bases and sequences. 
auto Pop = [&](const GSS::Node *Head, RuleID RID) { - LLVM_DEBUG(llvm::dbgs() << " Pop " << Params.G.dumpRule(RID) << "\n"); - const auto &Rule = Params.G.lookupRule(RID); + LLVM_DEBUG(llvm::dbgs() << " Pop " << Params.Lang.G.dumpRule(RID) << "\n"); + const auto &Rule = Params.Lang.G.lookupRule(RID); Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID}; TempSequence.resize_for_overwrite(Rule.Size); auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) { if (I == Rule.Size) { + if (Rule.Guard) { + auto It = Params.Lang.Guards.find(Rule.Guard); + assert(It != Params.Lang.Guards.end() && "missing guard!"); + if (!It->getSecond()(TempSequence, Tokens)) + return; + } F.Start = TempSequence.front()->startTokenIndex(); LLVM_DEBUG(llvm::dbgs() << " --> base at S" << N->State << "\n"); Sequences.emplace(F, PushSpec{N, TempSequence}); @@ -331,7 +338,7 @@ while (!Sequences.empty()) { Family F = Sequences.top().first; - LLVM_DEBUG(llvm::dbgs() << " Push " << Params.G.symbolName(F.Symbol) + LLVM_DEBUG(llvm::dbgs() << " Push " << Params.Lang.G.symbolName(F.Symbol) << " from token " << F.Start << "\n"); // Grab the sequences and bases for this family. @@ -344,7 +351,7 @@ FamilySequences.emplace_back(Sequences.top().first.Rule, Sequences.top().second.Seq); FamilyBases.emplace_back( - Params.Table.getGoToState(Sequences.top().second.Base->State, + Params.Lang.Table.getGoToState(Sequences.top().second.Base->State, F.Symbol), Sequences.top().second.Base); @@ -362,7 +369,7 @@ SequenceNodes.size() == 1 ? SequenceNodes.front() : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes); - LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Params.G) << "\n"); + LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Params.Lang.G) << "\n"); // Bases for this family, deduplicate them, and group by the goTo State. 
sortAndUnique(FamilyBases); diff --git a/clang-tools-extra/pseudo/lib/cli/CLI.cpp b/clang-tools-extra/pseudo/lib/cli/CLI.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/lib/cli/CLI.cpp @@ -0,0 +1,67 @@ +//===--- CLI.cpp - ----------------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +#include "clang-pseudo/cli/CLI.h" +#include "clang-pseudo/Language.h" +#include "clang-pseudo/cxx/CXX.h" +#include "clang-pseudo/grammar/LRTable.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include + +static llvm::cl::opt + Grammar("grammar", + llvm::cl::desc( + "Specify a BNF grammar file path, or builtin language (cxx)."), + llvm::cl::init("cxx")); + +namespace clang { +namespace pseudo { + +static bool alwaysAccept(llvm::ArrayRef RHS, + const TokenStream &) { + return true; +} + +namespace cli { +const ParseLang &getLanguage() { + if (::Grammar == "cxx") + return cxx::getLanguage(); + + static ParseLang *PL = [&]() { + // Read from a bnf grammar file. 
+ llvm::ErrorOr> GrammarText = + llvm::MemoryBuffer::getFile(::Grammar); + if (std::error_code EC = GrammarText.getError()) { + llvm::errs() << "Error: can't read grammar file '" << ::Grammar + << "': " << EC.message() << "\n"; + std::exit(1); + } + std::vector Diags; + auto G = Grammar::parseBNF(GrammarText->get()->getBuffer(), Diags); + if (!Diags.empty()) { + for (const auto &Diag : Diags) + llvm::errs() << Diag << "\n"; + std::exit(1); + } + + LRTable *Table = new LRTable(LRTable::buildSLR(*G)); + llvm::DenseMap *Guards = + new llvm::DenseMap(); + for (ExtensionID ID = 1; ID < G->table().AttributeValues.size(); ++ID) + Guards->insert(std::make_pair(ID, alwaysAccept)); + return new ParseLang{*G.release(), *Table, *Guards}; + }(); + return *PL; +} +} // namespace cli + +} // namespace pseudo +} // namespace clang diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt copy from clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt copy to clang-tools-extra/pseudo/lib/cli/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt @@ -2,12 +2,11 @@ Support ) -add_clang_library(clangPseudoCXX - CXX.cpp - - DEPENDS - cxx_gen +add_clang_library(clangPseudoCLI + CLI.cpp LINK_LIBS + clangPseudo clangPseudoGrammar + clangPseudoCXX ) diff --git a/clang-tools-extra/pseudo/lib/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx.bnf --- a/clang-tools-extra/pseudo/lib/cxx.bnf +++ b/clang-tools-extra/pseudo/lib/cxx.bnf @@ -739,8 +739,8 @@ #! Contextual keywords -- clang lexer always lexes them as identifier tokens. #! Placeholders for literal text in the grammar that lex as other things. 
-contextual-override := IDENTIFIER -contextual-final := IDENTIFIER +contextual-override := IDENTIFIER [guard=Override] +contextual-final := IDENTIFIER [guard=Final] contextual-zero := NUMERIC_CONSTANT module-keyword := IDENTIFIER import-keyword := IDENTIFIER diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt @@ -10,4 +10,5 @@ LINK_LIBS clangPseudoGrammar + clangPseudo ) diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp --- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp +++ b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp @@ -7,16 +7,19 @@ //===----------------------------------------------------------------------===// #include "clang-pseudo/cxx/CXX.h" +#include "clang-pseudo/Forest.h" +#include "clang-pseudo/Language.h" +#include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" +#include namespace clang { namespace pseudo { namespace cxx { - +namespace { static const char *CXXBNF = #include "CXXBNF.inc" ; - const Grammar &getGrammar() { static std::vector Diags; static Grammar *G = Grammar::parseBNF(CXXBNF, Diags).release(); @@ -29,6 +32,29 @@ return *Table; } +bool guardOverride(llvm::ArrayRef RHS, + const TokenStream &Tokens) { + assert(RHS.size() == 1 && + RHS.front()->symbol() == tokenSymbol(clang::tok::identifier)); + return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "override"; +} +bool guardFinal(llvm::ArrayRef RHS, + const TokenStream &Tokens) { + assert(RHS.size() == 1 && + RHS.front()->symbol() == tokenSymbol(clang::tok::identifier)); + return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "final"; +} +} // namespace + +const ParseLang &getLanguage() { + static llvm::DenseMap *Guards = + new llvm::DenseMap( + {{(ExtensionID)Extension::Override, guardOverride}, + {(ExtensionID)Extension::Final, 
guardFinal}}); + static ParseLang *L = new ParseLang{getGrammar(), getLRTable(), *Guards}; + return *L; +} + } // namespace cxx } // namespace pseudo } // namespace clang diff --git a/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp b/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp @@ -0,0 +1,9 @@ +// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s +// Verify that the contextual-{final,override} rules are guarded conditionally, +// No ambiguous parsing for the virt-specifier. +class Foo { + void foo1() override; +// CHECK: virt-specifier-seq~IDENTIFIER := tok[7] + void foo2() final; +// CHECK: virt-specifier-seq~IDENTIFIER := tok[13] +}; \ No newline at end of file diff --git a/clang-tools-extra/pseudo/tool/CMakeLists.txt b/clang-tools-extra/pseudo/tool/CMakeLists.txt --- a/clang-tools-extra/pseudo/tool/CMakeLists.txt +++ b/clang-tools-extra/pseudo/tool/CMakeLists.txt @@ -13,5 +13,6 @@ PRIVATE clangPseudo clangPseudoGrammar + clangPseudoCLI ) diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -10,6 +10,7 @@ #include "clang-pseudo/DirectiveTree.h" #include "clang-pseudo/GLR.h" #include "clang-pseudo/Token.h" +#include "clang-pseudo/cli/CLI.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRGraph.h" #include "clang-pseudo/grammar/LRTable.h" @@ -20,14 +21,11 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Signals.h" -using clang::pseudo::Grammar; using clang::pseudo::TokenStream; using llvm::cl::desc; using llvm::cl::init; using llvm::cl::opt; -static opt - Grammar("grammar", desc("Parse and check a BNF grammar file."), init("")); static opt PrintGrammar("print-grammar", desc("Print the grammar.")); static opt 
PrintGraph("print-graph", desc("Print the LR graph for the grammar")); @@ -93,42 +91,34 @@ pairBrackets(*ParseableStream); } - if (Grammar.getNumOccurrences()) { - std::string Text = readOrDie(Grammar); - std::vector Diags; - auto G = Grammar::parseBNF(Text, Diags); - - if (!Diags.empty()) { - llvm::errs() << llvm::join(Diags, "\n"); - return 2; - } - llvm::outs() << llvm::formatv("grammar file {0} is parsed successfully\n", - Grammar); + if (true) { + const auto &Lang = clang::pseudo::cli::getLanguage(); if (PrintGrammar) - llvm::outs() << G->dump(); + llvm::outs() << Lang.G.dump(); if (PrintGraph) - llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G); - auto LRTable = clang::pseudo::LRTable::buildSLR(*G); + llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests( + Lang.G); + if (PrintTable) - llvm::outs() << LRTable.dumpForTests(*G); + llvm::outs() << Lang.Table.dumpForTests(Lang.G); if (PrintStatistics) - llvm::outs() << LRTable.dumpStatistics(); + llvm::outs() << Lang.Table.dumpStatistics(); if (ParseableStream) { clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; llvm::Optional StartSymID = - G->findNonterminal(StartSymbol); + Lang.G.findNonterminal(StartSymbol); if (!StartSymID) { llvm::errs() << llvm::formatv( - "The start symbol {0} doesn't exit in the grammar!\n", Grammar); + "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol); return 2; } - auto &Root = glrParse(*ParseableStream, - clang::pseudo::ParseParams{*G, LRTable, Arena, GSS}, - *StartSymID); + auto &Root = + glrParse(*ParseableStream, + clang::pseudo::ParseParams{Lang, Arena, GSS}, *StartSymID); if (PrintForest) - llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true); + llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/true); if (PrintStatistics) { llvm::outs() << "Forest bytes: " << Arena.bytes() diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp --- 
a/clang-tools-extra/pseudo/unittests/GLRTest.cpp +++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp @@ -48,7 +48,15 @@ std::vector Diags; G = Grammar::parseBNF(GrammarBNF, Diags); } - + // FIXME: move to TokenStream class. + TokenStream emptyTokenStream() { + TokenStream Empty; + Empty.finalize(); + return Empty; + } + ParseLang getTestLang() { + return {*G, Table, Guards}; + } void buildGrammar(std::vector Nonterminals, std::vector Rules) { Nonterminals.push_back("_"); @@ -72,7 +80,13 @@ ADD_FAILURE() << "No such symbol found: " << Name; return 0; } - + ExtensionID extensionID(llvm::StringRef AttrValueName) const { + for (unsigned I = 0; I < G->table().AttributeValues.size(); ++I) + if (G->table().AttributeValues[I] == AttrValueName) + return static_cast(I); + ADD_FAILURE() << "No such attribute value found: " << AttrValueName; + return 0; + } RuleID ruleFor(llvm::StringRef NonterminalName) const { auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange; if (RuleRange.End - RuleRange.Start == 1) @@ -91,6 +105,8 @@ protected: std::unique_ptr G; + LRTable Table; + llvm::DenseMap Guards; ForestArena Arena; GSS GSStack; std::vector NewHeadResults; @@ -117,7 +133,7 @@ /*Parents=*/{GSSNode0}); buildGrammar({}, {}); // Create a fake empty grammar. 
- LRTable T = LRTable::buildForTests(G->table(), /*Entries=*/{}); + Table = LRTable::buildForTests(G->table(), /*Entries=*/{}); ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0); std::vector PendingShift = { @@ -125,7 +141,7 @@ {GSSNode3, Action::shift(5)}, {GSSNode2, Action::shift(4)}, }; - glrShift(PendingShift, SemiTerminal, {*G, T, Arena, GSStack}, + glrShift(PendingShift, SemiTerminal, {getTestLang(), Arena, GSStack}, captureNewHeads()); EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( @@ -146,7 +162,7 @@ buildGrammar({"class-name", "enum-name"}, {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"}); - LRTable Table = LRTable::buildForTests( + Table = LRTable::buildForTests( G->table(), {{/*State=*/0, id("class-name"), Action::goTo(2)}, {/*State=*/0, id("enum-name"), Action::goTo(3)}}); @@ -158,7 +174,7 @@ std::vector PendingReduce = { {GSSNode1, Action::reduce(ruleFor("class-name"))}, {GSSNode1, Action::reduce(ruleFor("enum-name"))}}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce, {getTestLang(), Arena, GSStack}, emptyTokenStream(), captureNewHeads()); EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( @@ -189,13 +205,13 @@ /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1), /*Parents=*/{GSSNode2, GSSNode3}); - LRTable Table = LRTable::buildForTests( + Table = LRTable::buildForTests( G->table(), {{/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)}, {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)}}); std::vector PendingReduce = { {GSSNode4, Action::reduce(ruleFor("ptr-operator"))}}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce, {getTestLang(), Arena, GSStack}, emptyTokenStream(), captureNewHeads()); EXPECT_THAT(NewHeadResults, @@ -238,7 +254,7 @@ GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode, /*Parents=*/{GSSNode2}); - LRTable Table = LRTable::buildForTests( + Table = LRTable::buildForTests( G->table(), 
{{/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)}, {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)}}); @@ -250,7 +266,7 @@ { GSSNode4, Action::reduce(/*RuleID=*/1) // type-name := enum-name }}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce, {getTestLang(), Arena, GSStack}, emptyTokenStream(), captureNewHeads()); // Verify that the stack heads are joint at state 5 after reduces. @@ -296,7 +312,7 @@ GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal, /*Parents=*/{GSSNode2}); - LRTable Table = LRTable::buildForTests( + Table = LRTable::buildForTests( G->table(), {{/*State=*/0, id("pointer"), Action::goTo(5)}}); // FIXME: figure out a way to get rid of the hard-coded reduce RuleID! std::vector PendingReduce = { @@ -306,7 +322,7 @@ { GSSNode4, Action::reduce(/*RuleID=*/1) // pointer := enum-name * }}; - glrReduce(PendingReduce, {*G, Table, Arena, GSStack}, + glrReduce(PendingReduce,{getTestLang(), Arena, GSStack}, emptyTokenStream(), captureNewHeads()); EXPECT_THAT(NewHeadResults, testing::UnorderedElementsAre( @@ -340,12 +356,12 @@ left-paren := { expr := IDENTIFIER )bnf"); + Table = LRTable::buildSLR(*G); clang::LangOptions LOptions; const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions); - auto LRTable = LRTable::buildSLR(*G); const ForestNode &Parsed = - glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); + glrParse(Tokens, {getTestLang(), Arena, GSStack}, id("test")); // Verify that there is no duplicated sequence node of `expr := IDENTIFIER` // in the forest, see the `#1` and `=#1` in the dump string. 
EXPECT_EQ(Parsed.dumpRecursive(*G), @@ -380,10 +396,10 @@ )bnf"); clang::LangOptions LOptions; const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions); - auto LRTable = LRTable::buildSLR(*G); + Table = LRTable::buildSLR(*G); const ForestNode &Parsed = - glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); + glrParse(Tokens, {getTestLang(), Arena, GSStack}, id("test")); EXPECT_EQ(Parsed.dumpRecursive(*G), "[ 0, end) test := \n" "[ 0, end) ├─test := IDENTIFIER\n" @@ -405,10 +421,10 @@ // of the nonterminal `test` when the next token is `eof`, verify that the // parser stops at the right state. const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions); - auto LRTable = LRTable::buildSLR(*G); + Table = LRTable::buildSLR(*G); const ForestNode &Parsed = - glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test")); + glrParse(Tokens, {getTestLang(), Arena, GSStack}, id("test")); EXPECT_EQ(Parsed.dumpRecursive(*G), "[ 0, end) test := IDENTIFIER test\n" "[ 0, 1) ├─IDENTIFIER := tok[0]\n" @@ -416,6 +432,36 @@ "[ 1, end) └─IDENTIFIER := tok[1]\n"); } +TEST_F(GLRTest, GuardExtension) { + build(R"bnf( + _ := start + + start := IDENTIFIER [guard=TestOnly] + )bnf"); + Guards.insert(std::make_pair( + extensionID("TestOnly"), + [&](llvm::ArrayRef RHS, const TokenStream &Tokens) { + assert(RHS.size() == 1 && + RHS.front()->symbol() == tokenSymbol(clang::tok::identifier)); + return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "test"; + })); + clang::LangOptions LOptions; + Table = LRTable::buildSLR(*G); + + std::string Input = "test"; + const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions); + EXPECT_EQ(glrParse(Succeeded, {getTestLang(), Arena, GSStack}, id("start")) + .dumpRecursive(*G), + "[ 0, end) start := IDENTIFIER [guard=TestOnly]\n" + "[ 0, end) └─IDENTIFIER := tok[0]\n"); + + Input = "notest"; + const TokenStream &Failed = cook(lex(Input, LOptions), LOptions); + EXPECT_EQ(glrParse(Failed, {getTestLang(), Arena, 
GSStack}, id("start")) + .dumpRecursive(*G), + "[ 0, end) start := \n"); +} + TEST(GSSTest, GC) { // ┌-A-┬-AB // ├-B-┘