diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
new file mode 100644
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -0,0 +1,197 @@
+//===--- Grammar.h - grammar used by clang pseudo parser --------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines base structures for parsing & modeling a grammar for a
+// programming language:
+//
+//   # This is a fake C++ grammar
+//   _ := translation-unit
+//   translation-unit := declaration-seq_opt
+//   declaration-seq := declaration
+//   declaration-seq := declaration-seq declaration
+//
+// A grammar formally describes a language, and it is constructed from a set
+// of production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is
+// either non-terminal or terminal, identified by a SymbolID.
+//
+// Notions about the grammar:
+//  - "_" is the start symbol
+//  - single-line comments are supported, starting with a #
+//  - A rule describes how a nonterminal (left side of :=) is constructed;
+//    each rule occupies a single line in the grammar file
+//  - Terminals (also called tokens) correspond to the clang::TokenKind; they
+//    are written in the grammar like "IDENTIFIER", "USING", "+"
+//  - Nonterminals are specified with "lower-case" names in the grammar; they
+//    shouldn't be nullable (formed by an empty string), except the start
+//    symbol
+//  - optional symbols are supported (specified with a _opt suffix), and they
+//    will be eliminated during the grammar parsing stage
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+#define LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+// A SymbolID uniquely identifies a terminal/non-terminal symbol in a grammar.
+// Non-terminal IDs are indexes into a table of non-terminal symbols.
+// Terminal IDs correspond to the clang TokenKind enum.
+using SymbolID = uint16_t;
+// SymbolID is only 12 bits wide.
+// There are at most 2^11 terminals and 2^11 nonterminals.
+static constexpr unsigned SymbolBits = 12;
+// SymbolIDs with the top bit set are tokens/terminals.
+static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
+inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
+inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
+// The terminals are always the clang tok::TokenKind (not all are used).
+inline tok::TokenKind symbolToToken(SymbolID SID) {
+  assert(isToken(SID));
+  SID &= ~TokenFlag;
+  assert(SID < tok::NUM_TOKENS);
+  return static_cast<tok::TokenKind>(SID);
+}
+inline SymbolID tokenSymbol(tok::TokenKind TK) {
+  return TokenFlag | static_cast<SymbolID>(TK);
+}
+
+// A RuleID uniquely identifies a production rule in a grammar.
+// It is an index into a table of rules.
+using RuleID = uint16_t;
+// RuleID is only 12 bits wide.
+static constexpr unsigned RuleBits = 12;
+
+// Represents a production rule in the grammar, e.g.
+//   expression := a b c
+//   ^Target       ^Sequence
+struct Rule {
+  Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
+
+  static constexpr unsigned SizeBits = 4;
+  static_assert(SizeBits + SymbolBits <= 16,
+                "Must be able to store symbol ID + size efficiently");
+  static constexpr unsigned MaxElements = 1 << SizeBits;
+
+  // 16 bits for target symbol and size of sequence:
+  // SymbolID : 12 | Size : 4
+  struct {
+    SymbolID Target : SymbolBits;
+    uint8_t Size : SizeBits; // Size of the Sequence
+  } Data;
+  SymbolID Sequence[MaxElements];
+
+  SymbolID target() const { return Data.Target; }
+  uint8_t size() const { return Data.Size; }
+  llvm::ArrayRef<SymbolID> seq() const {
+    return llvm::ArrayRef<SymbolID>(Sequence, size());
+  }
+  friend bool operator==(const Rule &L, const Rule &R) {
+    return L.target() == R.target() && L.size() == R.size() &&
+           L.seq() == R.seq();
+  }
+};
+
+struct Table;
+
+// A grammar that describes a programming language, e.g. C++.
+// It parses BNF grammar rules, and is a building block for constructing a
+// grammar-based parser.
+//
+// FIXME: add grammar annotations.
+class Grammar {
+public:
+  ~Grammar() = default;
+  Grammar(Grammar &&) = default;
+  // Text representation of a grammar rule.
+  struct RuleSpec {
+    llvm::StringRef Target;
+    struct Element {
+      llvm::StringRef Symbol; // Name of the symbol
+    };
+    std::vector<Element> Sequence;
+
+    // Parses a BNF grammar (one rule per line) into a list of RuleSpecs.
+    static llvm::Expected<std::vector<RuleSpec>>
+    parseAll(llvm::StringRef Lines);
+
+    std::string dump() const;
+    // Exposed for testing.
+    static llvm::Expected<RuleSpec> parseRaw(llvm::StringRef Line);
+    // Exposed for testing.
+    // Inline all _opt symbols.
+    // For example, for a rule E := id +_opt id, elimination produces two
+    // equivalent rules:
+    //   1) E := id + id
+    //   2) E := id id
+    static std::vector<RuleSpec>
+    eliminateOptional(const std::vector<RuleSpec> &);
+  };
+  static Grammar build(llvm::ArrayRef<RuleSpec>);
+
+  // Looks up a non-terminal symbol by name.
+  SymbolID lookupSymbol(llvm::StringRef Name) const;
+  // Returns all rules of the given non-terminal symbol.
+  llvm::ArrayRef<Rule> lookupRules(SymbolID SID) const;
+  const Rule &lookupRule(RuleID RID) const;
+
+  // Dump methods for debugging.
+  // Dump the whole grammar.
+  std::string dump() const;
+  // Dump symbol (terminal or non-terminal) name.
+  llvm::StringRef dumpSymbol(SymbolID) const;
+  std::string dumpRule(RuleID) const;
+  std::string dumpRuleRecursively(RuleID) const;
+  // Dump all rules of the given nonterminal symbol.
+  std::string dumpRules(SymbolID) const;
+  std::string dumpRulesRecursively(SymbolID) const;
+
+  // Diagnoses the grammar and returns warnings, if any.
+  std::vector<std::string> diagnose() const;
+
+  const Table &table() const { return *T; }
+
+private:
+  explicit Grammar(std::unique_ptr<Table>);
+  std::unique_ptr<Table> T;
+};
+
+// Underlying storage of the Grammar.
+struct Table {
+  // The rules are sorted (and thus grouped) by target symbol.
+  // RuleID is the index of the vector.
+  std::vector<Rule> Rules;
+
+  struct Nonterminal {
+    std::string Name;
+    bool Nullable = false;
+    std::pair<RuleID, RuleID> RuleRange;
+  };
+  // A table of nonterminals, sorted by name.
+  // SymbolID is the index of the table.
+  std::vector<Nonterminal> Nonterminals;
+
+  bool isRule(RuleID RID) const { return RID < Rules.size(); }
+  bool isSymbol(SymbolID SID) const {
+    return isToken(SID) ? ((SID & ~TokenFlag) < tok::NUM_TOKENS)
+                        : SID < Nonterminals.size();
+  };
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
diff --git a/clang/lib/Tooling/Syntax/CMakeLists.txt b/clang/lib/Tooling/Syntax/CMakeLists.txt
--- a/clang/lib/Tooling/Syntax/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -19,3 +19,5 @@
   DEPENDS
   omp_gen
   )
+
+add_subdirectory(Pseudo)
diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangToolingPseudo
+  Grammar.cpp
+
+  LINK_LIBS
+  clangBasic
+  clangLex
+  )
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -0,0 +1,433 @@
+//===--- Grammar.cpp - Grammar for clang pseudo parser ----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+#include <queue>
+#include <string>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+namespace {
+static constexpr int NumTerminals = tok::NUM_TOKENS;
+static std::string *TerminalNames = nullptr;
+static const llvm::StringRef OptSuffix = "_opt";
+static const llvm::StringRef StartSymbol = "_";
+
+int initTerminalNames() {
+  assert(!TerminalNames && "TerminalNames must not be initialized");
+  TerminalNames = new std::string[NumTerminals];
+  for (unsigned I = 0; I < NumTerminals; ++I) {
+    tok::TokenKind K = static_cast<tok::TokenKind>(I);
+    if (const auto *Punc = tok::getPunctuatorSpelling(K))
+      TerminalNames[I] = Punc;
+    else
+      TerminalNames[I] = llvm::StringRef(tok::getTokenName(K)).upper();
+  }
+  return 0;
+}
+
+void computeNullability(Table &T) {
+  // A nonterminal is nullable if some rule producing it is nullable.
+  // A rule is nullable if every symbol on the RHS is a nullable nonterminal.
+  // (We only get to anything nullable at all if some rule has an empty RHS).
+
+  // Logically we have two maps:
+  //   rule_deps: map<RuleID, set<SymbolID>>
+  //     For each rule, the set of symbols such that if they *all*
+  //     become nullable, the rule is nullable.
+  //   symbol_rdeps: map<SymbolID, set<RuleID>>
+  //     For each symbol, the set of rules such that if *any* becomes
+  //     nullable, the symbol is nullable.
+  // In practice, our algorithm deals with propagating updates from the RHS
+  // to the LHS, so the maps actually go in the other direction.
+
+  llvm::DenseSet<RuleID> NullableRules;
+  // Maps rule -> rule_deps[rule].size. Rule becomes nullable if it is zero.
+  llvm::DenseMap<RuleID, unsigned> RuleDepCount;
+  // Maps rule_deps[rule] -> rule. When a symbol goes nullable, check these.
+  llvm::DenseMap<SymbolID, std::vector<RuleID>> SymbolRdeps;
+
+  // Avoid deep recursion by deferring analysis of rules with a worklist.
+  std::vector<RuleID> PendingNullableRules;
+
+  // Initialize dependency maps.
+  for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
+    const Rule &R = T.Rules[RID];
+    if (R.size() == 0) {
+      PendingNullableRules.push_back(RID);
+      continue;
+    }
+
+    // Only track rules that are potentially nullable - no terminals.
+    if (llvm::any_of(R.seq(), [](SymbolID SID) { return isToken(SID); }))
+      continue;
+
+    unsigned &DepCount = RuleDepCount[RID];
+    for (SymbolID SID : R.seq()) {
+      auto &Rdep = SymbolRdeps[SID];
+      if (!Rdep.empty() && Rdep.back() == RID)
+        continue; // repeated dep only counted once!
+      Rdep.push_back(RID);
+      ++DepCount;
+    }
+  }
+
+  while (!PendingNullableRules.empty()) {
+    // Check if we already knew the rule was nullable, and mark it.
+    RuleID RID = PendingNullableRules.back();
+    PendingNullableRules.pop_back();
+    if (!NullableRules.insert(RID).second)
+      continue;
+
+    // This makes its symbol nullable, check and mark it too.
+    SymbolID SID = T.Rules[RID].target();
+    if (T.Nonterminals[SID].Nullable)
+      continue;
+    T.Nonterminals[SID].Nullable = true;
+
+    // This may in turn contribute to the nullability of rdep rules.
+    // If this was the last needed symbol, the rule is now nullable.
+    if (SymbolRdeps.count(SID))
+      for (RuleID RdepRule : SymbolRdeps[SID]) {
+        assert(RuleDepCount[RdepRule] > 0);
+        if (--RuleDepCount[RdepRule] == 0)
+          PendingNullableRules.push_back(RdepRule);
+      }
+  }
+}
+
+void eliminateOptionalTail(llvm::ArrayRef<Grammar::RuleSpec::Element> Elements,
+                           std::vector<Grammar::RuleSpec::Element> &Result,
+                           llvm::function_ref<void()> CB) {
+  if (Elements.empty()) {
+    CB();
+    return;
+  }
+  auto Front = Elements.front();
+  if (!Front.Symbol.endswith(OptSuffix)) {
+    Result.push_back(std::move(Front));
+    eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+    Result.pop_back();
+    return;
+  }
+  // Enumerate two options: skip the opt symbol, or inline the symbol.
+  eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
+  Front.Symbol = Front.Symbol.drop_back(OptSuffix.size());   // drop "_opt"
+  Result.push_back(std::move(Front));
+  eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+  Result.pop_back();
+}
+
+} // namespace
+
+Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
+    : Data({Target, static_cast<uint8_t>(Sequence.size())}) {
+  assert(Sequence.size() <= Rule::MaxElements);
+  llvm::copy(Sequence, this->Sequence);
+}
+
+std::string Grammar::RuleSpec::dump() const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  OS << this->Target << " :=";
+  for (const auto &E : this->Sequence) {
+    OS << " " << E.Symbol;
+  }
+  return OS.str();
+}
+
+llvm::Expected<std::vector<Grammar::RuleSpec>>
+Grammar::RuleSpec::parseAll(llvm::StringRef Lines) {
+  std::vector<RuleSpec> Specs;
+  for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
+    Line = Line.trim();
+    // A line starting with # is a comment.
+    if (Line.empty() || Line.startswith("#"))
+      continue;
+    auto Spec = pseudo::Grammar::RuleSpec::parseRaw(Line);
+    if (!Spec)
+      return Spec.takeError();
+
+    Specs.push_back(std::move(*Spec));
+  }
+  return eliminateOptional(Specs);
+}
+
+llvm::Expected<Grammar::RuleSpec>
+Grammar::RuleSpec::parseRaw(llvm::StringRef Line) {
+  auto Parts = Line.split(":=");
+  if (Parts.first == Line) // no separator in Line
+    return llvm::make_error<llvm::StringError>(
+        llvm::errc::invalid_argument,
+        llvm::formatv("Failed to parse rule {0}\n", Line).str());
+  RuleSpec Result;
+  Result.Target = Parts.first.trim();
+  for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
+    Chunk = Chunk.trim();
+    if (Chunk.empty())
+      continue; // skip empty
+    if (Chunk.startswith("#"))
+      break; // comment, skip anything coming after #
+
+    Result.Sequence.emplace_back();
+    Result.Sequence.back().Symbol = Chunk;
+  }
+  return Result;
+}
+
+std::vector<Grammar::RuleSpec>
+Grammar::RuleSpec::eliminateOptional(const std::vector<RuleSpec> &Input) {
+  std::vector<RuleSpec> Results;
+  std::vector<RuleSpec::Element> Storage;
+  for (const auto &R : Input) {
+    eliminateOptionalTail(R.Sequence, Storage, [&Results, &Storage, &R]() {
+      Results.emplace_back();
+      Results.back().Target = R.Target;
+      Results.back().Sequence = Storage;
+    });
+  }
+  return Results;
+}
+
+Grammar::Grammar(std::unique_ptr<Table> T) : T(std::move(T)) {}
+
+Grammar Grammar::build(llvm::ArrayRef<RuleSpec> Specs) {
+  assert(llvm::all_of(Specs,
+                      [](const RuleSpec &R) {
+                        if (R.Target.endswith(OptSuffix))
+                          return false;
+                        return llvm::all_of(
+                            R.Sequence,
+                            [](const Grammar::RuleSpec::Element &E) {
+                              return !E.Symbol.endswith(OptSuffix);
+                            });
+                      }) &&
+         "unexpected optional symbols!");
+  static int Dummy = initTerminalNames();
+  (void)Dummy;
+
+  auto T = std::make_unique<Table>();
+
+  // Assemble the name->ID and ID->(nonterminal) name maps.
+  llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
+  llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+  for (unsigned I = 0; I < NumTerminals; ++I)
+    SymbolIds.try_emplace(TerminalNames[I], tokenSymbol(tok::TokenKind(I)));
+  auto Consider = [&](llvm::StringRef Name) {
+    if (!SymbolIds.count(Name))
+      UniqueNonterminals.insert(Name);
+  };
+  for (const auto &Spec : Specs) {
+    Consider(Spec.Target);
+    for (const RuleSpec::Element &Elt : Spec.Sequence)
+      Consider(Elt.Symbol);
+  }
+  llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
+    T->Nonterminals.emplace_back();
+    T->Nonterminals.back().Name = Name.str();
+  });
+  assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
+         "Too many nonterminals to fit in SymbolID bits!");
+  llvm::sort(T->Nonterminals,
+             [](const Table::Nonterminal &L, const Table::Nonterminal &R) {
+               return L.Name < R.Name;
+             });
+  // Build name -> ID maps for nonterminals.
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+    SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
+
+  // Convert the rules.
+  T->Rules.reserve(Specs.size());
+  std::vector<SymbolID> Symbols;
+  auto Lookup = [SymbolIds](llvm::StringRef Name) {
+    auto It = SymbolIds.find(Name);
+    assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
+    return It->second;
+  };
+  for (const auto &Spec : Specs) {
+    assert(Spec.Sequence.size() < Rule::MaxElements);
+    Symbols.clear();
+    for (const RuleSpec::Element &Elt : Spec.Sequence)
+      Symbols.push_back(Lookup(Elt.Symbol));
+    T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
+  }
+  assert(T->Rules.size() < (1 << RuleBits) &&
+         "Too many rules to fit in RuleID bits!");
+  llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
+    return std::forward_as_tuple(Left.target(), Left.size(), Left.Sequence) <
+           std::forward_as_tuple(Right.target(), Right.size(), Right.Sequence);
+  });
+  RuleID RulePos = 0;
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
+    RuleID Start = RulePos;
+    while (RulePos < T->Rules.size() && T->Rules[RulePos].target() == SID)
+      ++RulePos;
+    T->Nonterminals[SID].RuleRange = {Start, RulePos};
+  }
+
+  computeNullability(*T);
+
+  return Grammar(std::move(T));
+}
+
+SymbolID Grammar::lookupSymbol(llvm::StringRef Name) const {
+  auto Find = llvm::partition_point(
+      T->Nonterminals,
+      [Name](const Table::Nonterminal &T) { return T.Name < Name; });
+  if (Find != T->Nonterminals.end() && Find->Name == Name)
+    return Find - T->Nonterminals.begin();
+  for (unsigned I = 0; I < NumTerminals; ++I) {
+    if (TerminalNames[I] == Name)
+      return tokenSymbol(static_cast<tok::TokenKind>(I));
+  }
+  assert(false);
+}
+
+llvm::ArrayRef<Rule> Grammar::lookupRules(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  auto R = T->Nonterminals[SID].RuleRange;
+  assert(R.second <= T->Rules.size());
+  return llvm::makeArrayRef(&T->Rules[R.first], R.second - R.first);
+}
+
+const Rule &Grammar::lookupRule(RuleID RID) const {
+  assert(RID < T->Rules.size());
+  return T->Rules[RID];
+}
+
+llvm::StringRef Grammar::dumpSymbol(SymbolID SID) const {
+  assert(T->isSymbol(SID));
+  if (isToken(SID))
+    return TerminalNames[symbolToToken(SID)];
+  return T->Nonterminals[SID].Name;
+}
+
+std::string Grammar::dumpRule(RuleID RID) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  const Rule &R = T->Rules[RID];
+  OS << dumpSymbol(R.target()) << " :=";
+  for (SymbolID SID : R.seq()) {
+    OS << " " << dumpSymbol(SID);
+  }
+  return Result;
+}
+
+std::string Grammar::dumpRules(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  std::string Result;
+  const auto &Range = T->Nonterminals[SID].RuleRange;
+  for (RuleID RID = Range.first; RID < Range.second; ++RID)
+    Result.append(dumpRule(RID)).push_back('\n');
+  return Result;
+}
+
+static std::string recursiveDump(const Grammar &G, const Table &T,
+                                 llvm::ArrayRef<RuleID> Seed) {
+  std::string Result;
+  llvm::DenseSet<RuleID> SeenRules;
+  llvm::DenseSet<SymbolID> SeenSymbols;
+  std::queue<SymbolID> Queue;
+  auto Visit = [&](RuleID RID) {
+    assert(G.table().isRule(RID));
+    if (!SeenRules.insert(RID).second)
+      return;
+    Result.append(G.dumpRule(RID)).push_back('\n');
+    for (SymbolID SID : T.Rules[RID].seq())
+      if (isNonterminal(SID) && !SeenSymbols.insert(SID).second)
+        Queue.push(SID);
+  };
+  llvm::for_each(Seed, Visit);
+  while (!Queue.empty()) {
+    assert(G.table().isSymbol(Queue.front()));
+    auto R = T.Nonterminals[Queue.front()].RuleRange;
+    for (RuleID I = R.first; I < R.second; ++I)
+      Visit(I);
+    Queue.pop();
+  }
+  return Result;
+}
+
+std::string Grammar::dumpRuleRecursively(RuleID RID) const {
+  return recursiveDump(*this, *T, llvm::makeArrayRef(&RID, 1));
+}
+
+std::string Grammar::dumpRulesRecursively(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  auto Range = T->Nonterminals[SID].RuleRange;
+  std::vector<RuleID> Seed;
+  for (RuleID RID = Range.first; RID < Range.second; ++RID)
+    Seed.push_back(RID);
+  return recursiveDump(*this, *T, Seed);
+}
+
+std::vector<std::string> Grammar::diagnose() const {
+  std::vector<std::string> Result;
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
+    auto Range = T->Nonterminals[SID].RuleRange;
+    if (Range.first == Range.second)
+      Result.push_back(
+          llvm::formatv("No rules for nonterminal: {0}", dumpSymbol(SID)));
+    llvm::StringRef NameRef = T->Nonterminals[SID].Name;
+    if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
+      Result.push_back(llvm::formatv(
+          "Token-like name {0} is used as a nonterminal", dumpSymbol(SID)));
+    }
+  }
+  for (RuleID RID = 0; RID + 1 < T->Rules.size(); ++RID) {
+    if (T->Rules[RID] == T->Rules[RID + 1])
+      Result.push_back(llvm::formatv("Duplicate rule: {0}", dumpRule(RID)));
+    // Warning for nullable non-terminals (except for the start symbol _).
+    if (T->Rules[RID].size() == 0 &&
+        lookupSymbol(StartSymbol) != T->Rules[RID].target())
+      Result.push_back(llvm::formatv("Nullable symbol: {0}",
+                                     dumpSymbol(T->Rules[RID].target())));
+  }
+  // symbol-id -> used counts
+  std::vector<unsigned> UseCounts(T->Nonterminals.size(), 0);
+  for (const Rule &R : T->Rules)
+    for (SymbolID SID : R.seq())
+      if (isNonterminal(SID))
+        ++UseCounts[SID];
+  for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
+    if (UseCounts[SID] == 0 && T->Nonterminals[SID].Name != StartSymbol)
+      Result.push_back(
+          llvm::formatv("Nonterminal never used: {0}", dumpSymbol(SID)));
+  return Result;
+}
+
+std::string Grammar::dump() const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  OS << "Nonterminals:\n";
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+    OS << llvm::formatv(" {0} {1}{2}\n", SID, dumpSymbol(SID),
+                        T->Nonterminals[SID].Nullable ? " NULLABLE" : "");
" NULLABLE" : ""); + OS << "Rules:\n"; + for (RuleID RID = 0; RID < T->Rules.size(); ++RID) + OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID)); + return OS.str(); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt b/clang/unittests/Tooling/Syntax/CMakeLists.txt --- a/clang/unittests/Tooling/Syntax/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt @@ -28,3 +28,5 @@ PRIVATE LLVMTestingSupport ) + +add_subdirectory(Pseudo) diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + +add_clang_unittest(PseudoParserTests + GrammarTest.cpp +) + +clang_target_link_libraries(PseudoParserTests + PRIVATE + clangBasic + clangLex + clangToolingPseudo + clangTesting + ) + +target_link_libraries(PseudoParserTests + PRIVATE + LLVMTestingSupport +) diff --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp new file mode 100644 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp @@ -0,0 +1,114 @@ +//===--- GrammarTest.cpp - grammar tests ------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +using testing::AllOf; +using testing::ElementsAre; +using testing::Field; +using testing::HasSubstr; +using testing::Not; +using testing::UnorderedElementsAre; + +MATCHER_P(SName, Name, "") { return arg.Symbol == Name; } +MATCHER_P(Target, Name, "") { return arg.Target == Name; } +template +testing::Matcher withSequence(T... Name) { + return Field(&Grammar::RuleSpec::Sequence, ElementsAre(SName(Name)...)); +} + +MATCHER_P(Name, N, "") { return arg.Name == N; } +MATCHER(Null, "") { return arg.Nullable; } + +MATCHER_P(TargetID, SID, "") { return arg.target() == SID; } +template +testing::Matcher withSequenceID(T... 
+  return testing::Property(&Rule::seq, ElementsAre(IDs...));
+}
+SymbolID symbolID(llvm::StringRef Name, const Grammar &G) {
+  return G.lookupSymbol(Name);
+}
+
+TEST(GrammarTest, Basic) {
+  auto RS = cantFail(Grammar::RuleSpec::parseRaw(
+      "expression := IDENTIFIER + expression # comment"));
+  EXPECT_THAT(RS, AllOf(Target("expression"),
+                        withSequence("IDENTIFIER", "+", "expression")));
+
+  auto Error = Grammar::RuleSpec::parseRaw("abc");
+  EXPECT_FALSE(Error);
+  EXPECT_THAT(llvm::toString(Error.takeError()), testing::HasSubstr("Failed"));
+}
+
+TEST(GrammarTest, EliminatedOptional) {
+  auto RS = cantFail(Grammar::RuleSpec::parseRaw("list := IDENTIFIER ,_opt"));
+  EXPECT_THAT(RS, AllOf(Target("list"), withSequence("IDENTIFIER", ",_opt")));
+
+  const auto &InlinedRules = Grammar::RuleSpec::eliminateOptional({RS});
+  EXPECT_THAT(InlinedRules,
+              UnorderedElementsAre(
+                  AllOf(Target("list"), withSequence("IDENTIFIER", ",")),
+                  AllOf(Target("list"), withSequence("IDENTIFIER"))));
+
+  const auto &G = Grammar::build(InlinedRules);
+  EXPECT_THAT(
+      G.lookupRules(G.lookupSymbol("list")),
+      UnorderedElementsAre(
+          AllOf(TargetID(symbolID("list", G)),
+                withSequenceID(symbolID("IDENTIFIER", G), symbolID(",", G))),
+          AllOf(TargetID(symbolID("list", G)),
+                withSequenceID(symbolID("IDENTIFIER", G)))));
+}
+
+TEST(GrammarTest, Nullable) {
+  llvm::StringRef Text = R"(
+    # This is a comment.
+    null1 := +_opt
+    null2 := null1
+    null3 := null1 +_opt
+    null4 :=
+    null4 := +
+    non-null := +
+  )";
+  const auto RSpecs = cantFail(Grammar::RuleSpec::parseAll(Text));
+
+  const auto &G = Grammar::build(RSpecs);
+  EXPECT_THAT(G.table().Nonterminals,
+              UnorderedElementsAre(
+                  AllOf(Name("null1"), Null()), AllOf(Name("null2"), Null()),
+                  AllOf(Name("null3"), Null()), AllOf(Name("null4"), Null()),
+                  AllOf(Name("non-null"), Not(Null()))));
+}
+
+TEST(GrammarTest, Diagnose) {
+  llvm::StringRef Text = R"(
+    _ := undefined
+    unused := IDENTIFIER
+
+    _ := IDENFIFIE # a typo of the terminal IDENTIFIER
+  )";
+  const auto RSpecs = cantFail(Grammar::RuleSpec::parseAll(Text));
+  EXPECT_THAT(
+      Grammar::build(RSpecs).diagnose(),
+      UnorderedElementsAre(HasSubstr("No rules for nonterminal: undefined"),
+                           HasSubstr("never used"), HasSubstr("Token-like"),
+                           HasSubstr("No rules for nonterminal: IDENFIFIE")));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
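
For reference, and not part of the patch itself: a minimal sketch of how the API declared in Grammar.h is intended to fit together, based only on the declarations above. The function name demo and the sample BNF text are hypothetical placeholders.

// Hypothetical usage: parse BNF text, build a Grammar, print diagnostics.
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
#include "llvm/Support/raw_ostream.h"

void demo() {
  using namespace clang::syntax::pseudo;
  llvm::StringRef Text = R"bnf(
    _ := expression
    expression := IDENTIFIER
    expression := expression + IDENTIFIER
  )bnf";
  // parseAll splits lines, strips # comments, and eliminates _opt symbols.
  auto Specs = Grammar::RuleSpec::parseAll(Text);
  if (!Specs) {
    llvm::errs() << llvm::toString(Specs.takeError());
    return;
  }
  Grammar G = Grammar::build(*Specs);
  for (const std::string &Warning : G.diagnose())
    llvm::errs() << Warning << "\n";
  llvm::outs() << G.dump(); // "Nonterminals:" and "Rules:" sections
}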