diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h b/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h @@ -0,0 +1,64 @@ +//===--- Disambiguate.h - Find the best tree in the forest -------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A GLR parse forest represents every possible parse tree for the source code. +// +// Before we can do useful analysis/editing of the code, we need to pick a +// single tree which we think is accurate. We use three main types of clues: +// +// A) Semantic language rules may restrict which parses are allowed. +// For example, `string string string X` is *grammatical* C++, but only a +// single type-name is allowed in a decl-specifier-sequence. +// Where possible, these interpretations are forbidden by guards. +// Sometimes this isn't possible, or we want our parser to be lenient. +// +// B) Some constructs are rarer, while others are common. +// For example `a::c` is often a template specialization, and rarely a +// double comparison between a, b, and c. +// +// C) Identifier text hints whether they name types/values/templates etc. +// "std" is usually a namespace, a project index may also guide us. +// Hints may be within the document: if one occurrence of 'foo' is a variable +// then the others probably are too. +// (Text need not match: similar CaseStyle can be a weak hint, too). +// +//----------------------------------------------------------------------------// +// +// Mechanically, we replace each ambiguous node with its best alternative. +// +// "Best" is determined by assigning bonuses/penalties to nodes, to express +// the clues of type A and B above. A forest node representing an unlikely +// parse would apply a penalty to every subtree is is present in. +// Disambiguation proceeds bottom-up, so that the score of each alternative +// is known when a decision is made. +// +// Identifier-based hints within the document mean some nodes should be +// *correlated*. Rather than resolve these simultaneously, we make the most +// certain decisions first and use these results to update bonuses elsewhere. +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Forest.h" + +namespace clang::pseudo { + +struct DisambiguateParams {}; + +// Maps ambiguous nodes onto the index of their preferred alternative. +using Disambiguation = llvm::DenseMap; + +// Resolve each ambiguous node in the forest. +// Maps each ambiguous node to the index of the chosen alternative. +// FIXME: current implementation is a placeholder and chooses arbitrarily. +Disambiguation disambiguate(const ForestNode *Root, + const DisambiguateParams &Params); + +// Remove all ambiguities from the forest, resolving them according to Disambig. +void removeAmbiguities(ForestNode *&Root, const Disambiguation &Disambig); + +} // namespace clang::pseudo diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h @@ -80,6 +80,10 @@ assert(kind() == Sequence); return children(Data >> RuleBits); } + llvm::MutableArrayRef elements() { + assert(kind() == Sequence); + return children(Data >> RuleBits); + } // Returns all possible interpretations of the code. // REQUIRES: this is an Ambiguous node. @@ -87,6 +91,10 @@ assert(kind() == Ambiguous); return children(Data); } + llvm::MutableArrayRef alternatives() { + assert(kind() == Ambiguous); + return children(Data); + } llvm::ArrayRef children() const { switch (kind()) { @@ -134,6 +142,10 @@ return llvm::makeArrayRef(reinterpret_cast(this + 1), Num); } + llvm::MutableArrayRef children(uint16_t Num) { + return llvm::makeMutableArrayRef(reinterpret_cast(this + 1), + Num); + } Token::Index StartIndex; Kind K : 4; diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h @@ -130,8 +130,8 @@ // A rule `_ := StartSymbol` must exit for the chosen start symbol. // // If the parsing fails, we model it as an opaque node in the forest. -const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, - const Language &Lang); +ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, + const Language &Lang); // Shift a token onto all OldHeads, placing the results into NewHeads. // diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -7,6 +7,7 @@ add_clang_library(clangPseudo Bracket.cpp DirectiveTree.cpp + Disambiguate.cpp Forest.cpp GLR.cpp Lex.cpp diff --git a/clang-tools-extra/pseudo/lib/Disambiguate.cpp b/clang-tools-extra/pseudo/lib/Disambiguate.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/lib/Disambiguate.cpp @@ -0,0 +1,48 @@ +//===--- Disambiguate.cpp - Find the best tree in the forest --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Disambiguate.h" + +namespace clang::pseudo { + +Disambiguation disambiguate(const ForestNode *Root, + const DisambiguateParams &Params) { + // FIXME: this is a dummy placeholder strategy, implement a real one! + Disambiguation Result; + for (const ForestNode &N : Root->descendants()) { + if (N.kind() == ForestNode::Ambiguous) + Result.try_emplace(&N, 1); + } + return Result; +} + +void removeAmbiguities(ForestNode *&Root, const Disambiguation &D) { + std::vector Queue = {&Root}; + while (!Queue.empty()) { + ForestNode **Next = Queue.back(); + Queue.pop_back(); + switch ((*Next)->kind()) { + case ForestNode::Sequence: + for (ForestNode *&Child : (*Next)->elements()) + Queue.push_back(&Child); + break; + case ForestNode::Ambiguous: { + assert(D.count(*Next) != 0 && "disambiguation is incomplete!"); + ForestNode *ChosenChild = (*Next)->alternatives()[D.lookup(*Next)]; + *Next = ChosenChild; + Queue.push_back(Next); + break; + } + case ForestNode::Terminal: + case ForestNode::Opaque: + break; + } + } +} + +} // namespace clang::pseudo diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp --- a/clang-tools-extra/pseudo/lib/GLR.cpp +++ b/clang-tools-extra/pseudo/lib/GLR.cpp @@ -599,8 +599,8 @@ } // namespace -const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, - const Language &Lang) { +ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, + const Language &Lang) { GLRReduce Reduce(Params, Lang); assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal"); llvm::ArrayRef Terminals = Params.Forest.createTerminals(Params.Code); @@ -687,7 +687,7 @@ return Result; }; if (auto *Result = SearchForAccept(Heads)) - return *Result; + return *const_cast(Result); // Safe: we created all nodes. // We failed to parse the input, returning an opaque forest node for recovery. // FIXME: as above, we can add fallback error handling so this is impossible. return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0); diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -8,6 +8,7 @@ #include "clang-pseudo/Bracket.h" #include "clang-pseudo/DirectiveTree.h" +#include "clang-pseudo/Disambiguate.h" #include "clang-pseudo/Forest.h" #include "clang-pseudo/GLR.h" #include "clang-pseudo/Language.h" @@ -45,6 +46,8 @@ static opt StripDirectives("strip-directives", desc("Strip directives and select conditional sections")); +static opt Disambiguate("disambiguate", + desc("Choose best tree from parse forest")); static opt PrintStatistics("print-statistics", desc("Print GLR parser statistics")); static opt PrintForest("print-forest", desc("Print parse forest")); static opt ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"), @@ -70,7 +73,8 @@ namespace pseudo { // Defined in HTMLForest.cpp void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &, - const ForestNode &Root, const TokenStream &); + const ForestNode &Root, const Disambiguation &, + const TokenStream &); namespace { struct NodeStats { @@ -158,6 +162,9 @@ *StartSymID, Lang); if (PrintForest) llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev); + clang::pseudo::Disambiguation Disambig; + if (Disambiguate) + Disambig = clang::pseudo::disambiguate(&Root, {}); if (HTMLForest.getNumOccurrences()) { std::error_code EC; @@ -167,7 +174,8 @@ << "\n"; return 2; } - clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, *ParseableStream); + clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig, + *ParseableStream); } if (PrintStatistics) { @@ -219,6 +227,14 @@ llvm::outs() << llvm::formatv("Unparsed: {0}%\n", 100.0 * UnparsedTokens / Len); } + + if (Disambiguate && PrintForest) { + ForestNode *DisambigRoot = &Root; + removeAmbiguities(DisambigRoot, Disambig); + llvm::outs() << "Disambiguated tree:\n"; + llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, + /*Abbreviated=*/ForestAbbrev); + } } return 0; diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.cpp b/clang-tools-extra/pseudo/tool/HTMLForest.cpp --- a/clang-tools-extra/pseudo/tool/HTMLForest.cpp +++ b/clang-tools-extra/pseudo/tool/HTMLForest.cpp @@ -43,6 +43,7 @@ // //===----------------------------------------------------------------------===// +#include "clang-pseudo/Disambiguate.h" #include "clang-pseudo/Forest.h" #include "clang-pseudo/grammar/Grammar.h" #include "llvm/ADT/StringExtras.h" @@ -60,6 +61,7 @@ const Grammar &G; const ForestNode &Root; const TokenStream &Stream; + const Disambiguation &Disambig; void write() { Out << "\n"; @@ -150,7 +152,8 @@ break; case ForestNode::Ambiguous: Out.attribute("kind", "ambiguous"); - Out.attribute("selected", AssignID(N->children().front(), End)); + Out.attribute("selected", + AssignID(N->children()[Disambig.lookup(N)], End)); break; case ForestNode::Opaque: Out.attribute("kind", "opaque"); @@ -180,8 +183,9 @@ // We only accept the derived stream here. // FIXME: allow the original stream instead? void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &G, - const ForestNode &Root, const TokenStream &Stream) { - Writer{OS, G, Root, Stream}.write(); + const ForestNode &Root, const Disambiguation &Disambig, + const TokenStream &Stream) { + Writer{OS, G, Root, Stream, Disambig}.write(); } } // namespace pseudo diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt --- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt +++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt @@ -7,6 +7,7 @@ BracketTest.cpp CXXTest.cpp DirectiveTreeTest.cpp + DisambiguateTest.cpp ForestTest.cpp GLRTest.cpp GrammarTest.cpp diff --git a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp b/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp @@ -0,0 +1,111 @@ +//===--- DisambiguateTest.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang-pseudo/Disambiguate.h" +#include "clang-pseudo/Forest.h" +#include "clang-pseudo/Token.h" +#include "clang/Basic/TokenKinds.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace pseudo { +namespace { +using testing::ElementsAre; +using testing::Pair; +using testing::UnorderedElementsAre; + +// Common disambiguation test fixture. +// This is the ambiguous forest representing parses of 'a * b;'. +class DisambiguateTest : public ::testing::Test { +protected: + // Greatly simplified C++ grammar. + enum Symbol : SymbolID { + Statement, + Declarator, + Expression, + DeclSpecifier, + Type, + Template, + }; + enum Rule : RuleID { + /* LHS__RHS1_RHS2 means LHS := RHS1 RHS2 */ + Statement__DeclSpecifier_Declarator_Semi, + Declarator__Star_Declarator, + Declarator__Identifier, + Statement__Expression_Semi, + Expression__Expression_Star_Expression, + Expression__Identifier, + DeclSpecifier__Type, + DeclSpecifier__Template, + Type__Identifier, + Template__Identifier, + }; + + ForestArena Arena; + ForestNode &A = Arena.createTerminal(tok::identifier, 0); + ForestNode &Star = Arena.createTerminal(tok::star, 1); + ForestNode &B = Arena.createTerminal(tok::identifier, 2); + ForestNode &Semi = Arena.createTerminal(tok::semi, 3); + + // Parse as multiplication expression. + ForestNode &AExpr = + Arena.createSequence(Expression, Expression__Identifier, &A); + ForestNode &BExpr = + Arena.createSequence(Expression, Expression__Identifier, &B); + ForestNode &Expr = + Arena.createSequence(Expression, Expression__Expression_Star_Expression, + {&AExpr, &Star, &BExpr}); + ForestNode &ExprStmt = Arena.createSequence( + Statement, Statement__Expression_Semi, {&Expr, &Semi}); + // Parse as declaration (`a` may be CTAD or not). + ForestNode &AType = + Arena.createSequence(DeclSpecifier, DeclSpecifier__Type, + &Arena.createSequence(Type, Type__Identifier, &A)); + ForestNode &ATemplate = Arena.createSequence( + DeclSpecifier, DeclSpecifier__Template, + &Arena.createSequence(Template, Template__Identifier, &A)); + ForestNode &DeclSpec = + Arena.createAmbiguous(DeclSpecifier, {&AType, &ATemplate}); + ForestNode &BDeclarator = + Arena.createSequence(Declarator, Declarator__Identifier, &B); + ForestNode &BPtr = Arena.createSequence( + Declarator, Declarator__Star_Declarator, {&Star, &BDeclarator}); + ForestNode &DeclStmt = + Arena.createSequence(Statement, Statement__DeclSpecifier_Declarator_Semi, + {&DeclSpec, &Star, &BDeclarator}); + // Top-level ambiguity + ForestNode &Stmt = Arena.createAmbiguous(Statement, {&ExprStmt, &DeclStmt}); +}; + +TEST_F(DisambiguateTest, Remove) { + Disambiguation D; + D.try_emplace(&Stmt, 1); // statement is a declaration, not an expression + D.try_emplace(&DeclSpec, 0); // a is a type, not a (CTAD) template + ForestNode *Root = &Stmt; + removeAmbiguities(Root, D); + + EXPECT_EQ(Root, &DeclStmt); + EXPECT_THAT(DeclStmt.elements(), ElementsAre(&AType, &Star, &BDeclarator)); +} + +TEST_F(DisambiguateTest, DummyStrategy) { + Disambiguation D = disambiguate(&Stmt, {}); + EXPECT_THAT(D, UnorderedElementsAre(Pair(&Stmt, 1), Pair(&DeclSpec, 1))); + + ForestNode *Root = &Stmt; + removeAmbiguities(Root, D); + EXPECT_EQ(Root, &DeclStmt); + EXPECT_THAT(DeclStmt.elements(), + ElementsAre(&ATemplate, &Star, &BDeclarator)); +} + +} // namespace +} // namespace pseudo +} // namespace clang