Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(clang-query) add_subdirectory(pp-trace) add_subdirectory(tool-template) +add_subdirectory(clang-highlight) # Add the common testsuite after all the tools. # TODO: Support tests with more granularity when features are off? Index: clang-highlight/CMakeLists.txt =================================================================== --- /dev/null +++ clang-highlight/CMakeLists.txt @@ -0,0 +1,27 @@ +set(LLVM_LINK_COMPONENTS support) + +set(LLVM_USED_LIBS clangTooling) + +add_clang_executable(clang-highlight + ClangHighlight.cpp + TokenClassifier.cpp + OutputWriter.cpp + ) + +target_link_libraries(clang-highlight + clangAST + clangFuzzy + ) +include_directories("Fuzzy") + +install(TARGETS clang-highlight RUNTIME DESTINATION bin) + +add_custom_target(ClangHighlightUnitTests) +set_target_properties(ClangHighlightUnitTests PROPERTIES FOLDER "Clang Highlight Unit Tests") + +function(add_highlight_unittest test_dirname) + add_unittest(ClangHighlightUnitTests ${test_dirname} ${ARGN}) +endfunction() + +add_subdirectory(Fuzzy) +add_subdirectory(unittests) Index: clang-highlight/ClangHighlight.cpp =================================================================== --- /dev/null +++ clang-highlight/ClangHighlight.cpp @@ -0,0 +1,117 @@ +//===-- clang-highlight/ClangHighlight.cpp - Clang highlight tool ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file ClangHighlight.cpp +/// \brief This file implements a clang-highlight tool that automatically +/// highlights (fragments of) C++ code. 
+/// +//===----------------------------------------------------------------------===// +#include "llvm/Support/Signals.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/MemoryBuffer.h" +#include "clang/Basic/Version.h" +#include "OutputWriter.h" +#include "TokenClassifier.h" + +using namespace llvm; +using namespace clang::highlight; + +// Mark all our options with this category, everything else (except for -version +// and -help) will be hidden. +static cl::OptionCategory ClangHighlightCategory("Clang-highlight options"); +cl::OptionCategory &getClangHighlightCategory() { + return ClangHighlightCategory; +} + +static cl::opt IdentifiersOnly( + "identifiers-only", + cl::desc("Highlight identifiers only. E.g. don't highlight the '*' " + "in \"type *i;\""), + cl::cat(ClangHighlightCategory)); + +static cl::opt DumpAST("dump-ast", cl::desc("Print the fuzzy AST."), + cl::cat(ClangHighlightCategory)); + +static cl::opt OutputFormatFlag( + cl::desc("Output format for the highlighted code."), + cl::values(clEnumValN(OutputFormat::StdoutColored, "stdout", + "write colored stdout"), + clEnumValN(OutputFormat::HTML, "html", "write html"), + clEnumValN(OutputFormat::SemanticHTML, "shtml", + "write semantic html"), + clEnumValN(OutputFormat::LaTeX, "latex", "write latex"), + clEnumValEnd), + cl::cat(ClangHighlightCategory)); + +cl::opt OutputFilename("o", cl::desc("Write output to "), + cl::value_desc("file"), + cl::cat(ClangHighlightCategory)); + +static cl::opt FileName(cl::Positional, cl::desc(""), + cl::Required, + cl::cat(ClangHighlightCategory)); + +static void PrintVersion() { + raw_ostream &OS = llvm::outs(); + OS << clang::getClangToolFullVersion("clang-highlight") << '\n'; +} + +static bool parserHighlight(StringRef File, OutputFormat Format, + StringRef OutFile, bool IdentifiersOnly, + bool DumpAST) { + auto Source = llvm::MemoryBuffer::getFileOrSTDIN(File); + if 
(std::error_code err = Source.getError()) { + llvm::errs() << err.message() << '\n'; + return true; + } + + if (!OutFile.empty()) { + std::string ErrMsg; + raw_fd_ostream Out(std::string(OutFile).c_str(), ErrMsg, + llvm::sys::fs::F_Text); + if (!ErrMsg.empty()) { + llvm::errs() << ErrMsg << '\n'; + return true; + } + highlight(std::move(*Source), File, makeOutputWriter(Format, Out), + IdentifiersOnly, DumpAST); + } else { + highlight(std::move(*Source), File, makeOutputWriter(Format, llvm::outs()), + IdentifiersOnly, DumpAST); + } + return false; +} + +int main(int argc, const char **argv) { + llvm::sys::PrintStackTraceOnErrorSignal(); + + // Hide unrelated options. + StringMap Options; + cl::getRegisteredOptions(Options); + for (auto &Option : Options) + if (Option.second->Category != &ClangHighlightCategory && + Option.first() != "help" && Option.first() != "version") + Option.second->setHiddenFlag(cl::ReallyHidden); + + cl::SetVersionPrinter(PrintVersion); + cl::ParseCommandLineOptions( + argc, argv, "A tool to highlight C and C++ code.\n\n" + "If no arguments are specified, it highlights the code from " + "standard input\n" + "and writes the result to the standard output.\n"); + + bool Error = false; + + Error |= parserHighlight(FileName, OutputFormatFlag, OutputFilename, + IdentifiersOnly, DumpAST); + + return Error ? 1 : 0; +} Index: clang-highlight/Fuzzy/AnnotatedToken.h =================================================================== --- /dev/null +++ clang-highlight/Fuzzy/AnnotatedToken.h @@ -0,0 +1,92 @@ +//===--- AnnotatedToken.h - clang-highlight ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_ANNOTATED_TOKEN_H +#define LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_ANNOTATED_TOKEN_H + +#include "clang/Lex/Lexer.h" +#include + +namespace clang { +namespace fuzzy { + +class ASTElement; + +class AnnotatedToken { + Token Tok_; + ASTElement *Annot; + +public: + AnnotatedToken(Token Tok) : Tok_(Tok), Annot(nullptr) {} + + StringRef getText(const SourceManager &SourceMgr) const { + return StringRef(SourceMgr.getCharacterData(Tok().getLocation()), + Tok().getLength()); + } + + tok::TokenKind getTokenKind() const { return Tok().getKind(); } + + Token& Tok() { return Tok_; } + const Token& Tok() const { return Tok_; } + + void setASTReference(ASTElement *ASTReference) { Annot = ASTReference; } + const ASTElement *getASTReference() const { return Annot; } + ASTElement *getASTReference() { return Annot; } + bool hasASTReference() const { return Annot; } +}; + +class AnnotatedTokenRef { + AnnotatedToken *ATok; + +public: + AnnotatedTokenRef(AnnotatedToken *ATok, ASTElement *AstRef) : ATok(ATok) { + if (ATok) + ATok->setASTReference(AstRef); + } + AnnotatedTokenRef(nullptr_t = nullptr) : ATok(nullptr) {} + AnnotatedTokenRef(const AnnotatedTokenRef &) = default; + AnnotatedTokenRef(AnnotatedTokenRef &&) = default; + + AnnotatedTokenRef(AnnotatedTokenRef const &o, ASTElement *AstRef) + : ATok(o.ATok) { + if (ATok) + ATok->setASTReference(AstRef); + } + + AnnotatedTokenRef &operator=(const AnnotatedTokenRef &) = default; + AnnotatedTokenRef &operator=(AnnotatedTokenRef &&) = default; + + operator bool() const { return ATok; } + AnnotatedToken *get() { return ATok; } + AnnotatedToken *get() const { return ATok; } + + AnnotatedToken &getRef() { + assert(*this); + return *ATok; + } + const AnnotatedToken &getRef() const { + assert(*this); + return *ATok; + } + + AnnotatedToken &operator*() { return getRef(); } + const AnnotatedToken &operator*() const { 
return getRef(); } + AnnotatedToken *operator->() { return &getRef(); } + const AnnotatedToken *operator->() const { return &getRef(); } + + AnnotatedTokenRef &operator=(AnnotatedToken *ATok) { + this->ATok = ATok; + return *this; + } +}; + +} // end namespace fuzzy +} // end namespace clang + +#endif // LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_ANNOTATED_TOKEN_H Index: clang-highlight/Fuzzy/CMakeLists.txt =================================================================== --- /dev/null +++ clang-highlight/Fuzzy/CMakeLists.txt @@ -0,0 +1,13 @@ +set(LLVM_LINK_COMPONENTS support) + +clang_tablegen(FuzzyNodes.inc -gen-clang-stmt-nodes + SOURCE FuzzyNodes.td + TARGET FuzzyNodes) + +add_clang_library(clangFuzzy + FuzzyParser.cpp + FuzzyASTPrinter.cpp + + ADDITIONAL_HEADERS + FuzzyNodes.td + ) Index: clang-highlight/Fuzzy/FuzzyAST.h =================================================================== --- /dev/null +++ clang-highlight/Fuzzy/FuzzyAST.h @@ -0,0 +1,992 @@ +//===--- FuzzyAST.h - clang-highlight ---------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_FUZZY_AST_H +#define LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_FUZZY_AST_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/STLExtras.h" +#include "clang/Basic/SourceManager.h" +#include "AnnotatedToken.h" +#include + +namespace llvm { +class raw_ostream; +} + +namespace clang { +namespace fuzzy { + +/// ASTElement: Anything inside the AST that may be referenced by an +/// AnnotatedToken must be an ASTElement. This class is not strictly needed +/// from an AST point of view. 
+class ASTElement { +public: + virtual ~ASTElement() = default; // Not accessible + // TODO: TableGen + /// Discriminator stored in every node; returned by getASTClass() and tested + /// by the classof() functions of the node types. + enum ASTElementClass { + NoASTElementClass = 0, + UnparsableBlockClass, + TypeClass, + TemplateDeclClass, + TypeDecorationClass, + VarInitializationClass, + VarDeclClass, + ExprLineStmtClass, + ReturnStmtClass, + CompoundStmtClass, + DeclStmtClass, + firstExpr, // range marker: Expr::classof accepts [firstExpr, lastExpr] + DeclRefExprClass, + ParenExprClass, + LiteralConstantClass, + UnaryOperatorClass, + BinaryOperatorClass, + CallExprClass, + lastExpr, // range marker, see firstExpr + LabelStmtClass, + WhileStmtClass, + DoWhileStmtClass, + ForStmtClass, + IfStmtClass, + ClassDeclClass, + NamespaceDeclClass, + FunctionDeclClass, + TemplateParameterTypeClass, + PPStringClass, + firstPPDirective, // range marker: PPDirective::classof accepts + // [firstPPDirective, lastPPDirective] + PPIncludeClass, + PPIfClass, + UnparsablePPClass, + lastPPDirective, // range marker, see firstPPDirective + }; + + /// Returns the class tag passed to the constructor. + ASTElementClass getASTClass() const { return sClass; } + +protected: + ASTElement(ASTElementClass SC) : sClass(SC) {} + +private: + ASTElementClass sClass; +}; + +/// An expression in its classical sense. If an expression is used as a +/// statement, it has to be embedded into an ExprLineStmt. Rationale is that +/// there is otherwise no way to store the semicolon. 
+class Expr : public ASTElement { +protected: + Expr(ASTElementClass SC) : ASTElement(SC) {} + +public: + virtual ~Expr() = 0; + static bool classof(const ASTElement *T) { + return firstExpr <= T->getASTClass() && T->getASTClass() <= lastExpr; + } +}; +inline Expr::~Expr() {} + +class Type; + +class TypeOrExpression { + std::unique_ptr Ptr; + +public: + TypeOrExpression(std::unique_ptr T); + TypeOrExpression(std::unique_ptr E) : Ptr(std::move(E)) {} + TypeOrExpression(const TypeOrExpression &) = delete; + TypeOrExpression &operator=(const TypeOrExpression &) = delete; + TypeOrExpression(TypeOrExpression &&O) = default; + TypeOrExpression &operator=(TypeOrExpression &&O) = default; + + bool isType() const { + assert(Ptr); + return isa(Ptr.get()); + } + Type &asType() { return *cast(Ptr.get()); } + Expr &asExpr() { return *cast(Ptr.get()); } +}; + +struct QualifiedID { + struct TemplateArguments { + llvm::SmallVector Args; + llvm::SmallVector Separators; + }; + + llvm::SmallVector NameSegments; + llvm::Optional > TemplateArgs; + + void reown(ASTElement *Ref) { + for (auto &N : NameSegments) + N->setASTReference(Ref); + if (TemplateArgs) { + for (auto &ATok : (*TemplateArgs)->Separators) + ATok->setASTReference(Ref); + } + } + + void addNameQualifier(AnnotatedToken *NameTok, ASTElement *Ref) { + NameSegments.push_back(AnnotatedTokenRef(NameTok, Ref)); + } + + void makeTemplateArgs() { + TemplateArgs = std::make_shared(); + } + void addTemplateSeparator(AnnotatedToken *ATok, ASTElement *Ref) { + (*TemplateArgs)->Separators.push_back(AnnotatedTokenRef(ATok, Ref)); + } + void addTemplateArgument(std::unique_ptr T) { + (*TemplateArgs)->Args.push_back(TypeOrExpression(std::move(T))); + } + void addTemplateArgument(std::unique_ptr E) { + (*TemplateArgs)->Args.push_back(TypeOrExpression(std::move(E))); + } +}; + +// Parentheses over an expression +class ParenExpr : public Expr { + enum { + LEFT, + RIGHT, + END_EXPR + }; + AnnotatedTokenRef Parens[END_EXPR]; + +public: + 
std::unique_ptr Value; + + ParenExpr(AnnotatedToken *Left, std::unique_ptr Value, + AnnotatedToken *Right) + : Expr(ParenExprClass), Value(std::move(Value)) { + setLeftParen(Left); + setRightParen(Right); + } + + void setParen(int Index, AnnotatedToken *AT) { + Parens[Index] = AnnotatedTokenRef(AT, this); + } + void setLeftParen(AnnotatedToken *AT) { setParen(LEFT, AT); } + void setRightParen(AnnotatedToken *AT) { setParen(RIGHT, AT); } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == ParenExprClass; + } +}; + +// A variable name or function name inside an expression. +class DeclRefExpr : public Expr { +public: + QualifiedID Qualifier; + + DeclRefExpr() : Expr(DeclRefExprClass) {} + + void addNameQualifier(AnnotatedToken *NameTok) { + Qualifier.addNameQualifier(NameTok, this); + } + void makeTemplateArgs() { Qualifier.makeTemplateArgs(); } + void addTemplateSeparator(AnnotatedToken *ATok) { + Qualifier.addTemplateSeparator(ATok, this); + } + void addTemplateArgument(std::unique_ptr T) { + Qualifier.addTemplateArgument(std::move(T)); + } + void addTemplateArgument(std::unique_ptr E) { + Qualifier.addTemplateArgument(std::move(E)); + } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == DeclRefExprClass; + } +}; + +/// Int, char or string literals +class LiteralConstant : public Expr { +public: + AnnotatedTokenRef Tok; + /// The Tok(Tok, this) member initializer already points the token at this + /// node (and skips a null token), so no further annotation is needed in the + /// constructor body. + LiteralConstant(AnnotatedToken *Tok) + : Expr(LiteralConstantClass), Tok(Tok, this) {} + + static bool classof(const ASTElement *T) { + return T->getASTClass() == LiteralConstantClass; + } +}; + +/// Any unary operator, even the overloaded ones. 
+class UnaryOperator : public Expr { +public: + AnnotatedTokenRef OperatorTok; + std::unique_ptr Value; + + /// The OperatorTok(OperatorTok, this) member initializer already points the + /// token at this node (and skips a null token), so no further annotation is + /// needed in the constructor body. + UnaryOperator(AnnotatedToken *OperatorTok, std::unique_ptr Value) + : Expr(UnaryOperatorClass), OperatorTok(OperatorTok, this), + Value(std::move(Value)) {} + + static bool classof(const ASTElement *T) { + return T->getASTClass() == UnaryOperatorClass; + } +}; + +/// Used to store any kind of binary operators, even the overloaded ones. +class BinaryOperator : public Expr { + enum { + LHS, + RHS, + END_EXPR + }; + std::unique_ptr SubExprs[END_EXPR]; + +public: + AnnotatedTokenRef OperatorTok; + + BinaryOperator(std::unique_ptr lhs, std::unique_ptr rhs, + AnnotatedToken *OperatorTok) + : Expr(BinaryOperatorClass), OperatorTok(OperatorTok, this) { + SubExprs[LHS] = std::move(lhs); + SubExprs[RHS] = std::move(rhs); + } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == BinaryOperatorClass; + } + + Expr *getLHS() { return cast(SubExprs[LHS].get()); } + const Expr *getLHS() const { return cast(SubExprs[LHS].get()); } + Expr *getRHS() { return cast(SubExprs[RHS].get()); } + const Expr *getRHS() const { return cast(SubExprs[RHS].get()); } +}; + +/// Function calls +class CallExpr : public Expr { +public: + QualifiedID Qualifier; + enum { + LEFT, + RIGHT, + END_EXPR + }; + AnnotatedTokenRef Parens[END_EXPR]; + llvm::SmallVector, 4> Args; + llvm::SmallVector Commas; + + CallExpr(std::unique_ptr FunctionName) + : Expr(CallExprClass), Qualifier(FunctionName->Qualifier) { + Qualifier.reown(this); + } + + void setParen(int Index, AnnotatedToken *AT) { + Parens[Index] = AnnotatedTokenRef(AT, this); + } + void setLeftParen(AnnotatedToken *AT) { setParen(LEFT, AT); } + void setRightParen(AnnotatedToken *AT) { setParen(RIGHT, AT); } + + void appendComma(AnnotatedToken *AT) { + Commas.push_back(AnnotatedTokenRef(AT, this)); + } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == 
CallExprClass; + } +}; + +/// In contrast to the clang AST, a Stmt is a real statement, that is either a +/// CompoundStmt or a LineStmt. +class Stmt : public ASTElement { +public: + virtual ~Stmt() = 0; // Not optimized + + Stmt(ASTElementClass SC) : ASTElement(SC) {} +}; +inline Stmt::~Stmt() {} + +struct UnparsableBlock : Stmt { + UnparsableBlock() : Stmt(UnparsableBlockClass) {} + void push_back(AnnotatedToken *Tok) { + Body.push_back(AnnotatedTokenRef(Tok, this)); + } + llvm::SmallVector Body; + + static bool classof(const ASTElement *T) { + return T->getASTClass() == UnparsableBlockClass; + } +}; + +class Expr; + +/// By a semicolon terminated statement +class LineStmt : public Stmt { + AnnotatedTokenRef Semi; + +protected: + LineStmt(ASTElementClass SC, AnnotatedToken *Semi) + : Stmt(SC), Semi(Semi, this) {} + LineStmt(ASTElementClass SC, nullptr_t) : Stmt(SC), Semi(nullptr) {} + +public: + void setSemi(AnnotatedToken *Tok) { Semi = AnnotatedTokenRef(Tok, this); } +}; + +/// An expression terminated by a semicolon +struct ExprLineStmt : LineStmt { + ExprLineStmt(std::unique_ptr Body, AnnotatedToken *Semi) + : LineStmt(ExprLineStmtClass, Semi), Body(std::move(Body)) {} + + std::unique_ptr Body; + + static bool classof(const ASTElement *T) { + return T->getASTClass() == ExprLineStmtClass; + } +}; + +struct ReturnStmt : LineStmt { + ReturnStmt(AnnotatedToken *Return, std::unique_ptr Body, + AnnotatedToken *Semi) + : LineStmt(ReturnStmtClass, Semi), Body(std::move(Body)), + Return(Return, this) {} + + std::unique_ptr Body; + AnnotatedTokenRef Return; + + static bool classof(const ASTElement *T) { + return T->getASTClass() == ReturnStmtClass; + } +}; + +struct LabelStmt : Stmt { + AnnotatedTokenRef LabelName, Colon; + + LabelStmt(AnnotatedToken *LabelName, AnnotatedToken *Colon) + : Stmt(LabelStmtClass), LabelName(LabelName, this), Colon(Colon, this) {} + + static bool classof(const ASTElement *T) { + return T->getASTClass() == LabelStmtClass; + } +}; + +/// A 
Type with its decorations. +struct Type : ASTElement { + Type() : ASTElement(TypeClass) {} + + struct Decoration : ASTElement { + enum DecorationClass { + Pointer, + Reference, + }; + Decoration(DecorationClass Class, AnnotatedToken *Tok) + : ASTElement(TypeDecorationClass), Class(Class), Tok(Tok) {} + DecorationClass Class; + AnnotatedToken *Tok; + + /// Points the token at this object. The constructor does not do this, so + /// it has to be called once the decoration sits at its final address. + void fix() { Tok->setASTReference(this); } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == TypeDecorationClass; + } + }; + + llvm::SmallVector Decorations; + QualifiedID Qualifier; + + /// Appends \p Dec and (re-)annotates the token of every stored decoration. + /// This has to happen on every call: the copy stored in the vector has not + /// been annotated yet, and push_back may have relocated the older elements. + void addDecoration(Decoration Dec) { + Decorations.push_back(Dec); + for (auto &D : Decorations) + D.fix(); + } + + void addNameQualifier(AnnotatedToken *NameTok) { + Qualifier.addNameQualifier(NameTok, this); + } + void makeTemplateArgs() { Qualifier.makeTemplateArgs(); } + void addTemplateSeparator(AnnotatedToken *ATok) { + Qualifier.addTemplateSeparator(ATok, this); + } + void addTemplateArgument(std::unique_ptr T) { + Qualifier.addTemplateArgument(std::move(T)); + } + void addTemplateArgument(std::unique_ptr E) { + Qualifier.addTemplateArgument(std::move(E)); + } + static bool classof(const ASTElement *T) { + return T->getASTClass() == TypeClass; + } + + /// Returns a new Type that copies the qualifier (re-owned by the clone) but + /// none of the decorations. + std::unique_ptr cloneWithoutDecorations() { + auto Clone = llvm::make_unique(); + Clone->Qualifier = Qualifier; + Clone->Qualifier.reown(Clone.get()); + return Clone; + } +}; + +inline TypeOrExpression::TypeOrExpression(std::unique_ptr T) + : Ptr(std::move(T)) {} + +/// Initialization of a variable +struct VarInitialization : ASTElement { + enum InitializationType { + NONE = 0, + ASSIGNMENT, + CONSTRUCTOR, + BRACE, + }; + VarInitialization() : ASTElement(VarInitializationClass), InitType(NONE) {} + + /// Records \p InitType and claims the operator tokens for this node: only + /// AssignmentOps[0] for ASSIGNMENT, both tokens (opening and closing) + /// otherwise. + void setAssignmentOps(InitializationType InitType, + AnnotatedToken AssignmentOps[2]) { + this->InitType = InitType; + if (InitType == ASSIGNMENT) 
{ + this->AssignmentOps[0] = AnnotatedTokenRef(&AssignmentOps[0], this); + this->AssignmentOps[1] = AnnotatedTokenRef(nullptr); + } else { + this->AssignmentOps[0] = AnnotatedTokenRef(&AssignmentOps[0], this); + this->AssignmentOps[1] = AnnotatedTokenRef(&AssignmentOps[1], this); + } + } + + InitializationType InitType; + AnnotatedTokenRef AssignmentOps[2]; // '=' or '('+')' or '{'+'}' + std::unique_ptr Value; + + static bool classof(const ASTElement *T) { + return T->getASTClass() == VarInitializationClass; + } +}; + +/// Declaration of a variable with optional initialization +struct VarDecl : ASTElement { + VarDecl() : ASTElement(VarDeclClass) {} + + void setName(AnnotatedToken *Tok) { + this->NameTok = AnnotatedTokenRef(Tok, this); + } + + std::unique_ptr VariableType; + AnnotatedTokenRef NameTok; + llvm::Optional Value; + + static bool classof(const ASTElement *T) { + return T->getASTClass() == VarDeclClass; + } +}; + +/// Only for variable declarations (for now) +struct DeclStmt : LineStmt { + llvm::SmallVector, 2> Decls; + llvm::SmallVector Commas; + + DeclStmt() : LineStmt(DeclStmtClass, nullptr) {} + + void appendComma(AnnotatedToken *Tok) { + Commas.push_back(AnnotatedTokenRef(Tok, this)); + } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == DeclStmtClass; + } +}; + +class CompoundStmt; + +struct TemplateParameterType : ASTElement { + TemplateParameterType() : ASTElement(TemplateParameterTypeClass) {} + enum { + KEYWORD, + NAME, + EQUAL, + END_EXPR + }; + AnnotatedTokenRef Refs[END_EXPR]; + std::unique_ptr DefaultType; + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setKeyword(AnnotatedToken *Tok) { setRef(KEYWORD, Tok); } + void setName(AnnotatedToken *Tok) { setRef(NAME, Tok); } + void setEqual(AnnotatedToken *Tok) { setRef(EQUAL, Tok); } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == TemplateParameterTypeClass; + } +}; + +struct 
TemplateDecl : Stmt { + TemplateDecl() : Stmt(TemplateDeclClass) {} + + std::unique_ptr Templated; + enum { + KEYWORD, + LEFT, + RIGHT, + END_EXPR + }; + AnnotatedTokenRef Refs[END_EXPR]; + + llvm::SmallVector, 2> Params; + llvm::SmallVector Commas; + + void addParam(std::unique_ptr P) { + Params.push_back(std::move(P)); + } + void addComma(AnnotatedToken *Tok) { + Commas.push_back(AnnotatedTokenRef(Tok, this)); + } + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setKeyword(AnnotatedToken *Tok) { setRef(KEYWORD, Tok); } + void setLess(AnnotatedToken *Tok) { setRef(LEFT, Tok); } + void setGreater(AnnotatedToken *Tok) { setRef(RIGHT, Tok); } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == TemplateDeclClass; + } +}; + +struct FunctionDecl : Stmt { + FunctionDecl() : Stmt(FunctionDeclClass) {} + enum { + LEFT, + RIGHT, + SEMI, + END_EXPR + }; + AnnotatedTokenRef Refs[END_EXPR]; + llvm::SmallVector Decls; + llvm::SmallVector, 4> Params; + llvm::SmallVector Commas; + + void appendComma(AnnotatedToken *AT) { + Commas.push_back(AnnotatedTokenRef(AT, this)); + } + + std::unique_ptr ReturnType; + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setLeftBrace(AnnotatedToken *Tok) { setRef(LEFT, Tok); } + void setRightBrace(AnnotatedToken *Tok) { setRef(RIGHT, Tok); } + void setSemi(AnnotatedToken *Tok) { setRef(SEMI, Tok); } + void addDeclSpecifier(AnnotatedToken *Tok) { + Decls.push_back(AnnotatedTokenRef(Tok, this)); + } + + QualifiedID Name; + void addNameQualifier(AnnotatedToken *NameTok) { + Name.addNameQualifier(NameTok, this); + } + void makeTemplateArgs(AnnotatedToken *Tok) { + llvm_unreachable("don't add template arguments to function names"); + } + void addTemplateSeparator(AnnotatedToken *Tok) { + llvm_unreachable("don't add template arguments to function names"); + } + + std::unique_ptr Body; + + static bool classof(const 
ASTElement *T) { + return T->getASTClass() == FunctionDeclClass; + } +}; + +template class IndirectRange { +public: + IndirectRange(Iter First, Iter Last) : First(First), Last(Last) {} + struct IndirectIter { + IndirectIter(Iter Pos) : Pos(Pos) {} + Iter Pos; + friend bool operator==(IndirectIter LHS, IndirectIter RHS) { + return LHS.Pos == RHS.Pos; + } + friend bool operator!=(IndirectIter LHS, IndirectIter RHS) { + return LHS.Pos != RHS.Pos; + } + IndirectIter operator++() { + ++Pos; + return *this; + } + IndirectIter operator++(int) { + auto Self = *this; + ++*this; + return Self; + } + Value &operator*() { return **Pos; } + }; + + IndirectIter begin() { return First; } + IndirectIter end() { return Last; } + + std::size_t size() const { + static_assert( + std::is_base_of< + std::random_access_iterator_tag, + typename std::iterator_traits::iterator_category>::value, + "Size only allowed for Random Access Iterators."); + return std::distance(First.Pos, Last.Pos); + } + +private: + IndirectIter First, Last; +}; + +struct Scope { + using child_range = IndirectRange< + llvm::SmallVector, 8>::iterator, Stmt>; + using const_child_range = IndirectRange< + llvm::SmallVector, 8>::const_iterator, Stmt>; + + llvm::SmallVector, 8> Body; + + child_range children() { return child_range(Body.begin(), Body.end()); } + const_child_range children() const { + return const_child_range(Body.begin(), Body.end()); + } + + void addStmt(std::unique_ptr Statement) { + Body.push_back(std::move(Statement)); + } +}; + +template struct BlockScope : Scope { + enum { + LBR, + RBR, + END_EXPR + }; + AnnotatedTokenRef Braces[END_EXPR]; + void setBrace(int BraceIdx, AnnotatedToken *Tok) { + assert(0 <= BraceIdx && BraceIdx < END_EXPR); + Braces[BraceIdx] = AnnotatedTokenRef(Tok, static_cast(this)); + } + void setLeftBrace(AnnotatedToken *Tok) { setBrace(LBR, Tok); } + void setRightBrace(AnnotatedToken *Tok) { setBrace(RBR, Tok); } + + bool hasScope() const { return Braces[LBR]; } +}; + +/// A 
{}-Block with Statements inside. +class CompoundStmt : public Stmt, public BlockScope { +public: + CompoundStmt(AnnotatedToken *lbr, AnnotatedToken *rbr) + : Stmt(CompoundStmtClass) { + setLeftBrace(lbr); + setRightBrace(rbr); + } + + CompoundStmt() : Stmt(CompoundStmtClass) {} + + static bool classof(const ASTElement *T) { + return T->getASTClass() == CompoundStmtClass; + } +}; + +using CondExpr = std::unique_ptr; + +struct WhileStmt : Stmt { + WhileStmt() : Stmt(WhileStmtClass) {} + + CondExpr Cond; + std::unique_ptr Body; + + enum { + KEYWORD, + LEFT, + RIGHT, + END_EXPR, + }; + AnnotatedTokenRef Refs[END_EXPR]; + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setKeyword(AnnotatedToken *Tok) { setRef(KEYWORD, Tok); } + void setLeftParen(AnnotatedToken *Tok) { setRef(LEFT, Tok); } + void setRightParen(AnnotatedToken *Tok) { setRef(RIGHT, Tok); } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == WhileStmtClass; + } +}; + +/// A do-while loop. Derives from LineStmt, which stores the terminating +/// semicolon. +struct DoWhileStmt : LineStmt { + DoWhileStmt() : LineStmt(DoWhileStmtClass, nullptr) {} + + CondExpr Cond; + std::unique_ptr Body; + + enum { + KEYWORD_DO, + KEYWORD_WHILE, + LEFT, + RIGHT, + END_EXPR, + }; + AnnotatedTokenRef Refs[END_EXPR]; + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setDo(AnnotatedToken *Tok) { setRef(KEYWORD_DO, Tok); } + void setWhile(AnnotatedToken *Tok) { setRef(KEYWORD_WHILE, Tok); } + void setLeftParen(AnnotatedToken *Tok) { setRef(LEFT, Tok); } + void setRightParen(AnnotatedToken *Tok) { setRef(RIGHT, Tok); } + + // Must test the tag the constructor passes to LineStmt (DoWhileStmtClass), + // not WhileStmtClass, or isa/cast would confuse the two loop kinds. + static bool classof(const ASTElement *T) { + return T->getASTClass() == DoWhileStmtClass; + } +}; + +struct IfStmt : Stmt { + IfStmt() : Stmt(IfStmtClass) {} + + struct IfBranch { + CondExpr Cond; + std::unique_ptr Body; + + enum { + KEYWORD1, + KEYWORD2, + LEFT, + RIGHT, + END_EXPR, + }; + AnnotatedTokenRef Refs[END_EXPR]; + + IfBranch(ASTElement *ASTRef, 
AnnotatedToken *Keyword1, + AnnotatedToken *Keyword2, AnnotatedToken *LeftParen, CondExpr Cond, + AnnotatedToken *RightParen, std::unique_ptr Body) + : Cond(std::move(Cond)), Body(std::move(Body)) { + setRef(KEYWORD1, Keyword1, ASTRef); + setRef(KEYWORD2, Keyword2, ASTRef); + setRef(LEFT, LeftParen, ASTRef); + setRef(RIGHT, RightParen, ASTRef); + } + void setRef(int Index, AnnotatedToken *Tok, ASTElement *ASTRef) { + Refs[Index] = AnnotatedTokenRef(Tok, ASTRef); + } + }; + + llvm::SmallVector Branches; + + void addBranch(AnnotatedToken *Keyword1, AnnotatedToken *Keyword2, + AnnotatedToken *LeftParen, CondExpr Cond, + AnnotatedToken *RightParen, std::unique_ptr Body) { + Branches.push_back(IfBranch(this, Keyword1, Keyword2, LeftParen, + std::move(Cond), RightParen, std::move(Body))); + } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == IfStmtClass; + } +}; + +struct ForStmt : Stmt { + + ForStmt() : Stmt(ForStmtClass) {} + + CondExpr Init, Cond; + std::unique_ptr Inc; + std::unique_ptr Body; + + enum { + KEYWORD, + LEFT, + RIGHT, + SEMI1, + SEMI2, + END_EXPR, + }; + AnnotatedTokenRef Refs[END_EXPR]; + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setKeyword(AnnotatedToken *Tok) { setRef(KEYWORD, Tok); } + void setLeftParen(AnnotatedToken *Tok) { setRef(LEFT, Tok); } + void setRightParen(AnnotatedToken *Tok) { setRef(RIGHT, Tok); } + + void setSemi1(AnnotatedToken *Tok) { setRef(SEMI1, Tok); } + void setSemi2(AnnotatedToken *Tok) { setRef(SEMI2, Tok); } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == ForStmtClass; + } +}; + +struct ClassDecl : LineStmt, BlockScope { + enum { + CLASS, + COLON, + END_EXPR + }; + AnnotatedTokenRef Refs[END_EXPR]; + + std::unique_ptr Name; + + struct BaseClass { + AnnotatedTokenRef Accessibility, Comma; + std::unique_ptr T; + }; + + llvm::SmallVector BaseClasses; + + ClassDecl() : LineStmt(ClassDeclClass, nullptr) {} + + void 
setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setClass(AnnotatedToken *Tok) { setRef(CLASS, Tok); } + void setColon(AnnotatedToken *Tok) { setRef(COLON, Tok); } + + void addBaseClass(AnnotatedToken *Accessibility, std::unique_ptr T, + AnnotatedToken *Comma) { + BaseClasses.push_back({ AnnotatedTokenRef(Accessibility, this), + AnnotatedTokenRef(Comma, this), + std::move(T), }); + } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == ClassDeclClass; + } +}; + +struct NamespaceDecl : Stmt, BlockScope { + enum { + NAMESPACE, + NAME, + END_EXPR + }; + AnnotatedTokenRef Refs[END_EXPR]; + + NamespaceDecl() : Stmt(NamespaceDeclClass) {} + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setNamespace(AnnotatedToken *Tok) { setRef(NAMESPACE, Tok); } + void setName(AnnotatedToken *Tok) { setRef(NAME, Tok); } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == NamespaceDeclClass; + } +}; + +struct PPDirective : ASTElement { +protected: + PPDirective(ASTElementClass SC) : ASTElement(SC) {} + +public: + static bool classof(const ASTElement *T) { + auto Class = T->getASTClass(); + return firstPPDirective <= Class && Class <= lastPPDirective; + } +}; + +struct PPString : ASTElement { + PPString() : ASTElement(PPStringClass) {} + + llvm::SmallVector Refs; + + void addToken(AnnotatedToken *Tok) { + Refs.push_back(AnnotatedTokenRef(Tok, this)); + } + + static bool classof(const ASTElement *T) { + return T->getASTClass() == PPStringClass; + } +}; + +struct PPInclude : PPDirective { + PPInclude() : PPDirective(PPIncludeClass) {} + + enum { + HASH, + INCLUDE, + EOD, + END_EXPR + }; + AnnotatedTokenRef Refs[END_EXPR]; + std::unique_ptr Path; + + void setRef(int Index, AnnotatedToken *Tok) { + Refs[Index] = AnnotatedTokenRef(Tok, this); + } + void setHash(AnnotatedToken *Tok) { setRef(HASH, Tok); } + void setInclude(AnnotatedToken 
//===--- FuzzyASTPrinter.cpp - clang-highlight ------------------*- C++ -*-===//
Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/STLExtras.h" +#include "FuzzyAST.h" + +using namespace llvm; + +namespace clang { +namespace fuzzy { + +namespace { +struct Indented { + const int Indent; + explicit Indented(int Indent) : Indent(Indent) {} + friend raw_ostream &operator<<(raw_ostream &OS, Indented ID) { + const int Total = 4 * ID.Indent; + for (int i = 0; i < Total; ++i) + OS.write(' '); + return OS; + } + Indented next() { return Indented(Indent + 1); } +}; +} // end anonymous namespace + +namespace { +struct ASTPrinter { + const SourceManager &SourceMgr; + raw_ostream &OS; + + void print(Indented Indent, const Type &T); + void print(Indented Indent, const VarDecl &DCL); + void print(Indented Indent, const Expr &EXP); + void print(Indented Indent, const Stmt &stmt); + void print(Indented Indent, const QualifiedID &Qual); + void print(Indented Indent, const PPDirective &Qual); + void printScope(Indented Indent, const Scope &Sc); + void printCondition(Indented Indent, const char *Name, ASTElement *E); +}; +} // end anonymous namespace + +void ASTPrinter::printScope(Indented Indent, const Scope &Sc) { + OS << "{\n"; + for (auto &S : Sc.children()) + print(Indent.next(), S); + OS << Indent << "}\n"; +} + +void ASTPrinter::printCondition(Indented Indent, const char *Name, + ASTElement *E) { + OS << Indent.next() << Name << (E ? 
"\n" : ": \n"); + if (E) { + if (auto *D = dyn_cast(E)) + print(Indent.next().next(), *D); + else if (auto *V = dyn_cast(E)) + print(Indent.next().next(), *V); + else if (auto *U = dyn_cast(E)) + print(Indent.next().next(), *U); + else + print(Indent.next().next(), *cast(E)); + } +} + +void ASTPrinter::print(Indented Indent, const QualifiedID &Qual) { + for (auto &N : Qual.NameSegments) { + OS << N->getText(SourceMgr); + } + if (Qual.TemplateArgs) { + OS << "\n" << Indent << "<\n"; + for (auto &A : (*Qual.TemplateArgs)->Args) { + if (A.isType()) + print(Indent.next(), A.asType()); + else + print(Indent.next(), A.asExpr()); + } + OS << Indent << '>'; + } +} + +void ASTPrinter::print(Indented Indent, const Type &T) { + OS << Indent << "Type "; + for (auto &D : T.Decorations) + OS << '\'' << D.Tok->getText(SourceMgr) << "' "; + OS << '\''; + print(Indent.next(), T.Qualifier); + OS << "'\n"; +} + +void ASTPrinter::print(Indented Indent, const VarDecl &DCL) { + OS << Indent << "VarDecl '" << DCL.NameTok->getText(SourceMgr) << "'\n"; + print(Indent.next(), *DCL.VariableType); + if (DCL.Value) { + const char *InitName[] = { "?", "=", "()", "{}" }; + assert(1 <= DCL.Value->InitType && DCL.Value->InitType < 4); + OS << Indent.next() << "Assignment Type '" << InitName[DCL.Value->InitType] + << "'\n"; + assert(DCL.Value->Value); + print(Indent.next(), *DCL.Value->Value); + } +} + +void ASTPrinter::print(Indented Indent, const Expr &EXP) { + if (auto *BinOp = dyn_cast(&EXP)) { + print(Indent.next(), *BinOp->getLHS()); + OS << Indent << tok::getTokenName(BinOp->OperatorTok->getTokenKind()) + << '\n'; + print(Indent.next(), *BinOp->getRHS()); + } else if (auto *Decl = dyn_cast(&EXP)) { + OS << Indent << "DeclRefExpr '"; + print(Indent.next(), Decl->Qualifier); + OS << "'\n"; + } else if (auto *Lit = dyn_cast(&EXP)) { + OS << Indent << Lit->Tok->getText(SourceMgr) << '\n'; + } else if (auto *Call = dyn_cast(&EXP)) { + OS << Indent << "call expr '"; + print(Indent.next(), 
Call->Qualifier); + OS << "'\n"; + for (auto &Arg : Call->Args) + print(Indent.next(), *Arg); + } else if (auto *Unar = dyn_cast(&EXP)) { + OS << Indent << Unar->OperatorTok->getText(SourceMgr) << "\n"; + print(Indent.next(), *Unar->Value); + } else if (auto *PE = dyn_cast(&EXP)) { + OS << Indent << "ParenExpr:\n"; + print(Indent.next(), *PE->Value); + } else { + llvm_unreachable("TODO: unhandled fuzzy ast node of type Expr"); + } +} + +void ASTPrinter::print(Indented Indent, const Stmt &stmt) { + if (auto *DS = dyn_cast(&stmt)) { + OS << Indent << "DeclStmt\n"; + for (const auto &VD : DS->Decls) + print(Indent.next(), *VD); + } else if (auto *UB = dyn_cast(&stmt)) { + (void)UB; + OS << Indent << "Unparsable Block:\n"; + for (auto T : UB->Body) + OS << Indent.next() << T->getText(SourceMgr) << '\n'; + } else if (auto *ELS = dyn_cast(&stmt)) { + OS << Indent << "ExprLineStmt\n"; + print(Indent.next(), *ELS->Body); + } else if (auto *RS = dyn_cast(&stmt)) { + OS << Indent << "ReturnStmt\n"; + if (RS->Body) + print(Indent.next(), *RS->Body); + else + OS << Indent.next() << "\n"; + } else if (auto *FD = dyn_cast(&stmt)) { + OS << Indent << "FunctionDecl '"; + print(Indent.next().next(), FD->Name); + OS << "'\n" << Indent.next() << "Body:\n"; + if (FD->Body) + print(Indent.next().next(), *FD->Body); + } else if (auto *CD = dyn_cast(&stmt)) { + OS << Indent << '\'' << CD->Refs[ClassDecl::CLASS]->getText(SourceMgr) + << "' "; + print(Indent.next(), *CD->Name); + if (!CD->BaseClasses.empty()) { + OS << " derived from\n"; + for (auto &BC : CD->BaseClasses) { + OS << Indent.next() + << (BC.Accessibility ? 
BC.Accessibility->getText(SourceMgr) + : "") << ' '; + print(Indent.next().next(), *BC.T); + } + } + if (!CD->hasScope()) + OS << " (declaration only)\n"; + else + printScope(Indent, *CD); + } else if (auto *LBL = dyn_cast(&stmt)) { + OS << Indent << "Label '" << LBL->LabelName->getText(SourceMgr) << "'\n"; + } else if (auto *NS = dyn_cast(&stmt)) { + OS << Indent << "Namespace '" + << (NS->Refs[NamespaceDecl::NAME] + ? NS->Refs[NamespaceDecl::NAME]->getText(SourceMgr) + : "") << '\''; + printScope(Indent, *NS); + } else if (auto TD = dyn_cast(&stmt)) { + OS << Indent << "Template <'\n"; + for (auto &A : TD->Params) { + if (auto *E = dyn_cast(A.get())) + print(Indent.next().next(), *E); + else if (auto *VD = dyn_cast(A.get())) + print(Indent.next().next(), *VD); + else + print(Indent.next().next(), *static_cast(A.get())); + } + OS << Indent.next() << "> with Body:\n"; + print(Indent.next().next(), *TD->Templated); + } else if (auto *If = dyn_cast(&stmt)) { + OS << Indent << "If\n"; + for (auto &B : If->Branches) { + printCondition(Indent, "Condition", B.Cond.get()); + OS << Indent.next() << "Body:\n"; + print(Indent.next().next(), *B.Body); + } + } else if (auto *CS = dyn_cast(&stmt)) { + OS << Indent << "CompoundStmt:\n"; + for (auto &S : CS->Body) + print(Indent.next(), *S); + } else if (auto *While = dyn_cast(&stmt)) { + OS << Indent << "WhileStmt:\n"; + printCondition(Indent, "Condition", While->Cond.get()); + OS << Indent.next() << "Body:\n"; + print(Indent.next().next(), *While->Body); + } else if (auto *For = dyn_cast(&stmt)) { + OS << Indent << "ForStmt:\n"; + printCondition(Indent, "Init", For->Init.get()); + printCondition(Indent, "Condition", For->Cond.get()); + printCondition(Indent, "Incr", For->Inc.get()); + OS << Indent.next() << "Body:\n"; + print(Indent.next().next(), *For->Body); + } else { + llvm_unreachable("TODO: unhandled fuzzy ast node"); + } +} + +void ASTPrinter::print(Indented Indent, const PPDirective &PP) { + if (auto *Inc = 
dyn_cast(&PP)) { + OS << Indent << "Include Directive: '"; + if (Inc->Path) + for (auto &S : Inc->Path->Refs) + OS << S->getText(SourceMgr); + OS << "'\n"; + } else if (auto *If = dyn_cast(&PP)) { + OS << Indent << "Preprocessor '" + << If->Refs[PPIf::KEYWORD]->getText(SourceMgr) << "':\n"; + if (If->Cond) { + if (auto *E = dyn_cast(If->Cond.get())) + print(Indent.next(), *E); + else + print(Indent.next(), *cast(If->Cond.get())); + } + } else if (auto *UP = dyn_cast(&PP)) { + OS << Indent << "Unparsable PP:\n"; + for (auto R : UP->Refs) + OS << Indent.next() << R->getText(SourceMgr) << '\n'; + } else { + llvm_unreachable("TODO: unhandled preprocessor directive"); + } +} + +void printAST(raw_ostream &OS, const Stmt &Root, + const SourceManager &SourceMgr) { + ASTPrinter AP{ SourceMgr, OS }; + AP.print(Indented(0), Root); +} + +void printAST(raw_ostream &OS, const TranslationUnit &TU, + const SourceManager &SourceMgr) { + ASTPrinter AP{ SourceMgr, OS }; + for (auto &P : TU.PPDirectives) { + assert(P); + AP.print(Indented(0), *P); + } + for (auto &S : TU.Body) { + assert(S); + AP.print(Indented(0), *S); + } +} + +} // end namespace fuzzy +} // end namespace clang Index: clang-highlight/Fuzzy/FuzzyParser.cpp =================================================================== --- /dev/null +++ clang-highlight/Fuzzy/FuzzyParser.cpp @@ -0,0 +1,1069 @@ +//===--- FuzzyParser.cpp - clang-highlight ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
/// Cursor over the half-open token range [First, Last) that hides tokens the
/// parser does not want to see.
///
/// NOTE(review): the template parameter list is missing in this text;
/// SkipPreprocessor is used below as a boolean switch — confirm the original
/// declaration.
template class BasicTokenFilter {
  AnnotatedToken *First, *Last;

  /// Advances First past tok::unknown and tok::comment tokens. When
  /// SkipPreprocessor is set and the current token is a '#' at the start of a
  /// line, also skips everything up to and including the next tok::eod and
  /// then repeats.
  void skipWhitespaces() {
    for (;;) {
      while (First != Last && (First->getTokenKind() == tok::unknown ||
                               First->getTokenKind() == tok::comment))
        ++First;

      // NOTE(review): First is dereferenced here without re-checking
      // First != Last; this looks like it relies on the range ending in a
      // token that stops the loop above — confirm.
      if (SkipPreprocessor && First->getTokenKind() == tok::hash &&
          First->Tok().isAtStartOfLine())
        while (First != Last && First++->getTokenKind() != tok::eod)
          ;
      else
        break;
    }
    assert(First <= Last);
  }

public:
  /// Positions the cursor on the first visible token of [First, Last).
  BasicTokenFilter(AnnotatedToken *First, AnnotatedToken *Last)
      : First(First), Last(Last) {
    skipWhitespaces();
  }

  /// Returns the current token and moves on to the next visible one.
  /// Must not be called at end of input.
  AnnotatedToken *next() {
    assert(!eof());
    auto Ret = First++;
    skipWhitespaces();
    assert(Ret->getTokenKind() != tok::raw_identifier);
    return Ret;
  }

  /// Opaque snapshot of the cursor, produced by mark() and consumed by
  /// rewind().
  class TokenFilterState {
    friend class BasicTokenFilter;
    TokenFilterState(AnnotatedToken *First, AnnotatedToken *Last)
        : First(First), Last(Last) {}
    AnnotatedToken *First, *Last;
  };

  TokenFilterState mark() const { return TokenFilterState(First, Last); }
  void rewind(TokenFilterState State) {
    First = State.First;
    Last = State.Last;
  }

  /// Builds a filter over the tokens from From's position up to and
  /// including the token at To's position. Both states must come from the
  /// same underlying range.
  BasicTokenFilter rangeAsTokenFilter(TokenFilterState From,
                                      TokenFilterState To) const {
    assert(From.Last == To.Last);
    assert(From.First <= To.First);
    assert(To.First < To.Last);
    return BasicTokenFilter(From.First, To.First + 1);
  }

  /// Scope guard for backtracking: on destruction it rewinds the filter to
  /// the state captured at construction, unless dismiss() was called.
  class TokenFilterGuard {
    friend class BasicTokenFilter;
    TokenFilterGuard(BasicTokenFilter *TF, TokenFilterState State)
        : TF(TF), State(State) {}

  public:
    ~TokenFilterGuard() {
      if (TF)
        TF->rewind(State);
    }
    /// Keeps the tokens consumed so far; the destructor becomes a no-op.
    void dismiss() { TF = nullptr; }
    BasicTokenFilter *TF;
    TokenFilterState State;
  };
  TokenFilterGuard guard() { return TokenFilterGuard(this, mark()); }

  /// Current token, without consuming it.
  AnnotatedToken *peek() { return First; }
  const AnnotatedToken *peek() const { return First; }
  tok::TokenKind peekKind() const { return First->getTokenKind(); }

  /// True once the cursor sits on the tok::eof token.
  bool eof() const { return peekKind() == tok::eof; }
};
tok::kw_this: + case tok::kw_true: + case tok::kw_false: + case tok::kw___objc_yes: + case tok::kw___objc_no: + case tok::kw_nullptr: + return true; + default: + return false; + } +} + +template +static bool parseNamespaceQualifiers(TokenFilter &TF, QualOwner &Qual) { + auto Guard = TF.guard(); + + if (checkKind(TF, tok::kw_operator)) { + Qual.addNameQualifier(TF.next()); + if (!TF.peek()) + return false; + Qual.addNameQualifier(TF.next()); + Guard.dismiss(); + return true; + } + + bool GlobalNamespaceColon = true; + do { + if (checkKind(TF, tok::coloncolon)) + Qual.addNameQualifier(TF.next()); + else if (!GlobalNamespaceColon) + return false; + GlobalNamespaceColon = false; + if (!checkKind(TF, tok::identifier)) + return false; + Qual.addNameQualifier(TF.next()); + } while (checkKind(TF, tok::coloncolon)); + + Guard.dismiss(); + return true; +} + +template +static bool parseTemplateArgs(TokenFilter &TF, QualOwner &Qual, + std::false_type) { + return true; +} +template +static bool parseTemplateArgs(TokenFilter &TF, QualOwner &Qual, + std::true_type) { + auto Guard = TF.guard(); + + if (checkKind(TF, tok::less)) { + Qual.makeTemplateArgs(); + bool isFirst = true; + do { + Qual.addTemplateSeparator(TF.next()); + + if (isFirst && checkKind(TF, tok::greater)) + break; + isFirst = false; + + if (auto Arg = parseType(TF)) + Qual.addTemplateArgument(std::move(Arg)); + else if (auto E = parseExpr(TF, prec::Comma + 1, /*StopAtGreater=*/true)) + Qual.addTemplateArgument(std::move(E)); + else + return false; + } while (checkKind(TF, tok::comma)); + if (!checkKind(TF, tok::greater)) + return false; + Qual.addTemplateSeparator(TF.next()); + } + + Guard.dismiss(); + return true; +} + +template +static bool parseQualifiedID(TokenFilter &TF, QualOwner &Qual, + WithTemplateArgs WTA = std::true_type{}) { + auto Guard = TF.guard(); + if (parseNamespaceQualifiers(TF, Qual) && parseTemplateArgs(TF, Qual, WTA)) { + Guard.dismiss(); + return true; + } + return false; +} + +static 
std::unique_ptr parseExpr(TokenFilter &TF, int Precedence, + bool StopAtGreater) { + if (!TF.peek()) + return {}; + + if (Precedence == PrecedenceUnaryOperator) + return parseUnaryOperator(TF); + + if (Precedence > PrecedenceArrowAndPeriod) { + if (isLiteralOrConstant(TF.peekKind())) + return make_unique(TF.next()); + + if (checkKind(TF, tok::l_paren)) { + auto Left = TF.next(); + auto Val = parseExpr(TF, 1, false); + if (!checkKind(TF, tok::r_paren)) + return {}; + auto Right = TF.next(); + return make_unique(Left, std::move(Val), Right); + } + + if (checkKind(TF, tok::identifier) || checkKind(TF, tok::coloncolon)) { + auto DR = make_unique(); + if (!parseQualifiedID(TF, *DR) && + !parseQualifiedID(TF, *DR, std::false_type{})) + return {}; + if (checkKind(TF, tok::l_paren)) + return parseCallExpr(TF, std::move(DR)); + std::unique_ptr Ret = std::move(DR); + while (checkKind(TF, tok::plusplus) || checkKind(TF, tok::minusminus)) + Ret = make_unique(TF.next(), std::move(Ret)); + return std::move(Ret); + } + + return {}; + } + auto LeftExpr = parseExpr(TF, Precedence + 1, StopAtGreater); + if (!LeftExpr) + return {}; + + while (!TF.eof()) { + if (StopAtGreater && checkKind(TF, tok::greater)) + break; + + int CurrentPrecedence = getBinOpPrecedence(TF.peekKind(), true, true); + if (checkKind(TF, tok::period) || checkKind(TF, tok::arrow)) + CurrentPrecedence = PrecedenceArrowAndPeriod; + if (CurrentPrecedence == 0) + return LeftExpr; + + assert(CurrentPrecedence <= Precedence); + if (CurrentPrecedence < Precedence) + break; + assert(CurrentPrecedence == Precedence); + + AnnotatedToken *OperatorTok = TF.next(); + + auto RightExpr = parseExpr(TF, Precedence + 1, StopAtGreater); + if (!RightExpr) + return {}; + + LeftExpr = make_unique(std::move(LeftExpr), + std::move(RightExpr), OperatorTok); + } + + return LeftExpr; +} + +static std::unique_ptr parseReturnStmt(TokenFilter &TF) { + auto Guard = TF.guard(); + if (!checkKind(TF, tok::kw_return)) + return {}; + auto *Return = 
TF.next(); + std::unique_ptr Body; + if (!checkKind(TF, tok::semi)) { + Body = parseExpr(TF); + if (!Body || !checkKind(TF, tok::semi)) + return {}; + } + assert(checkKind(TF, tok::semi)); + auto *Semi = TF.next(); + Guard.dismiss(); + return make_unique(Return, std::move(Body), Semi); +} + +static void parseTypeDecorations(TokenFilter &TF, Type &T) { + // TODO: add const and volatile + while (checkKind(TF, tok::star) || checkKind(TF, tok::amp) || + checkKind(TF, tok::ampamp)) + T.Decorations.push_back(Type::Decoration(checkKind(TF, tok::star) + ? Type::Decoration::Pointer + : Type::Decoration::Reference, + TF.next())); + for (auto &Dec : T.Decorations) + Dec.fix(); +} + +static bool isBuiltinType(tok::TokenKind K) { + switch (K) { + case tok::kw_short: + case tok::kw_long: + case tok::kw___int64: + case tok::kw___int128: + case tok::kw_signed: + case tok::kw_unsigned: + case tok::kw__Complex: + case tok::kw__Imaginary: + case tok::kw_void: + case tok::kw_char: + case tok::kw_wchar_t: + case tok::kw_char16_t: + case tok::kw_char32_t: + case tok::kw_int: + case tok::kw_half: + case tok::kw_float: + case tok::kw_double: + case tok::kw_bool: + case tok::kw__Bool: + case tok::kw__Decimal32: + case tok::kw__Decimal64: + case tok::kw__Decimal128: + case tok::kw___vector: + return true; + default: + return false; + } +} + +static bool isCVQualifier(tok::TokenKind K) { + switch (K) { + case tok::kw_const: + case tok::kw_constexpr: + case tok::kw_volatile: + case tok::kw_register: + return true; + default: + return false; + } +} + +static std::unique_ptr parseType(TokenFilter &TF, bool WithDecorations) { + auto Guard = TF.guard(); + std::unique_ptr T = make_unique(); + + while (isCVQualifier(TF.peekKind()) || checkKind(TF, tok::kw_typename)) + T->addNameQualifier(TF.next()); + + if (checkKind(TF, tok::kw_auto)) { + T->addNameQualifier(TF.next()); + } else if (isBuiltinType(TF.peekKind())) { + while (isBuiltinType(TF.peekKind())) + T->addNameQualifier(TF.next()); + } else if 
(!parseQualifiedID(TF, *T)) { + return {}; + } + while (isCVQualifier(TF.peekKind())) + T->addNameQualifier(TF.next()); + + if (WithDecorations) + parseTypeDecorations(TF, *T); + + Guard.dismiss(); + return T; +} + +static std::unique_ptr parseVarDecl(TokenFilter &TF, + Type *TypeName = 0, + bool NameOptional = false, + bool StopAtGreater = false) { + auto Guard = TF.guard(); + auto VD = make_unique(); + VarDecl &D = *VD; + + if (!TypeName) { + D.VariableType = parseType(TF); + if (!D.VariableType) + return {}; + } else { + D.VariableType = TypeName->cloneWithoutDecorations(); + } + parseTypeDecorations(TF, *D.VariableType); + + if (checkKind(TF, tok::identifier)) { + D.setName(TF.next()); + } else if (!NameOptional) { + return {}; + } + + if (checkKind(TF, tok::equal)) { + auto *EqualTok = TF.next(); + if (auto Value = parseExpr(TF, prec::Comma + 1, StopAtGreater)) { + D.Value = VarInitialization(); + D.Value->setAssignmentOps(VarInitialization::ASSIGNMENT, EqualTok); + D.Value->Value = std::move(Value); + } else { + return {}; + } + } else { + // TODO: var(init) and var{init} not yet implemented + } + Guard.dismiss(); + return VD; +} + +static std::unique_ptr parseDeclStmt(TokenFilter &TF, + bool WithSemi = true) { + auto Guard = TF.guard(); + + auto TypeName = parseType(TF, /*WithDecorations=*/false); + if (!TypeName) + return {}; + auto Declaration = make_unique(); + + while (!TF.eof()) { + if (checkKind(TF, tok::semi)) { + if (Declaration->Decls.empty()) + return {}; + if (WithSemi) + Declaration->setSemi(TF.next()); + Guard.dismiss(); + return std::move(Declaration); + } + if (auto D = parseVarDecl(TF, TypeName.get())) + Declaration->Decls.push_back(std::move(D)); + else + return {}; + + if (checkKind(TF, tok::comma)) { + Declaration->appendComma(TF.next()); + } else if (!checkKind(TF, tok::semi)) { + return {}; + } + } + + return {}; +} + +static bool parseDestructor(TokenFilter &TF, FunctionDecl &F) { + auto Pos = TF.mark(); + + int Tildes = 0; + while 
(checkKind(TF, tok::tilde) || checkKind(TF, tok::identifier) || + checkKind(TF, tok::coloncolon)) { + Tildes += checkKind(TF, tok::tilde); + TF.next(); + } + if (Tildes != 1) + return false; + + if (!checkKind(TF, tok::l_paren)) + return false; + + TF.rewind(Pos); + + F.ReturnType = make_unique(); + + while (checkKind(TF, tok::tilde) || checkKind(TF, tok::identifier) || + checkKind(TF, tok::coloncolon)) { + if (checkKind(TF, tok::tilde)) + F.addNameQualifier(TF.next()); + else + F.ReturnType->addNameQualifier(TF.next()); + } + + return true; +} + +static bool isDeclSpecifier(tok::TokenKind K) { + switch (K) { + case tok::kw_friend: + // case tok::kw_constexpr: + // case tok::kw_const: + // case tok::kw_mutable: + case tok::kw_typedef: + // case tok::kw_register: + case tok::kw_static: + // case tok::kw_thread_local: + case tok::kw_extern: + case tok::kw_inline: + case tok::kw_virtual: + case tok::kw_explicit: + return true; + default: + return false; + } +} + +static std::unique_ptr +parseFunctionDecl(TokenFilter &TF, bool NameOptional = false) { + auto Guard = TF.guard(); + auto F = make_unique(); + + while (isDeclSpecifier(TF.peekKind())) + F->addDeclSpecifier(TF.next()); + + bool InDestructor = false; + + if (auto T = parseType(TF)) { + F->ReturnType = std::move(T); + } else if (NameOptional && parseDestructor(TF, *F)) { + InDestructor = true; + } else { + return {}; + } + + if (!InDestructor) { + if (!checkKind(TF, tok::identifier) && !checkKind(TF, tok::kw_operator)) { + if (!NameOptional) + return {}; + } else if (!parseQualifiedID(TF, *F, std::false_type{})) { + return {}; + } + } + + if (!checkKind(TF, tok::l_paren)) + return {}; + + F->setLeftBrace(TF.next()); + while (!checkKind(TF, tok::r_paren)) { + F->Params.push_back(parseVarDecl(TF, 0, true)); + if (!F->Params.back()) + return {}; + if (checkKind(TF, tok::comma)) + F->appendComma(TF.next()); + else + break; + } + if (!checkKind(TF, tok::r_paren)) + return {}; + + F->setRightBrace(TF.next()); + + // 
if (InConstructor && checkKind(TF, tok::colon)) { + // TODO: Don't skip initializer list and [[x]] and const + while (!TF.eof() && !checkKind(TF, tok::l_brace) && !checkKind(TF, tok::semi)) + TF.next(); + //} + + if (checkKind(TF, tok::semi)) + F->setSemi(TF.next()); + Guard.dismiss(); + return std::move(F); +} + +static std::unique_ptr skipUnparsable(TokenFilter &TF) { + assert(!TF.eof()); + auto UB = make_unique(); + while (!TF.eof()) { + auto Kind = TF.peekKind(); + UB->push_back(TF.next()); + if (Kind == tok::semi || Kind == tok::r_brace || Kind == tok::l_brace) + break; + } + return std::move(UB); +} + +static std::unique_ptr parseLabelStmt(TokenFilter &TF) { + auto Guard = TF.guard(); + if (!(checkKind(TF, tok::identifier) || checkKind(TF, tok::kw_private) || + checkKind(TF, tok::kw_protected) || checkKind(TF, tok::kw_public))) + return {}; + auto *LabelName = TF.next(); + if (!checkKind(TF, tok::colon)) + return {}; + Guard.dismiss(); + return make_unique(LabelName, TF.next()); +} + +static std::unique_ptr parseIncludeDirective(RawTokenFilter &TF) { + if (!checkKind(TF, tok::hash)) + return {}; + auto Guard = TF.guard(); + + auto *HashTok = TF.next(); + if (TF.peek()->Tok().getIdentifierInfo()->getPPKeywordID() != tok::pp_include) + return {}; + + auto Inc = make_unique(); + Inc->setHash(HashTok); + Inc->setInclude(TF.next()); + Inc->Path = make_unique(); + + while (!checkKind(TF, tok::eod)) { + Inc->Path->addToken(TF.next()); + } + Inc->setEOD(TF.next()); + return Inc; +} + +static std::unique_ptr parsePPIf(RawTokenFilter &TF) { + if (!checkKind(TF, tok::hash)) + return {}; + auto Guard = TF.guard(); + + auto *HashTok = TF.next(); + + if (TF.peek()->Tok().getIdentifierInfo()->getPPKeywordID() != tok::pp_else && + TF.peek()->Tok().getIdentifierInfo()->getPPKeywordID() != tok::pp_if && + TF.peek()->Tok().getIdentifierInfo()->getPPKeywordID() != tok::pp_elif && + TF.peek()->Tok().getIdentifierInfo()->getPPKeywordID() != tok::pp_endif) + return {}; + + auto If 
= make_unique(); + If->setHash(HashTok); + If->setKeyword(TF.next()); + + auto Start = TF.mark(); + + if (!checkKind(TF, tok::eod)) { + while (!checkKind(TF, tok::eod)) + TF.next(); + assert(checkKind(TF, tok::eod)); + + TokenFilter SubTF = TF.rangeAsTokenFilter(Start, TF.mark()); + + auto SubStart = SubTF.mark(); + std::unique_ptr Cond; + if ((Cond = parseExpr(SubTF)) && checkKind(TF, tok::eod)) + If->Cond = std::move(Cond); + else { + SubTF.rewind(SubStart); + auto UB = make_unique(); + while (!checkKind(SubTF, tok::eod)) + UB->push_back(SubTF.next()); + If->Cond = std::move(UB); + } + } + + assert(checkKind(TF, tok::eod)); + If->setEOD(TF.next()); + return If; +} + +static std::unique_ptr parsePPDirective(RawTokenFilter &TF) { + assert(checkKind(TF, tok::hash)); + if (auto I = parseIncludeDirective(TF)) + return std::move(I); + if (auto D = parsePPIf(TF)) + return std::move(D); + auto UP = make_unique(); + while (!checkKind(TF, tok::eod)) + UP->push_back(TF.next()); + return std::move(UP); +} + +static std::unique_ptr parseAny(TokenFilter &TF, + bool SkipUnparsable = true, + bool NameOptional = false); + +static bool parseScope(TokenFilter &TF, Scope &Sc, bool NameOptional = false) { + if (checkKind(TF, tok::r_brace)) + return true; + while (auto St = parseAny(TF, true, NameOptional)) { + Sc.addStmt(std::move(St)); + if (TF.eof()) + return false; + if (checkKind(TF, tok::r_brace)) + return true; + } + return checkKind(TF, tok::r_brace); +} + +static std::unique_ptr parseCompoundStmt(TokenFilter &TF) { + if (!checkKind(TF, tok::l_brace)) + return {}; + auto C = make_unique(); + C->setLeftBrace(TF.next()); + parseScope(TF, *C); + if (checkKind(TF, tok::r_brace)) + C->setRightBrace(TF.next()); + // else: just pass + return C; +} + +static std::unique_ptr parseControlFlowBody(TokenFilter &TF) { + return checkKind(TF, tok::l_brace) ? 
/// Parses the contents of a control-flow head (the part after the opening
/// parenthesis, which the caller has already consumed).
///
/// Alternatives are tried in this order:
///   1. for-loop initializers only: a declaration statement, leaving the
///      terminating semicolon unconsumed;
///   2. a single variable declaration that is directly followed by ')';
///   3. an expression;
///   4. fallback: the raw tokens, collected into an unparsable node.
///
/// NOTE(review): the return type's and the fallback node's template arguments
/// are missing in this text — confirm against the original source.
static std::unique_ptr parseCond(TokenFilter &TF,
                                 bool ForLoopInit = false) {
  if (ForLoopInit)
    if (auto D = parseDeclStmt(TF, /*WithSemi=*/false))
      return std::move(D);
  {
    // The guard rewinds the filter if the declaration is not followed by ')'.
    auto Guard = TF.guard();
    if (auto D = parseVarDecl(TF)) {
      if (checkKind(TF, tok::r_paren)) {
        Guard.dismiss();
        return std::move(D);
      }
    }
  }
  if (auto E = parseExpr(TF))
    return std::move(E);

  auto UB = make_unique();
  // Starts at 1 to account for the opening parenthesis consumed by the caller.
  int ParenOpen = 1;
  while (!TF.eof()) {
    if (checkKind(TF, tok::l_paren)) {
      ++ParenOpen;
    } else if (checkKind(TF, tok::r_paren)) {
      // The matching ')' ends the condition; it is left for the caller.
      if (--ParenOpen == 0) {
        return std::move(UB);
      }
    }

    // Braces and semicolons cannot be part of a condition; stop in front of
    // them instead of swallowing the following statement.
    if (checkKind(TF, tok::l_brace) || checkKind(TF, tok::r_brace) ||
        checkKind(TF, tok::semi))
      return std::move(UB);

    UB->push_back(TF.next());
  }
  return std::move(UB);
}
+ auto Body = parseControlFlowBody(TF); + + If->addBranch(KW1, KW2, LPar, std::move(Cond), RPar, std::move(Body)); + } + Guard.dismiss(); + return std::move(If); + } + + if (checkKind(TF, tok::kw_for)) { + auto S = make_unique(); + + S->setKeyword(TF.next()); + if (!checkKind(TF, tok::l_paren)) + return {}; + S->setLeftParen(TF.next()); + + if (!checkKind(TF, tok::semi) && + !(S->Init = parseCond(TF, /*ForLoopInit=*/true))) + return {}; + if (!checkKind(TF, tok::semi)) + return {}; + S->setSemi1(TF.next()); + if (!checkKind(TF, tok::semi) && !(S->Cond = parseCond(TF))) + return {}; + if (!checkKind(TF, tok::semi)) + return {}; + S->setSemi2(TF.next()); + if (!checkKind(TF, tok::r_paren) && !(S->Inc = parseExpr(TF))) + return {}; + + if (checkKind(TF, tok::r_paren)) + S->setRightParen(TF.next()); + + S->Body = parseControlFlowBody(TF); + + Guard.dismiss(); + return std::move(S); + } + + return {}; +} + +static bool parseClassScope(TokenFilter &TF, ClassDecl &C) { + if (!checkKind(TF, tok::l_brace)) + return false; + + C.setLeftBrace(TF.next()); + if (!parseScope(TF, C, true)) + return false; + + if (checkKind(TF, tok::r_brace)) + C.setRightBrace(TF.next()); + + if (checkKind(TF, tok::semi)) + C.setSemi(TF.next()); + // else: just pass + + return true; +} + +static std::unique_ptr parseNamespaceDecl(TokenFilter &TF) { + if (!checkKind(TF, tok::kw_namespace)) + return {}; + auto Guard = TF.guard(); + + AnnotatedToken *NSTok = TF.next(), *NameTok = nullptr; + if (checkKind(TF, tok::identifier)) + NameTok = TF.next(); + + if (!checkKind(TF, tok::l_brace)) + return {}; + + auto NS = make_unique(); + NS->setNamespace(NSTok); + NS->setName(NameTok); + NS->setLeftBrace(TF.next()); + + parseScope(TF, *NS); + + if (checkKind(TF, tok::r_brace)) + NS->setRightBrace(TF.next()); + + Guard.dismiss(); + return std::move(NS); +} + +static std::unique_ptr parseClassDecl(TokenFilter &TF) { + if (!(checkKind(TF, tok::kw_class) || checkKind(TF, tok::kw_struct) || + checkKind(TF, 
tok::kw_union) || checkKind(TF, tok::kw_enum))) + return {}; + + auto Guard = TF.guard(); + + auto C = make_unique(); + C->setClass(TF.next()); + + if (!(C->Name = parseType(TF))) + return {}; + + if (checkKind(TF, tok::colon)) { + C->setColon(TF.next()); + bool Skip = true; + for (;;) { + AnnotatedToken *Accessibility = nullptr; + if (checkKind(TF, tok::kw_private) || checkKind(TF, tok::kw_protected) || + checkKind(TF, tok::kw_public)) + Accessibility = TF.next(); + auto T = parseType(TF, false); + if (!T) + break; + if (checkKind(TF, tok::l_brace)) { + C->addBaseClass(Accessibility, std::move(T), nullptr); + Skip = false; + break; + } + if (!checkKind(TF, tok::comma)) + break; + C->addBaseClass(Accessibility, std::move(T), TF.next()); + } + if (Skip) { + while (!checkKind(TF, tok::l_brace)) + TF.next(); + } + } + + if (checkKind(TF, tok::semi)) + C->setSemi(TF.next()); + else + parseClassScope(TF, *C); + + Guard.dismiss(); + return C; +} + +static std::unique_ptr +parseTemplateParameterType(TokenFilter &TF) { + if (!(checkKind(TF, tok::kw_typename) || checkKind(TF, tok::kw_class))) + return {}; + auto Guard = TF.guard(); + + auto TPT = make_unique(); + TPT->setKeyword(TF.next()); + if (!checkKind(TF, tok::identifier)) + return {}; + TPT->setName(TF.next()); + + if (checkKind(TF, tok::equal)) { + TPT->setEqual(TF.next()); + if (!(TPT->DefaultType = parseType(TF))) + return {}; + } + + Guard.dismiss(); + return TPT; +} +static std::unique_ptr parseTemplateDecl(TokenFilter &TF) { + if (!checkKind(TF, tok::kw_template)) + return {}; + + auto Guard = TF.guard(); + auto T = make_unique(); + T->setKeyword(TF.next()); + + if (!checkKind(TF, tok::less)) + return {}; + T->setLess(TF.next()); + + while (!checkKind(TF, tok::greater)) { + if (auto D = parseVarDecl(TF, /*TypeName=*/0, /*NameOptional*/ false, + /*StopAtGreater=*/true)) + T->addParam(std::move(D)); + else if (auto TPT = parseTemplateParameterType(TF)) + T->addParam(std::move(TPT)); + else + return {}; + + if 
(checkKind(TF, tok::comma)) + T->addComma(TF.next()); + else if (!checkKind(TF, tok::greater)) + return {}; + } + + assert(checkKind(TF, tok::greater)); + T->setGreater(TF.next()); + + if (auto F = parseFunctionDecl(TF)) + T->Templated = std::move(F); + else if (auto C = parseClassDecl(TF)) + T->Templated = std::move(C); + else + return {}; + + Guard.dismiss(); + return T; +} + +static std::unique_ptr parseAny(TokenFilter &TF, bool SkipUnparsable, + bool NameOptional) { + if (auto S = parseDeclStmt(TF)) + return S; + if (auto S = parseReturnStmt(TF)) + return S; + if (auto S = parseLabelStmt(TF)) + return S; + if (auto S = parseControlFlowStmt(TF)) + return S; + if (auto S = parseTemplateDecl(TF)) + return std::move(S); + if (auto S = parseFunctionDecl(TF, NameOptional)) { + if (checkKind(TF, tok::semi)) + S->setSemi(TF.next()); + else if (checkKind(TF, tok::l_brace)) { + S->Body = parseCompoundStmt(TF); + } + return std::move(S); + } + if (auto S = parseNamespaceDecl(TF)) + return S; + + if (auto S = parseClassDecl(TF)) { + if (checkKind(TF, tok::semi)) + S->setSemi(TF.next()); + else if (checkKind(TF, tok::l_brace)) { + parseClassScope(TF, *S); + } + return std::move(S); + } + { + auto Guard = TF.guard(); + if (auto E = parseExpr(TF)) { + if (checkKind(TF, tok::semi)) { + Guard.dismiss(); + return make_unique(std::move(E), TF.next()); + } + } + } + return SkipUnparsable ? 
skipUnparsable(TF) : std::unique_ptr(); +} + +TranslationUnit fuzzyparse(AnnotatedToken *first, AnnotatedToken *last) { + TranslationUnit TU; + { + BasicTokenFilter TF(first, last); + while (!TF.eof()) { + if (TF.peekKind() == tok::hash && TF.peek()->Tok().isAtStartOfLine()) + TU.addPPDirective(parsePPDirective(TF)); + TF.next(); + } + } + { + TokenFilter TF(first, last); + while (!TF.eof()) + TU.addStmt(parseAny(TF)); + } + return TU; +} + +} // end namespace fuzzy +} // end namespace clang Index: clang-highlight/FuzzyType.h =================================================================== --- /dev/null +++ clang-highlight/FuzzyType.h @@ -0,0 +1,39 @@ +#error NOT YET NEEDED +//===--- FuzzyType.h - clang-highlight --------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_FUZZY_TYPE_H +#define LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_FUZZY_TYPE_H + +#include "clang/Basic/SourceManager.h" +#include "AnnotatedToken.h" +#include + +using namespace clang; + +namespace clang { +namespace fuzzy { + +struct Type { + struct TypeAnnotation { + enum AnnotationClass { + Pointer, + Reference, + }; + AnnotationClass Class; + AnnotatedToken *Tok; + }; + llvm::SmallVector Annotations; + AnnotatedToken *NameToken; +}; + +} // end namespace fuzzy +} // end namespace clang + +#endif // LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_FUZZY_TYPE_H Index: clang-highlight/OutputWriter.h =================================================================== --- /dev/null +++ clang-highlight/OutputWriter.h @@ -0,0 +1,56 @@ +//===--- OutputWriter.h - clang-highlight -----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_OUTPUT_WRITER_H +#define LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_OUTPUT_WRITER_H + +#include "llvm/ADT/StringRef.h" +#include + +namespace clang { +namespace highlight { + +enum class OutputFormat { + StdoutColored, + HTML, + SemanticHTML, + LaTeX, + // TODO: XML +}; + +enum class TokenClass { + NONE, + Type, + Variable, + Function, + Namespace, + Keyword, + Comment, + Preprocessor, + String, + Char, + Numeric, + Other, + Whitespace, +}; + +class OutputWriter { +public: + virtual void writeToken(llvm::StringRef Text, TokenClass Class) = 0; + virtual ~OutputWriter(); +}; + +/// \brief Creates an output writer that writes in the specified Format to OS. +std::unique_ptr makeOutputWriter(OutputFormat Format, + llvm::raw_ostream &OS); + +} // end namespace highlight +} // end namespace clang + +#endif // LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_OUTPUT_WRITER_H Index: clang-highlight/OutputWriter.cpp =================================================================== --- /dev/null +++ clang-highlight/OutputWriter.cpp @@ -0,0 +1,290 @@ +//===--- OutputWriter.cpp - clang-highlight ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file OutputWriter.cpp +/// \brief Converts the metadata into a given output format. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "OutputWriter.h" + +using namespace llvm; + +namespace clang { +namespace highlight { + +OutputWriter::~OutputWriter() {} + +namespace { +struct StdoutFormatInfo { + StdoutFormatInfo(raw_ostream::Colors Color, bool Bold = false) + : Color(Color), Bold(Bold) {} + raw_ostream::Colors Color; + bool Bold; +}; +} // end anonymous namespace + +static StdoutFormatInfo getFormatInfo(TokenClass Class) { + switch (Class) { + case TokenClass::Type: + return { raw_ostream::GREEN }; + case TokenClass::Keyword: + return { raw_ostream::BLUE }; + case TokenClass::Comment: + return { raw_ostream::RED }; + case TokenClass::Namespace: + return { raw_ostream::GREEN }; + case TokenClass::Preprocessor: + return { raw_ostream::CYAN }; + case TokenClass::String: + case TokenClass::Char: + return { raw_ostream::MAGENTA }; + case TokenClass::Numeric: + return { raw_ostream::BLUE, true }; + case TokenClass::Function: + return { raw_ostream::BLACK, true }; + default: + return { raw_ostream::BLACK }; + } +} + +static const char *getSpanStyle(TokenClass Class) { + switch (Class) { + case TokenClass::Namespace: + case TokenClass::Type: + return "color:green"; + case TokenClass::Keyword: + return "color:blue"; + case TokenClass::Comment: + return "color:darkred"; + case TokenClass::Preprocessor: + return "color:purple"; + case TokenClass::String: + return "color:red"; + case TokenClass::Char: + return "color:magenta"; + case TokenClass::Numeric: + return "color:DarkSlateGray"; + case TokenClass::Function: + return "color:black;font-style:italic"; + default: + return "color:black"; + } +} + +static const char *getClassName(TokenClass Class) { + switch (Class) { + case TokenClass::Namespace: + return "namespace"; + case TokenClass::Type: + return "type"; + case TokenClass::Keyword: + return "keyword"; + case 
TokenClass::Comment: + return "comment"; + case TokenClass::Preprocessor: + return "preprocessor"; + case TokenClass::String: + return "string"; + case TokenClass::Char: + return "char"; + case TokenClass::Function: + return "function"; + case TokenClass::Numeric: + return "numeric"; + case TokenClass::Variable: + return "variable"; + default: + return "default"; + } +} + +namespace { +class XmlEscaper { + StringRef S; + +public: + XmlEscaper(StringRef S) : S(S) {}; + + friend raw_ostream &operator<<(raw_ostream &OS, const XmlEscaper &HE) { + for (char C : HE.S) + switch (C) { + case '&': + OS << "&"; + break; + case '\'': + OS << "'"; + break; + case '"': + OS << """; + break; + case '<': + OS << "<"; + break; + case '>': + OS << ">"; + break; + default: + OS << C; + break; + } + return OS; + } +}; +} // end anonymous namespace + +XmlEscaper xmlEscaped(StringRef S) { return XmlEscaper(S); } + +namespace { +class ColorStreamWriter : public OutputWriter { + raw_ostream &OS; + +public: + ColorStreamWriter(raw_ostream &OS) : OS(OS) { + OS.changeColor(raw_ostream::BLACK); + } + ~ColorStreamWriter() { OS.changeColor(raw_ostream::BLACK); } + + void writeToken(StringRef Text, TokenClass Class) override { + StdoutFormatInfo Style = getFormatInfo(Class); + OS.changeColor(Style.Color, Style.Bold); + OS << Text; + } +}; +} // end anonymous namespace + +namespace { +class HtmlWriter : public OutputWriter { + raw_ostream &OS; + +public: + HtmlWriter(raw_ostream &OS) : OS(OS) { + OS << "

"; + } + ~HtmlWriter() { OS << "

"; } + + void writeToken(StringRef Text, TokenClass Class) override { + OS << R"()" + << xmlEscaped(Text) << ""; + } +}; +} // end anonymous namespace + +namespace { +class SemanticHtmlWriter : public OutputWriter { + raw_ostream &OS; + +public: + SemanticHtmlWriter(raw_ostream &OS) : OS(OS) { + OS << R"( +

)"; + } + ~SemanticHtmlWriter() { OS << "

"; } + + void writeToken(StringRef Text, TokenClass Class) override { + OS << R"()" + << xmlEscaped(Text) << ""; + } +}; +} // end anonymous namespace + +namespace { +class LaTeXEscaper { + StringRef S; + +public: + LaTeXEscaper(StringRef S) : S(S) {}; + + friend raw_ostream &operator<<(raw_ostream &OS, const LaTeXEscaper &HE) { + for (char C : HE.S) + switch (C) { + case '{': + case '}': + case '_': + case '&': + case '#': + case '%': + case '$': + OS << "{\\" << C << "}"; + break; + case '^': + OS << "{\\^{}}"; + break; + case '\\': + OS << "{\\textbackslash}"; + break; + case '<': + OS << "{\\textless}"; + break; + case '>': + OS << "{\\textgreater}"; + break; + case '~': + OS << "{\\textasciitilde}"; + break; + default: + OS << C; + } + return OS; + } +}; +} // end anonymous namespace + +LaTeXEscaper latexEscaped(StringRef S) { return LaTeXEscaper(S); } + +namespace { +class LaTeXWriter : public OutputWriter { + raw_ostream &OS; + +public: + LaTeXWriter(raw_ostream &OS) : OS(OS) {} + ~LaTeXWriter() {} + + void writeToken(StringRef Text, TokenClass Class) override { + if (Class == TokenClass::Whitespace) + OS << latexEscaped(Text); + else + OS << "\\clangHighlightToken{" << getClassName(Class) << "}{" + << latexEscaped(Text) << "}"; + } +}; +} // end anonymous namespace + +std::unique_ptr makeOutputWriter(OutputFormat Format, + raw_ostream &OS) { + switch (Format) { + case OutputFormat::StdoutColored: + return std::unique_ptr(new ColorStreamWriter(OS)); + case OutputFormat::HTML: + return std::unique_ptr(new HtmlWriter(OS)); + case OutputFormat::SemanticHTML: + return std::unique_ptr(new SemanticHtmlWriter(OS)); + case OutputFormat::LaTeX: + return std::unique_ptr(new LaTeXWriter(OS)); + default: + llvm_unreachable("invalid flag"); + } +} + +} // end namespace highlight +} // end namespace clang Index: clang-highlight/TokenClassifier.h =================================================================== --- /dev/null +++ clang-highlight/TokenClassifier.h @@ -0,0 
+1,32 @@ +//===--- TokenClassifier.h - clang-highlight --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_TOKEN_CLASSIFIER_H +#define LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_TOKEN_CLASSIFIER_H + +#include "llvm/ADT/StringRef.h" +#include + +namespace llvm { +class MemoryBuffer; +} + +namespace clang { +namespace highlight { + +class OutputWriter; + +void highlight(std::unique_ptr Source, + llvm::StringRef FileName, std::unique_ptr OW, + bool IdentifiersOnly = false, bool DumpAST = false); + +} // end namespace highlight +} // end namespace clang + +#endif // LLVM_CLANG_TOOLS_CLANG_HIGHLIGHT_TOKEN_CLASSIFIER_H Index: clang-highlight/TokenClassifier.cpp =================================================================== --- /dev/null +++ clang-highlight/TokenClassifier.cpp @@ -0,0 +1,196 @@ +//===--- TokenClassifier.cpp - clang-highlight ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#include "llvm/Support/Debug.h" +#include "clang/Lex/Lexer.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceManager.h" +#include "llvm/Config/config.h" +#include "OutputWriter.h" +#include +#include "TokenClassifier.h" +#include "Fuzzy/FuzzyAST.h" + +using namespace clang; + +namespace clang { +namespace highlight { + +LangOptions getFormattingLangOpts(bool Cpp03 = false) { + LangOptions LangOpts; + LangOpts.CPlusPlus = 1; + LangOpts.CPlusPlus11 = Cpp03 ? 
0 : 1; + LangOpts.CPlusPlus1y = Cpp03 ? 0 : 1; + LangOpts.LineComment = 1; + LangOpts.Bool = 1; + LangOpts.ObjC1 = 1; + LangOpts.ObjC2 = 1; + return LangOpts; +} + +bool isCharLiteral(tok::TokenKind TK) { + switch (TK) { + case tok::char_constant: + case tok::wide_char_constant: + case tok::utf16_char_constant: + case tok::utf32_char_constant: + return true; + default: + return false; + } +} + +bool isKeyword(tok::TokenKind TK) { + switch (TK) { +#define KEYWORD(X, Y) case tok::kw_##X: +#include "clang/Basic/TokenKinds.def" + return true; + default: + return false; + } +} + +TokenClass convertTokenKindToTokenClass(tok::TokenKind TK) { + if (isCharLiteral(TK)) + return TokenClass::Char; + if (isStringLiteral(TK)) + return TokenClass::String; + if (TK == tok::numeric_constant) + return TokenClass::Numeric; + if (isKeyword(TK)) + return TokenClass::Keyword; + if (TK == tok::annot_typename) + return TokenClass::Type; + if (TK == tok::comment) + return TokenClass::Comment; + if (TK == tok::unknown || TK == tok::eod) + return TokenClass::Whitespace; + return TokenClass::Other; +} + +void highlight(std::unique_ptr Source, StringRef FileName, + std::unique_ptr OW, bool IdentifiersOnly, + bool DumpAST) { + using namespace llvm; + using namespace clang; + + FileManager Files((FileSystemOptions())); + DiagnosticsEngine Diagnostics( + IntrusiveRefCntPtr(new DiagnosticIDs), + new DiagnosticOptions); + SourceManager SourceMgr(Diagnostics, Files); + llvm::MemoryBuffer *Buf = Source.release(); // SourceMgr owns Buf for us + const clang::FileEntry *Entry = + Files.getVirtualFile(FileName, Buf->getBufferSize(), 0); + SourceMgr.overrideFileContents(Entry, Buf); + FileID ID = + SourceMgr.createFileID(Entry, SourceLocation(), clang::SrcMgr::C_User); + + auto Langs = getFormattingLangOpts(); + Lexer Lex(ID, SourceMgr.getBuffer(ID), SourceMgr, Langs); + Lex.SetKeepWhitespaceMode(true); + + IdentifierTable IdentTable(getFormattingLangOpts()); + + std::vector AllTokens; + + for (;;) { + 
Token TmpTok; + Lex.LexFromRawLexer(TmpTok); + + if (TmpTok.getKind() == tok::hash && TmpTok.isAtStartOfLine()) + Lex.setParsingPreprocessorDirective(true); + if (TmpTok.getKind() == tok::eod) + Lex.setParsingPreprocessorDirective(false); + + AllTokens.push_back(fuzzy::AnnotatedToken(TmpTok)); + Token &ThisTok = AllTokens.back().Tok(); + + StringRef TokenText(SourceMgr.getCharacterData(ThisTok.getLocation()), + ThisTok.getLength()); + + if (ThisTok.is(tok::raw_identifier)) { + IdentifierInfo &Info = IdentTable.get(TokenText); + ThisTok.setIdentifierInfo(&Info); + ThisTok.setKind(Info.getTokenID()); + } + + if (ThisTok.is(tok::eof)) + break; + } + + auto TU = fuzzy::fuzzyparse(&*AllTokens.begin(), &*AllTokens.end()); + + if (DumpAST) { + fuzzy::printAST(llvm::dbgs(), TU, SourceMgr); + return; + } + + const char *LastTokenStart = nullptr, *ThisTokenStart = nullptr; + Token LastTok; + TokenClass Class = TokenClass::NONE; + for (auto &ATok : AllTokens) { + Token &ThisTok = ATok.Tok(); + + ThisTokenStart = SourceMgr.getCharacterData(ThisTok.getLocation()); + if (LastTokenStart) { + if (Class == TokenClass::NONE || LastTok.getKind() == tok::eod) + Class = convertTokenKindToTokenClass(LastTok.getKind()); + OW->writeToken(StringRef(LastTokenStart, ThisTokenStart - LastTokenStart), + Class); + } + + Class = TokenClass::NONE; + + StringRef TokenText(SourceMgr.getCharacterData(ThisTok.getLocation()), + ThisTok.getLength()); + + if (ATok.hasASTReference()) { + auto *R = ATok.getASTReference(); + if (llvm::isa(R) && + ATok.getTokenKind() == tok::identifier) { + Class = TokenClass::Namespace; + } + auto isType = [&] { + return llvm::isa(R) || + llvm::isa(R); + }; + auto isTypeDecl = [&] { + return ATok.getTokenKind() == tok::identifier && + (llvm::isa(R) || + llvm::isa(R)); + }; + if ((!IdentifiersOnly || ATok.getTokenKind() == tok::identifier) && + (isType() || isTypeDecl())) { + Class = TokenClass::Type; + ThisTok.setKind(tok::annot_typename); + } + if (isa(R)) { + Class = 
TokenClass::String; + } + if (isa(R)) { + Class = TokenClass::Preprocessor; + } + if (isa(R)) { + Class = TokenClass::Variable; + } + if (ATok.getTokenKind() == tok::identifier && + (isa(R) || isa(R))) { + Class = TokenClass::Function; + } + } + LastTok = ThisTok; + LastTokenStart = ThisTokenStart; + } +} + +} // end namespace highlight +} // end namespace clang Index: clang-highlight/latex/clanghighlight.sty =================================================================== --- /dev/null +++ clang-highlight/latex/clanghighlight.sty @@ -0,0 +1,68 @@ +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{clanghighlight}[2014/07/30 v0.1 clang-highlight package for LaTeX.] +\usepackage{fancyvrb} +\usepackage{xcolor} +\usepackage{ifplatform} + +\ifwindows + \providecommand\DeleteFile[1]{\immediate\write18{del #1}} +\else + \providecommand\DeleteFile[1]{\immediate\write18{rm #1}} +\fi + +\newcommand\ch@style@namespace{\color{teal}} +\newcommand\ch@style@type{\color[HTML]{228B22}} +\newcommand\ch@style@keyword{\color{violet}} +\newcommand\ch@style@comment{\color[HTML]{800000}\itshape} +\newcommand\ch@style@preprocessor{\color[HTML]{483D8B}} +\newcommand\ch@style@string{\color[HTML]{DE2E2E}} +\newcommand\ch@style@char{\color{purple}} +\newcommand\ch@style@function{\color[HTML]{000080}} +\newcommand\ch@style@numeric{\color[HTML]{707070}} +\newcommand\ch@style@variable{\color{black}} +\newcommand\ch@style@default{\color{black}} + +\newcommand\clangHighlightToken[2]{{\expandafter\csname ch@style@#1\endcsname{}#2}} + +\providecommand\ch@clanghighlight[1]{clang-highlight #1} +\newcommand\clanghighlightCmd[1]{\renewcommand\ch@clanghighlight[1]{#1 ##1}} +\clanghighlightCmd{clang-highlight} + +\def\ch@fvopts{} +\newcommand\cxxset[1]{\def\ch@fvopts{#1}} + +\begingroup +\catcode`\^^M\active% +\global\def\activeeol{^^M}% +\endgroup + +\def\cxx@[#1]{\def\ch@fvoptsarg{#1} + \VerbatimEnvironment\begin{VerbatimOut}[codes={\catcode`\^^I=12}]{\jobname.cc}} 
+\def\cxx@noargs#1{\edef\temp{[]\activeeol\string#1}\expandafter\cxx@\temp} + +\newenvironment{cxx}% +{\@ifnextchar[\cxx@\cxx@noargs}% +{\end{VerbatimOut}% +\inputcxx[\ch@fvoptsarg]{\jobname.cc}% +\DeleteFile{\jobname.cc}% +} + +\newcommand\inputcxx[2][]{% +\protected@xdef\ch@cmd{\ch@clanghighlight{-latex #2 -o \jobname.ch}} +\IfFileExists{\jobname.ch}{\DeleteFile{\jobname.ch}}{} +\immediate\write18{\ch@cmd} +\IfFileExists{\jobname.ch}{% + \edef\ch@fvoptsall{\ch@fvopts,#1}% + \expandafter\VerbatimInput\expandafter[\ch@fvoptsall,commandchars=\\\{\}]{\jobname.ch}% + \DeleteFile{\jobname.ch}}% +{\PackageError{clanghighlight}{Error executing `\ch@cmd'.}{Make sure% + clang-highlight is properly installed or doesn't crash with the given input.}% +}% +} + +\AtEndOfPackage{ + \ifnum\pdf@shellescape=1\relax\else + \PackageError{clanghighlight} + {You must invoke LaTeX with the -shell-escape flag} + {Pass the -shell-escape flag to LaTeX.}\fi +} Index: clang-highlight/latex/sample.tex =================================================================== --- /dev/null +++ clang-highlight/latex/sample.tex @@ -0,0 +1,49 @@ +\documentclass{article} +\usepackage[T1]{fontenc} +\usepackage{upquote} +\usepackage{minted} +\usepackage{listings} +\usepackage{clanghighlight} + +% Note: You might need change the path to clang-highlight +% \clanghighlightCmd{/path/to/clang-highlight} + +\newcommand\pkg[1]{\textsf{#1}} + +\begin{document} + +Sample code highlighted by the command line tool \verb|clang-highlight| and the +\LaTeX\ package \pkg{clanghighlight}: +\begin{cxx}[numbers=left] +#include +template T make(); +int main() /* block */ { // comment + const T& x = make("string", 'c'); +} +\end{cxx} +% instead of inline, one can use \inputcxx[]{file.cpp} +% Also, the options are optional. 
\begin{cxx} works as does \inputcxx{file} + +Same code using \textsf{Pygments} and the package \pkg{minted}: +\begin{minted}[linenos=true]{c++} +#include +template T make(); +int main() /* block */ { // comment + const T& x = make("string", 'c'); +} +\end{minted} +Note that \pkg{minted} has some limitations, because \textsf{Pygments} doesn't +output semantic \LaTeX. The single quote isn't straight (can be fixed through +a hack though) and the color schemes can't be modified from within \LaTeX. + +And with \pkg{listings}, the pure \LaTeX\ solution: + +\begin{lstlisting}[language=c++,numbers=left] +#include +template T make(); +int main() /* block */ { // comment + const T& x = make("string", 'c'); +} +\end{lstlisting} + +\end{document} Index: clang-highlight/unittests/CMakeLists.txt =================================================================== --- /dev/null +++ clang-highlight/unittests/CMakeLists.txt @@ -0,0 +1,15 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + +include_directories("..") + +add_highlight_unittest(FuzzyParseTests + FuzzyParseTest.cpp + ) + +target_link_libraries(FuzzyParseTests + clangAST + clangTooling + clangFuzzy + ) Index: clang-highlight/unittests/FuzzyParseTest.cpp =================================================================== --- /dev/null +++ clang-highlight/unittests/FuzzyParseTest.cpp @@ -0,0 +1,498 @@ +//===- unittests/FuzzyParseTest.cpp - fuzzy parsing unit tests ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "clang/Lex/Lexer.h" +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceManager.h" +#include "llvm/Config/config.h" +#include "gtest/gtest.h" +#include "Fuzzy/FuzzyAST.h" + +#define DEBUG_TYPE "highlight-test" + +using namespace llvm; + +namespace clang { +namespace fuzzy { + +class ClassOfTester { + bool (*FunPtr)(const ASTElement *); + +public: + ClassOfTester(bool (*FunPtr)(const ASTElement *)) : FunPtr(FunPtr) {} + bool verify(const ASTElement *AE) { return FunPtr(AE); } +}; +template ClassOfTester makeClassOfTester() { + return ClassOfTester(&T::classof); +} + +template SmallVector checkTypeSeq() { + ClassOfTester Seq[] = { makeClassOfTester()... }; + SmallVector Ret(Seq, Seq + sizeof...(T)); + return Ret; +} + +LangOptions getFormattingLangOpts(bool Cpp03 = false) { + LangOptions LangOpts; + LangOpts.CPlusPlus = 1; + LangOpts.CPlusPlus11 = Cpp03 ? 0 : 1; + LangOpts.CPlusPlus1y = Cpp03 ? 
0 : 1; + LangOpts.LineComment = 1; + LangOpts.Bool = 1; + LangOpts.ObjC1 = 1; + LangOpts.ObjC2 = 1; + return LangOpts; +} + +class FuzzyParseTest : public ::testing::Test { +protected: + struct ParseResult { + TranslationUnit TU; + std::vector Tokens; + static constexpr const char *FileName = ""; + FileManager Files; + DiagnosticsEngine Diagnostics; + SourceManager SourceMgr; + FileID ID; + Lexer Lex; + IdentifierTable IdentTable; + + ParseResult(StringRef Code) + : Files((FileSystemOptions())), + Diagnostics(IntrusiveRefCntPtr(new DiagnosticIDs), + new DiagnosticOptions), + SourceMgr(Diagnostics, Files), + ID(SourceMgr.createFileID( + MemoryBuffer::getMemBuffer(Code, FileName))), + Lex(ID, SourceMgr.getBuffer(ID), SourceMgr, getFormattingLangOpts()), + IdentTable(getFormattingLangOpts()) { + Lex.SetKeepWhitespaceMode(true); + + for (;;) { + Token TmpTok; + Lex.LexFromRawLexer(TmpTok); + + if (TmpTok.getKind() == tok::hash && TmpTok.isAtStartOfLine()) + Lex.setParsingPreprocessorDirective(true); + if (TmpTok.getKind() == tok::eod) + Lex.setParsingPreprocessorDirective(false); + + Tokens.push_back(fuzzy::AnnotatedToken(TmpTok)); + Token &ThisTok = Tokens.back().Tok(); + + StringRef TokenText(SourceMgr.getCharacterData(ThisTok.getLocation()), + ThisTok.getLength()); + + if (ThisTok.is(tok::raw_identifier)) { + IdentifierInfo &Info = IdentTable.get(TokenText); + ThisTok.setIdentifierInfo(&Info); + ThisTok.setKind(Info.getTokenID()); + } + + if (ThisTok.is(tok::eof)) + break; + } + + TU = fuzzy::fuzzyparse(&*Tokens.begin(), &*Tokens.end()); + } + }; + + void checkParse(StringRef Code, + SmallVector TokenTypes) { + ParseResult Parsed(Code); + auto &AllTokens = Parsed.Tokens; + + size_t NonWhitespaceTokens = 0; + for (auto &Tok : AllTokens) + if (Tok.getTokenKind() != tok::comment && + Tok.getTokenKind() != tok::unknown && Tok.getTokenKind() != tok::eof) + ++NonWhitespaceTokens; + + EXPECT_EQ(NonWhitespaceTokens, TokenTypes.size()); + for (size_t I = 0, J = 0; I < 
TokenTypes.size(); ++I, ++J) { + while (AllTokens[J].getTokenKind() == tok::comment || + AllTokens[J].getTokenKind() == tok::unknown || + AllTokens[J].getTokenKind() == tok::eof) + ++J; + if (!TokenTypes[I].verify(AllTokens[J].getASTReference())) { + dbgs() << "Parsed " << Code << " into:\n"; + for (auto &S : Parsed.TU.children()) + printAST(dbgs(), S, Parsed.SourceMgr); + dbgs() << "I=" << I << ", J=" << J << '\n'; + EXPECT_TRUE(TokenTypes[I].verify(AllTokens[J].getASTReference())); + } + } + } + + void checkUnparsable(StringRef Code) { + ParseResult Parsed(Code); + for (auto &Tok : Parsed.Tokens) + if (Tok.getTokenKind() != tok::comment && + Tok.getTokenKind() != tok::unknown && Tok.getTokenKind() != tok::eof) + EXPECT_TRUE(isa(Tok.getASTReference())); + } + void checkUnparsable(std::initializer_list Codes) { + for (const char *C : Codes) + checkUnparsable(C); + } + + void dump(ParseResult &Parsed, StringRef Code) { + dbgs() << Code << '\n'; + + dbgs() << "Parsed " << Code << " into:\n"; + for (auto &S : Parsed.TU.children()) + printAST(dbgs(), S, Parsed.SourceMgr); + } + + template void checkToplevel(StringRef Code) { + ParseResult Parsed(Code); + if (Parsed.TU.children().size() != 1 || + !isa(Parsed.TU.Body[0].get())) { + dump(Parsed, Code); + } + EXPECT_EQ(Parsed.TU.children().size(), size_t(1)); + EXPECT_TRUE(isa(Parsed.TU.Body[0].get())); + } + + template + void checkToplevel(std::initializer_list Codes) { + for (const char *C : Codes) + checkToplevel(C); + } + + template void checkFirstPPOn(StringRef Code, F &&f) { + ParseResult Parsed(Code); + if (Parsed.TU.PPDirectives.size() == 0) { + dump(Parsed, Code); + EXPECT_TRUE(Parsed.TU.PPDirectives.size() > 0); + return; + } + if (!f(*Parsed.TU.PPDirectives[0], false)) { + dump(Parsed, Code); + EXPECT_TRUE(f(*Parsed.TU.PPDirectives[0], true)); + } + } + + template void checkFirstOn(StringRef Code, F &&f) { + ParseResult Parsed(Code); + if (Parsed.TU.children().size() == 0) { + dump(Parsed, Code); + 
EXPECT_TRUE(Parsed.TU.children().size() > 0); + return; + } + if (!f(*Parsed.TU.Body[0], false)) { + dump(Parsed, Code); + EXPECT_TRUE(f(*Parsed.TU.Body[0], true)); + } + } + + template void checkFirst(StringRef Code) { + checkFirstOn(Code, [&](const Stmt &S, bool Abort) { + if (Abort) + EXPECT_TRUE(isa(S)); + else + return isa(S); + return true; + }); + } + template + void checkFirst(std::initializer_list Codes) { + for (const char *C : Codes) + checkFirst(C); + } + + template void checkFirstPP(StringRef Code) { + checkFirstPPOn(Code, [&](const PPDirective &P, bool Abort) { + if (Abort) + EXPECT_TRUE(isa(P)); + else + return isa(P); + return true; + }); + } + template + void checkFirstPP(std::initializer_list Codes) { + for (const char *C : Codes) + checkFirstPP(C); + } +}; + +TEST_F(FuzzyParseTest, DeclStmtTest) { + checkParse("int i;", checkTypeSeq()); + checkParse("int i=5;", checkTypeSeq()); + checkParse("int i=5,j;", + checkTypeSeq()); + checkParse( + "int i=5,j=i;", + checkTypeSeq()); + checkParse( + "int i,j,k,l,m,n,o,p;", + checkTypeSeq()); + + checkParse("int *p;", + checkTypeSeq()); + checkParse("type &p;", + checkTypeSeq()); + + checkParse( + "int* p,* /*comment*/ ** * * q;", + checkTypeSeq()); + + checkParse( + "a b=c,*d=e,********f=****g**h;", + checkTypeSeq()); + + checkToplevel({ "a b;", // + "a b=c(d,e);", // + "a b=c(d,e,*g),*h=*i;", + // + "int a;", // + "unsigned long long int a;", // + "signed char a;", // + "double a;" }); + + checkParse("register const volatile constexpr int i;", + checkTypeSeq()); + + checkUnparsable({ "int 1=2;", // + "1 + !(unparsable!!!);" }); +} + +TEST_F(FuzzyParseTest, ExprLineStmtTest) { + checkToplevel({ "a*b*c;", // + "a*b*c=d;", // + "a*b*c==d;", // + "f();", // + "f(a,b,c);", // + "f(1,2,3);", // + "f(1)*g;", // + "n::f(1)*g;", // + "a+b;", // + "a-b;", // + "a*b*c;", // + "a/b;", // + "a&b&c;", // + "a^b;", // + "a|b;", // + "a<>b;", // + "ab;", // + "~a;", // + "!a;", // + "-a;", // + "--a;", // + "++a;", // + 
"++++~~~+~!~++++++!--++++++a;", // + "\"string literal\";", // + "nullptr;", // + "this;", // + "true;", // + "false;", // + "-1;", // + "(1+-1)*(3+5);" }); + checkUnparsable({ "1(a,b);", // + "f(", // + "f(," }); +} + +/* Qualified and templated names: declarations and calls that use '::' qualification and template argument lists are passed to checkToplevel (one of them additionally to checkParse); "t<1> 2>();" is passed to checkUnparsable. NOTE(review): the check* helpers are defined outside this excerpt; their exact contract should be confirmed there. */ TEST_F(FuzzyParseTest, QualifiedIDs) { + checkToplevel( + { "std::vector v;", // + "::std::vector v1;", // + "std::vector v2;", // + "std::vector v3;", // + "std::vector<> v4;", // + "std::vector<1> v5;", // + "std::tr1::stl::vector<> v6;", // + "::vector<> v7;", // + "::std::tr1::stl::vector, ::std::pair > v8;", + "n::n::n::n::n::a,g > > > g;", + "a::b ***e=f::g<1>*h::i<2,j>(::k::l);", + "auto x = std::make_unique(0);" }); + + checkParse("auto x = std::make_unique(0);", + checkTypeSeq()); + checkToplevel({ "n::f(a::b());", // + "n::f(a::b<2*3>());", // + "t<1+b>();", // + "t< 1<<2 >();", // + "t< (1>2) >();" }); + checkUnparsable("t<1> 2>();"); +} + +/* Function declarations and definitions (default arguments, static functions, declarations split over several string literals, empty and non-empty bodies); every entry of Tests is passed to checkFirst. */ TEST_F(FuzzyParseTest, FunctionDeclStmt) { + const char *Tests[] = { + "void f(int,int);", // + "void g(int i=0);", // + "static std::unique_ptr parseVarDecl(TokenFilter &TF," + " Type *TypeName = 0," + " bool NameOptional = false);", + "void dismiss() { TF = nullptr; }", // + "type func1();", "type func2() { 1+1; }", // + "type func3(type a) { 1+1; }", "type func4(type a, type b) { 1+1; }", + "static type func5();", + "static std::unique_ptr parseExpression(TokenFilter &TF," + " int Precedence," + " bool StopAtGreater);", + "static bool checkKind(TokenFilter &TF, tok::TokenKind Kind){}", + }; + for (const char *Code : Tests) + checkFirst(Code); +} + +/* return with a literal, with an expression and without an operand is passed to checkToplevel; "return return;" is passed to checkUnparsable. */ TEST_F(FuzzyParseTest, ReturnStmt) { + checkToplevel({ "return 1;", // + "return a*b;", // + "return;" }); + checkUnparsable("return return;"); +} + +/* struct/union/class declarations are passed to checkFirst, including a class body that only holds the stray tokens "><". checkFirstIsFunctionDecl wraps checkFirstOn with a predicate over the statement it receives: the statement must pass the outer isa check, its Body must hold exactly one element, and that element must pass the inner isa check. When Abort is set the predicate reports through EXPECT_*; otherwise it returns false at the first mismatch. */ TEST_F(FuzzyParseTest, StructDecl) { + checkFirst({ "struct C;", // + "union C;", // + "class C{};", // + "class C{ >< };" }); + + auto checkFirstIsFunctionDecl = [&](StringRef Code) { + checkFirstOn(Code, [](const Stmt &S, bool Abort) { + if (Abort) + EXPECT_TRUE(isa(S)); + else if (!isa(S)) + return false;
+ const auto &CD = cast(S); + if (Abort) + EXPECT_EQ(CD.Body.size(), (size_t)1); + else if (CD.Body.size() != 1) + return false; + + if (Abort) + EXPECT_TRUE(isa(*CD.Body.front())); + else if (!isa(*CD.Body.front())) + return false; + + return true; + }); + }; + + checkFirstIsFunctionDecl("struct C { C(){} };"); + checkFirstIsFunctionDecl("struct C { ~C(){} };"); + checkFirstIsFunctionDecl("struct C { virtual void f() override =0; };"); + checkFirstIsFunctionDecl( + "struct C { static constexpr bool g() { return true; } };"); + checkFirstIsFunctionDecl("struct C { C()=default; };"); + checkFirstIsFunctionDecl("struct C { bool operator<(int o); };"); + checkFirstIsFunctionDecl( + "struct C { friend C operator==(C lhs, C rhs)=default; };"); +} + +/* if statements: plain conditions, declarations used as conditions and else-if chains are passed to checkFirst; a statement that starts with "else if" is passed to checkUnparsable. */ TEST_F(FuzzyParseTest, IfStmt) { + const char *Tests[] = { + "if (true) {}", // + "if (0) do_sth();", // + "if (int i=0) {}", // + "if (int i=0) {} else do_sth_else();", + "if (int*i=0) {} else if (false) {} else do_sth_else();", + "if (int*i=0) {} else if (ns::t<4> x=4) {} else do_sth_else();", + "if (int*i=0) {} else if (ns::t<4> x=4) {} else do_sth_else();", + "if (1){}else if(1){}else if(1){}else if(1){}else if(1){}else " + "if(1){}else{}", + }; + for (const char *Code : Tests) + checkFirst(Code); + + checkUnparsable("else if (1);"); +} + +/* Malformed if headers (empty condition, missing or surplus parenthesis) are passed to checkFirst, not to checkUnparsable. */ TEST_F(FuzzyParseTest, IfStmtFuzzy) { + checkFirst({ "if () {}", // + "if (true {}", // + "if (false)) {}", // + "if ();", }); +} + +/* while loops with an expression or a declaration as condition are passed to checkFirst. */ TEST_F(FuzzyParseTest, WhileStmt) { + checkFirst({ "while (true) {}", // + "while (0) do_sth();", // + "while (int i=0) {}", // + }); +} + +/* for loops with empty and filled init, condition and increment clauses are passed to checkFirst. */ TEST_F(FuzzyParseTest, ForStmt) { + checkFirst({ "for (;;) {}", // + "for (;;);", // + "for (int i=0;;) {}", // + "for (T x=0,y=3;;) {}", // + "for (T x,y,z;;) {}", // + "for (int i=0;int j=0;) {}", // + "for (int i=0;i<10;i=i+1) {}", // + "for (;int j;);", // + "for (;;i=i+1) {}", // + }); +} + +/* Template declarations of functions and structs; each entry of Tests is passed to checkFirst. */ TEST_F(FuzzyParseTest, TemplateDecl) { + const char *Tests[] = { + "template void f();", + "template void f();", +
"template void f();", + "template void f();", + //"template void f();", + "template void f() {}", + "template struct C;", + "template struct C;", + "template void f();", + "template void f();", + "template void f();", + }; + for (const char *Code : Tests) + checkFirst(Code); +} + +TEST_F(FuzzyParseTest, PPIf) { + checkFirstPP({ "#if 1", // + "#else", // + "#elif 1", // + "# if unparsable!", // + "#else EXPR", // + "#elif 1&1+1*3+f(3)", }); +} + +TEST_F(FuzzyParseTest, PPInclude) { + checkFirstPP({ "#include ", // + "#include \"header.h\"", // + "#include \"\"", // + "#include <>", // + "# /*comment*/ include ", + // " /*comment*/ # /*comment*/ include ", + "# include \"fancy/path!/???.h_\"", }); +} + +} // end namespace fuzzy +} // end namespace clang Index: docs/LibFuzzy.rst =================================================================== --- /dev/null +++ docs/LibFuzzy.rst @@ -0,0 +1,231 @@ +======== +LibFuzzy +======== + +LibFuzzy is a library for heuristically parsing C++ based on Clang's Lexer. +The fuzzy parser is fault-tolerant, works without knowledge of the build system +and on incomplete source files. As the parser necessarily makes guesses, the +resulting syntax tree may be partially wrong. + +This document describes the LibFuzzy design and interface. + +When to use LibFuzzy +-------------------- + +Use LibFuzzy when you ...: + +* need fault-tolerant AST information +* need classification of tokens, but not more +* don't want setup overhead for your tool +* want fast results from a small input + +Do not use LibFuzzy when you ...: + +* need 100% accuracy +* need the context information that a full Clang AST provides + +Look at the different options for +`Tooling <http://clang.llvm.org/docs/Tooling.html>`_ if you are interested in +non-fuzzy approaches. + +The Fuzzy AST +------------- + +The fuzzy AST is defined in ``Fuzzy/FuzzyAST.h``.
It is designed to be as +similar as possible to the +`Clang AST <http://clang.llvm.org/docs/IntroductionToTheClangAST.html>`_, but +differs because of some design decisions: + +* Each AST node contains references to all tokens that belong to it. This + implies that by visiting all nodes of the AST of a particular source code, you + find all the tokens lexed from that code. + + This has led to some hierarchy changes. E.g. ``Expr`` isn't derived from + ``Stmt`` because as a statement ``Expr`` needs a trailing semicolon, but + otherwise it doesn't. Therefore ``ExprLineStmt`` exists to make an ``Expr`` + into a ``Stmt`` and keep track of the semicolon. + +* After parsing, each token of the input stream has a reference to the AST node + that contains it. + + That's why a common base class for all AST nodes exists: ``ASTElement``. The + Clang AST doesn't have that. + +* The fuzzy parser doesn't go much deeper than classification of tokens. + + There's no canonicalization of qualified identifiers. Types don't contain a + reference to the type definition and can't be compared. + +How to use the Fuzzy AST +------------------------ + +The main entry point of the fuzzy parser is ``fuzzyparse``, which takes a range of +``AnnotatedToken`` as input. + +.. code-block:: c++ + + TranslationUnit fuzzyparse(AnnotatedToken *first, AnnotatedToken *last); + +``AnnotatedToken`` is a Clang Lexer token combined with a reference to where +in the fuzzy AST it is located. + +.. code-block:: c++ + + class AnnotatedToken { + clang::Token Tok_; + ASTElement *Annot; + ... + }; + +The Clang Tokens can be obtained by the Clang Lexer in raw mode. The source +code of :program:`clang-highlight` contains sample usage. + +Current state +------------- + +The fuzzy parser can be tested with :program:`clang-highlight` and the +``-dump-ast`` option. + +..
code-block:: bash + + $ cat sample01.cpp + if () { + f(1+1); + } + $ clang-highlight -dump-ast sample01.cpp + If + Condition + Unparsable Block: + Body: + CompoundStmt: + ExprLineStmt + call expr 'f' + 1 + plus + 1 + +The parser recognizes the if statement but is unable to parse the condition. +Every unparsable range of source code is put into a ``UnparsableBlock`` which +itself is a subclass of ``ASTElement``. The fuzzy parser is successfully able +to recover from this error. + +C++ does not have a context-free grammar. If in doubt, a fuzzy parser has to +make guesses which may or may not be right. + +.. code-block:: bash + + $ cat sample02.cpp + auto ps = std::make_unique<std::string>(); + std::array<int, 5> a; + const int SIZE=5; + std::array<int, SIZE> b; + $ clang-highlight -dump-ast sample02.cpp + DeclStmt + VarDecl 'ps' + Type 'auto' + Assignment Type '=' + call expr 'std::make_unique + < + Type 'std::string' + >' + DeclStmt + VarDecl 'a' + Type 'std::array + < + Type 'int' + 5 + >' + DeclStmt + VarDecl 'SIZE' + Type 'constint' + Assignment Type '=' + 5 + DeclStmt + VarDecl 'b' + Type 'std::array + < + Type 'int' + Type 'SIZE' + >' + +There are a number of guesses that need to be made in this code. Most +importantly: + +* Is ``std::make_unique`` a function or a type? +* Is ``std::string`` a constant or a type? +* Is ``SIZE`` a constant or a type? + +The first two questions cannot be decided without further context. The current +strategy is simple: If something looks like a function call, then it's a +function and not a constructor. If a template argument could be either a type or a +constant, then it's treated as a type. + +This strategy may be wrong. Given that ``SIZE`` is declared inside this code +snippet, it's safe to assume that ``SIZE`` is a constant. However, the +fuzzy parser currently does not use context information from the part it +has already parsed. + +..
code-block:: bash + + $ cat sample03.cpp + #if __cplusplus <= 199711L // C++03 or older + std::tr1::auto_ptr<int> p; + #else // C++11 + std::unique_ptr<int> p; + #endif + $ clang-highlight -dump-ast sample03.cpp + Preprocessor 'if': + DeclRefExpr '__cplusplus' + lessequal + 199711L + Preprocessor 'else': + Preprocessor 'endif': + DeclStmt + VarDecl 'p' + Type 'std::tr1::auto_ptr + < + Type 'int' + >' + DeclStmt + VarDecl 'p' + Type 'std::unique_ptr + < + Type 'int' + >' + +This illustrates why the Clang Parser isn't easily usable for highlighting even +if the code is perfectly fine. There is no good solution to parse all +preprocessor branches. If a program depends, say, on 10 macros (``__linux__``, +``__cplusplus``, ``sizeof int``, etc.) then there are 2^10=1024 compilation +passes needed to get all possible results -- which may even lead to different +ASTs in the same places. If a compiler ignores the conditions, the code may +contain syntax errors. The easiest solution would be to make only one pass and +gray the unused code paths out. + +The fuzzy parser parses all preprocessor statements in one pass and the code +without them in another. Because of its fuzziness, this should go reasonably well. + +What next +--------- + +* Add all syntax elements of C++: Currently, only the most used subset of C++ + is implemented. + +* Improve the fuzziness. Add more sophisticated algorithms to handle unbalanced + parentheses for example. + +* Use context information. The parser could make use of a symbol table based on + the code it has seen already. + +* Language support for C and Objective-C: Even though these languages share a + lot of their syntax with C++, they have subtle differences. It shouldn't be + hard to add those to the parser. + +* Optimize for speed: Add a memory manager for the AST and improve the parser. + There hasn't been much focus on speed yet. + +* Conversion between Clang's AST and the fuzzy AST.
If there is a way to + produce a Clang AST, why not make use of it for tools that use the Fuzzy AST? Index: docs/clang-highlight.rst =================================================================== --- /dev/null +++ docs/clang-highlight.rst @@ -0,0 +1,114 @@ +=============== +Clang-Highlight +=============== + +:program:`clang-highlight` is a syntax highlighting tool for C++ based on the +:doc:`LibFuzzy` framework. + +Using clang-highlight +===================== + +:program:`clang-highlight` accepts exactly one argument, the file you want to +highlight. + +.. code-block:: bash + + $ clang-highlight test.cpp + +Or if you want to see it through a pager: + +.. code-block:: bash + + $ clang-highlight test.cpp | less -R + +Pass ``-`` as file name if you want to highlight the standard input. + +Output formats +-------------- + +.. option:: -stdout + + The default output format. Uses console colors. + +.. option:: -html + + Writes HTML as output with hardcoded colors. The individual tokens have the + form ``int``. + +.. option:: -shtml + + Writes semantic HTML with CSS selectors. The individual tokens have the + form ``int``. The class can be specified in + a separate style sheet by the user. + +.. option:: -latex + + Writes semantic LaTeX for use with the package that is bundled with + clang-highlight. See below. + +Further options +--------------- + +.. option:: -identifiers-only + + By default, the star ``*`` in ``type *i;`` is classified as part of the type + name as is ``<`` and ``>`` in ``unique_ptr<int>``. To disable this feature, + use the ``-identifiers-only`` option. + +.. option:: -dump-ast + + Only included for testing the fuzzy parser, will be removed later. + +.. option:: -o <file> + + Output to a file instead of standard output. + +The LaTeX Package ``clanghighlight`` +------------------------------------ + +:program:`clang-highlight` can be used to highlight C++ code in LaTeX documents.
The +file ``clanghighlight.sty`` that is included in this repository provides a +package for easy usage. Just put it in the same directory as the ``.tex`` file +you are writing. + +.. code-block:: latex + + \usepackage{clanghighlight} % put this into the preamble + + % You might need to specify the full path to clang-highlight + % \clanghighlightCmd{/path/to/clang-highlight} + + % in the document: + \begin{cxx} + // your code goes here + \end{cxx} + + \begin{cxx}[numbers=left] % the options are directly passed to fancyvrb + // your code goes here + \end{cxx} + + \inputcxx{file.cpp} % use code from a file + +This package is only in beta status and some more functionality might be added +soon. + +Comparison to other highlighters +-------------------------------- + +Other highlighters exist, but mostly use regular expressions and are therefore +limited by design. See :doc:`LibFuzzy` for how :program:`clang-highlight` +parses C++. + +* `Pygments <http://pygments.org/>`_: "Generic syntax highlighter for general use" + written in Python. Lexers are Python classes. The current C++ lexer uses + regular expressions and only highlights preprocessor directives and keywords. + +* `GNU Source-Highlight <http://www.gnu.org/software/src-highlite/>`_: Generic + highlighter available for many languages. Types, keywords, functions etc. can + be defined by a regular expression in a configuration file. In C++, only + keywords, symbols and functions (without templates) are highlighted. In + particular, there is no code to highlight types other than the built-in ones. + +* Highlighters built into editors (:program:`emacs`, :program:`vim`, etc.): Mostly + regex-based. Tightly coupled into the editor, not intended for use on the + command line.