diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h @@ -0,0 +1,147 @@ +//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The pseudoparser tries to match a token stream to the C++ grammar. +// Preprocessor #defines and other directives are not part of this grammar, and +// should be removed before the file can be parsed. +// +// Conditional blocks like #if...#else...#endif are particularly tricky, as +// simply stripping the directives may not produce a grammatical result: +// +// return +// #ifndef DEBUG +// 1 +// #else +// 0 +// #endif +// ; +// +// This header supports analyzing and removing the directives in a source file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H +#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H + +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include + +namespace clang { +class LangOptions; +namespace syntax { +namespace pseudo { + +/// Describes the structure of a source file, as seen by the preprocessor. +/// +/// The structure is a tree, whose leaves are plain source code and directives, +/// and whose internal nodes are #if...#endif sections. 
+/// +/// (root) +/// |-+ Directive #include +/// |-+ Code int main() { +/// | ` printf("hello, "); +/// |-+ Conditional -+ Directive #ifndef NDEBUG +/// | |-+ Code printf("debug\n"); +/// | |-+ Directive #else +/// | |-+ Code printf("production\n"); +/// | `-+ Directive #endif +/// |-+ Code return 0; +/// ` } +/// +/// Unlike the clang preprocessor, we model the full tree explicitly. +/// This class does not recognize macro usage, only directives. +struct PPStructure { + /// A range of code containing no directives. + struct Code { + Token::Range Tokens; + }; + /// A preprocessor directive. + struct Directive { + /// Raw tokens making up the directive, starting with `#`. + Token::Range Tokens; + clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword; + }; + /// A preprocessor conditional section. + /// + /// This starts with an #if, #ifdef, #ifndef etc directive. + /// It covers all #else branches, and spans until the matching #endif. + struct Conditional { + /// The sequence of directives that introduce top-level alternative parses. + /// + /// The first branch will have an #if type directive. + /// Subsequent branches will have #else type directives. + std::vector> Branches; + /// The directive terminating the conditional, should be #endif. + Directive End; + }; + + /// Some piece of the file. {One of Code, Directive, Conditional}. + class Chunk; // Defined below. + std::vector Chunks; + + /// Extract preprocessor structure by examining the raw tokens. + static PPStructure parse(const TokenStream &); + + // FIXME: add heuristic selection of conditional branches. 
+ // FIXME: allow deriving a preprocessed stream +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Chunk &); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Code &); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, + const PPStructure::Directive &); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, + const PPStructure::Conditional &); + +// FIXME: This approximates std::variant. +// Switch once we can use C++17. +class PPStructure::Chunk { +public: + enum Kind { K_Empty, K_Code, K_Directive, K_Conditional }; + Kind kind() const { + return CodeVariant ? K_Code + : DirectiveVariant ? K_Directive + : K_Conditional; + } + + Chunk() = delete; + Chunk(const Chunk &) = delete; + Chunk(Chunk &&) = default; + Chunk &operator=(const Chunk &) = delete; + Chunk &operator=(Chunk &&) = default; + ~Chunk() = default; + + // T => Chunk constructor. + Chunk(Code C) : CodeVariant(std::move(C)) {} + Chunk(Directive C) : DirectiveVariant(std::move(C)) {} + Chunk(Conditional C) : ConditionalVariant(std::move(C)) {} + + // Chunk => T& and const T& conversions. +#define CONVERSION(CONST, V) \ + explicit operator CONST V &() CONST { return *V##Variant; } + CONVERSION(const, Code); + CONVERSION(, Code); + CONVERSION(const, Directive); + CONVERSION(, Directive); + CONVERSION(const, Conditional); + CONVERSION(, Conditional); +#undef CONVERSION + +private: + // Wasteful, a union variant would be better! 
+ llvm::Optional CodeVariant; + llvm::Optional DirectiveVariant; + llvm::Optional ConditionalVariant; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Token.h b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h @@ -0,0 +1,192 @@ +//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Tokens are the first level of abstraction above bytes used in pseudoparsing. +// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor). +// The tokens are wrapped into pseudo::Token, along with line/indent info. +// +// Unlike clang, we make multiple passes over the whole file, out-of-order. +// Therefore we retain the whole token sequence in memory. (This is feasible as +// we process one file at a time). pseudo::TokenStream holds such a stream. +// The initial stream holds the raw tokens read from the file, later passes +// operate on derived TokenStreams (e.g. with directives stripped). +// +// Similar facilities from clang that are *not* used: +// - SourceManager: designed around multiple files and precise macro expansion. +// - clang::Token: coupled to SourceManager, doesn't retain layout info. +// (pseudo::Token is similar, but without SourceLocations). +// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros. +// (pseudo::TokenStream is similar, but a flat token list). 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H +#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H + +#include "clang/Basic/LLVM.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include + +namespace clang { +class LangOptions; +namespace syntax { +namespace pseudo { + +/// A single C++ or preprocessor token. +/// +/// Unlike clang::Token and syntax::Token, these tokens are not connected to a +/// SourceManager - we are not dealing with multiple files. +struct Token { + /// An Index identifies a token within a stream. + using Index = uint32_t; + /// A sentinel Index indicating no token. + constexpr static Index Invalid = std::numeric_limits::max(); + struct Range; + + /// The token text. + /// + /// Typically from the original source file, but may have been synthesized. + StringRef text() const { return StringRef(Data, Length); } + const char *Data; + uint32_t Length; + + /// Zero-based line number. + uint32_t Line = 0; + /// Width of whitespace before the first token on this line. + uint8_t Indent = 0; + /// Flags have some meaning defined by the function that produced this stream. + uint8_t Flags = 0; + // Helpers to get/set Flags based on `enum class`. + template bool flag(T Mask) const { + return Flags & uint8_t{static_cast>(Mask)}; + } + template void setFlag(T Mask) { + Flags |= uint8_t{static_cast>(Mask)}; + } + + /// The type of token as determined by clang's lexer. + clang::tok::TokenKind Kind = clang::tok::unknown; +}; +static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!"); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); + +/// A half-open range of tokens within a stream. 
+struct Token::Range { + Token::Index Begin = 0; + Token::Index End = 0; + + uint32_t size() const { return End - Begin; } + static Range empty(unsigned Index) { return Range{Index, Index}; } +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &); + +/// A complete sequence of Tokens representing a source file. +/// +/// This may match a raw file from disk, or be derived from a previous stream. +/// For example, stripping comments from a TokenStream results in a new stream. +/// +/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes: +/// int main ( ) ; +/// eof kw_int ident l_paren r_paren semi eof +/// front() back() +/// 0 1 2 3 4 5 +class TokenStream { +public: + /// Create an empty stream. + /// + /// Initially, the stream is appendable and not finalized. + /// The token sequence may only be accessed after finalize() is called. + /// + /// Payload is an opaque object which will be owned by the stream. + /// e.g. an allocator to hold backing storage for synthesized token text. + explicit TokenStream(std::shared_ptr Payload = nullptr); + + /// Append a token to the stream, which must not be finalized. + void push(Token T) { + assert(!isFinalized()); + Storage.push_back(std::move(T)); + } + + /// Finalize the token stream, allowing tokens to be accessed. + /// Tokens may no longer be appended. + void finalize(); + bool isFinalized() const; + + /// Returns the index of T within the stream. + /// + /// T must be within the stream or the end sentinel (not the start sentinel). + Token::Index index(const Token &T) const { + assert(&T != Storage.data() && "start sentinel"); + assert(&T >= Storage.data() && &T < Storage.data() + Storage.size()); + return &T - Tokens.data(); + } + + ArrayRef tokens() const { + assert(isFinalized()); + return Tokens; + } + ArrayRef tokens(Token::Range R) const { + return tokens().slice(R.Begin, R.End - R.Begin); + } + + /// May return the end sentinel if the stream is empty. 
+ Token &front() { return Storage[1]; } + const Token &front() const { return Storage[1]; } + + /// Print the tokens in this stream to the output stream. + /// + /// The presence of newlines/spaces is preserved, but not the quantity. + void print(llvm::raw_ostream &) const; + +private: + std::shared_ptr Payload; + + MutableArrayRef Tokens; + std::vector Storage; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &); + +/// Extracts a raw token stream from the source code. +/// +/// All tokens will reference the data of the provided string. +/// +/// Tokens containing trigraphs, escaped newlines, UCNs etc will have text() that +/// reflects this, and will have NeedsCleaning set. +/// Tokens at the start of an unescaped newline (where a directive may start) +/// will have StartsPPLine. +/// "word-like" tokens such as identifiers and keywords will be raw_identifier. +TokenStream lex(const std::string &, const clang::LangOptions &); +enum class LexFlags : uint8_t { + StartsPPLine = 1 << 0, // Token begins a line, where a directive may start. Dumps as flags=1. + NeedsCleaning = 1 << 1, // Token text contains trigraphs, escaped newlines or UCNs. +}; + +/// Derives a token stream by decoding escapes and interpreting raw_identifiers. +/// +/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and +/// their backing data is owned by the returned stream. +/// raw_identifier tokens are assigned specific types (identifier, keyword etc). +/// +/// The StartsPPLine flag is preserved. +/// +/// Formally the identifier correctly happens before preprocessing, while we +/// should only cook raw_identifiers that survive preprocessing. +/// However, ignoring the Token::Kind of tokens in directives achieves the same. +/// (And having cooked token kinds in PP-disabled sections is useful for us). 
+TokenStream cook(const TokenStream &, const clang::LangOptions &); + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -3,7 +3,10 @@ add_clang_library(clangToolingSyntaxPseudo Grammar.cpp GrammarBNF.cpp - + Lex.cpp + Preprocess.cpp + Token.cpp + LINK_LIBS clangBasic clangLex diff --git a/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp @@ -0,0 +1,114 @@ +//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/LiteralSupport.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) { + clang::SourceLocation Start; + // Tokenize using clang's lexer in raw mode. + // std::string guarantees null-termination, which the lexer needs. 
+ clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(), + Code.data() + Code.size()); + Lexer.SetCommentRetentionState(true); + + TokenStream Result; + clang::Token CT; + unsigned LastOffset = 0; + unsigned Line = 0; + unsigned Indent = 0; + for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof; + Lexer.LexFromRawLexer(CT)) { + unsigned Offset = + CT.getLocation().getRawEncoding() - Start.getRawEncoding(); + + Token Tok; + Tok.Data = &Code[Offset]; + Tok.Length = CT.getLength(); + Tok.Kind = CT.getKind(); + + // Update current line number and indentation from raw source code. + unsigned NewLineStart = 0; + for (unsigned i = LastOffset; i < Offset; ++i) { + if (Code[i] == '\n') { + NewLineStart = i + 1; + ++Line; + } + } + // Indentation isn't always well defined when lines are continued. + if ((NewLineStart || !LastOffset) && CT.isAtStartOfLine()) { + Indent = 0; + for (char c : StringRef(Code).slice(NewLineStart, Offset)) { + if (c == ' ') + ++Indent; + else if (c == '\t') + Indent += 8; + else + break; + } + } + Tok.Indent = Indent; + Tok.Line = Line; + + if (CT.isAtStartOfLine()) + Tok.setFlag(LexFlags::StartsPPLine); + if (CT.needsCleaning() || CT.hasUCN()) + Tok.setFlag(LexFlags::NeedsCleaning); + + Result.push(Tok); + LastOffset = Offset; + } + Result.finalize(); + return Result; +} + +TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) { + auto CleanedStorage = std::make_shared(); + clang::IdentifierTable Identifiers(LangOpts); + TokenStream Result(CleanedStorage); + + for (auto Tok : Code.tokens()) { + if (Tok.flag(LexFlags::NeedsCleaning)) { + // Remove escaped newlines and trigraphs. + llvm::SmallString<64> CleanBuffer; + const char *Pos = Tok.text().begin(); + while (Pos < Tok.text().end()) { + unsigned CharSize; + CleanBuffer.push_back( + clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts)); + Pos += CharSize; + } + // Remove universal character names (UCN). 
+ llvm::SmallString<64> UCNBuffer; + clang::expandUCNs(UCNBuffer, CleanBuffer); + + llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage); + Tok.Data = Text.data(); + Tok.Length = Text.size(); + Tok.Flags &= ~static_cast(LexFlags::NeedsCleaning); + } + // Cook raw_identifiers into identifier, keyword, etc. + if (Tok.Kind == tok::raw_identifier) + Tok.Kind = Identifiers.get(Tok.text()).getTokenID(); + Result.push(std::move(Tok)); + } + + Result.finalize(); + return Result; +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp @@ -0,0 +1,211 @@ +//===--- Preprocess.cpp - Preprocess token streams ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Preprocess.h" +#include "clang/Basic/IdentifierTable.h" +#include "llvm/Support/FormatVariadic.h" + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +class Parser { +public: + explicit Parser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {} + void parse(PPStructure *result) { parse(result, /*TopLevel=*/true); } + +private: + // Roles that a directive might take within a conditional block. 
+ enum class Cond { None, If, Else, End }; + static Cond classifyDirective(tok::PPKeywordKind kind) { + switch (kind) { + case clang::tok::pp_if: + case clang::tok::pp_ifdef: + case clang::tok::pp_ifndef: + return Cond::If; + case clang::tok::pp_elif: + case clang::tok::pp_elifdef: + case clang::tok::pp_elifndef: + case clang::tok::pp_else: + return Cond::Else; + case clang::tok::pp_endif: + return Cond::End; + default: + return Cond::None; + } + } + + // Parses tokens starting at Tok into PP. + // If we reach an #end or #else directive that ends PP, returns it. + // If TopLevel is true, then we do not expect #end and always return None. + llvm::Optional parse(PPStructure *PP, bool TopLevel) { + auto StartsDirective = + [&, AllowDirectiveAt((const Token *)nullptr)]() mutable { + if (Tok->flag(LexFlags::StartsPPLine)) { + // If we considered a comment at the start of a PP-line, it doesn't + // start a directive but the directive can still start after it. + if (Tok->Kind == tok::comment) + AllowDirectiveAt = Tok + 1; + return Tok->Kind == tok::hash; + } + return Tok->Kind == tok::hash && AllowDirectiveAt == Tok; + }; + while (Tok->Kind != tok::eof) { + while (StartsDirective()) { + PPStructure::Directive Directive; + parseDirective(&Directive); + Cond Kind = classifyDirective(Directive.Kind); + if (Kind == Cond::If) { + PPStructure::Conditional Conditional; + Conditional.Branches.emplace_back(); + Conditional.Branches.back().first = std::move(Directive); + parseConditional(&Conditional); + PP->Chunks.push_back(std::move(Conditional)); + continue; + } + // Unexpected #else or #endif at top level; parse as normal directives. 
+ if (Kind == Cond::None || TopLevel) { + PP->Chunks.push_back(std::move(Directive)); + continue; + } + assert(Kind == Cond::Else || Kind == Cond::End); + return std::move(Directive); + } + const Token *Start = Tok; + while (Tok->Kind != tok::eof && !StartsDirective()) + ++Tok; + if (Tok != Start) + PP->Chunks.push_back(PPStructure::Code{ + Token::Range{Code.index(*Start), Code.index(*Tok)}}); + } + return None; + } + + // Parse the rest of a conditional section, after seeing the #if directive. + // Returns after consuming the #end directive. + void parseConditional(PPStructure::Conditional *C) { + assert(C->Branches.size() == 1 && + C->Branches.front().second.Chunks.empty() && + "Should be ready to parse first branch body"); + while (Tok->Kind != tok::eof) { + auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false); + if (!Terminator) { + assert(Tok->Kind == tok::eof && "gave up parsing before eof?"); + C->End.Tokens = Token::Range::empty(Code.index(*Tok)); + return; + } + if (classifyDirective(Terminator->Kind) == Cond::End) { + C->End = std::move(*Terminator); + return; + } + assert(classifyDirective(Terminator->Kind) == Cond::Else && + "ended branch unexpectedly"); + C->Branches.emplace_back(); + C->Branches.back().first = std::move(*Terminator); + } + } + + // Parse a directive. Tok is the hash. + void parseDirective(PPStructure::Directive *D) { + assert(Tok->Kind == tok::hash); + + // Directive spans from the hash until the end of line or file. + const Token *Begin = Tok++; + while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine)) + ++Tok; + ArrayRef Tokens{Begin, Tok}; + D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())}; + + // Directive name is the first non-comment token after the hash. 
+ Tokens = Tokens.drop_front().drop_while( + [](const Token &T) { return T.Kind == tok::comment; }); + if (!Tokens.empty()) + D->Kind = Idents.get(Tokens.front().text()).getPPKeywordID(); + } + + const TokenStream &Code; + const Token *Tok; + clang::IdentifierTable Idents; +}; + +} // namespace + +PPStructure PPStructure::parse(const TokenStream &Code) { + PPStructure Result; + Parser(Code).parse(&Result); + return Result; +} + +static llvm::StringLiteral ppKeywordName(tok::PPKeywordKind kind) { + switch (kind) { +#define PPKEYWORD(x) \ + case tok::pp_##x: \ + return #x; +#include "clang/Basic/TokenKinds.def" + default: + return "unknown"; + } +} + +static void dump(llvm::raw_ostream &OS, const PPStructure &, unsigned Indent); +static void dump(llvm::raw_ostream &OS, const PPStructure::Directive &Directive, + unsigned Indent) { + OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n", + ppKeywordName(Directive.Kind), + Directive.Tokens.size()); +} +static void dump(llvm::raw_ostream &OS, const PPStructure::Code &Code, + unsigned Indent) { + OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size()); +} +static void dump(llvm::raw_ostream &OS, + const PPStructure::Conditional &Conditional, unsigned Indent) { + for (const auto &Branch : Conditional.Branches) { + dump(OS, Branch.first, Indent); + dump(OS, Branch.second, Indent + 2); + } + dump(OS, Conditional.End, Indent); +} + +static void dump(llvm::raw_ostream &OS, const PPStructure::Chunk &Chunk, + unsigned Indent) { + switch (Chunk.kind()) { + case PPStructure::Chunk::K_Empty: + llvm_unreachable("invalid chunk"); + case PPStructure::Chunk::K_Code: + return dump(OS, (const PPStructure::Code &)Chunk, Indent); + case PPStructure::Chunk::K_Directive: + return dump(OS, (const PPStructure::Directive &)Chunk, Indent); + case PPStructure::Chunk::K_Conditional: + return dump(OS, (const PPStructure::Conditional &)Chunk, Indent); + } +} + +static void dump(llvm::raw_ostream &OS, const PPStructure &PP, + 
unsigned Indent) { + for (const auto &Chunk : PP.Chunks) + dump(OS, Chunk, Indent); +} + +// Define operator<< in terms of dump() functions above. +#define OSTREAM_DUMP(Type) \ + llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) { \ + dump(OS, T, 0); \ + return OS; \ + } +OSTREAM_DUMP(PPStructure) +OSTREAM_DUMP(PPStructure::Chunk) +OSTREAM_DUMP(PPStructure::Directive) +OSTREAM_DUMP(PPStructure::Conditional) +OSTREAM_DUMP(PPStructure::Code) +#undef OSTREAM_DUMP + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/Token.cpp b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp @@ -0,0 +1,98 @@ +//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) { + OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line, + T.Indent); + OS << '"'; + llvm::printEscapedString(T.text(), OS); + OS << '"'; + if (T.Flags) + OS << llvm::format(" flags=%x", T.Flags); + return OS; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) { + OS << "Index Kind Line Text\n"; + for (const auto &T : TS.tokens()) { + OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(T), + clang::tok::getTokenName(T.Kind), T.Line, T.Indent); + OS << '"'; + llvm::printEscapedString(T.text(), OS); + OS << '"'; + if 
(T.Flags) + OS << llvm::format(" flags=%x", T.Flags); + OS << '\n'; + } + return OS; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) { + OS << llvm::formatv("[{0},{1})", R.Begin, R.End); + return OS; +} + +TokenStream::TokenStream(std::shared_ptr Payload) + : Payload(std::move(Payload)) { + Storage.emplace_back(); + Storage.back().Kind = clang::tok::eof; +} + +void TokenStream::finalize() { + assert(!isFinalized()); + unsigned LastLine = Storage.back().Line; + Storage.emplace_back(); + Storage.back().Kind = tok::eof; + Storage.back().Line = LastLine + 1; + + Tokens = Storage; + Tokens = Tokens.drop_front().drop_back(); +} + +bool TokenStream::isFinalized() const { + assert(!Storage.empty() && Storage.front().Kind == tok::eof); + if (Storage.size() == 1) + return false; + return Storage.back().Kind == tok::eof; +} + +void TokenStream::print(llvm::raw_ostream &OS) const { + bool FirstToken = true; + unsigned LastLine = -1; + StringRef LastText; + for (const auto &T : tokens()) { + StringRef Text = T.text(); + if (FirstToken) { + FirstToken = false; + } else if (T.Line == LastLine) { + if (LastText.data() + LastText.size() != Text.data()) + OS << ' '; + } else { + OS << '\n'; + OS.indent(T.Indent); + } + OS << Text; + LastLine = T.Line; + LastText = Text; + } + if (!FirstToken) + OS << '\n'; +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/test/Syntax/Inputs/example.c b/clang/test/Syntax/Inputs/example.c new file mode 100644 --- /dev/null +++ b/clang/test/Syntax/Inputs/example.c @@ -0,0 +1,7 @@ +int is_debug() { +#ifndef NDEBUG + return 1; // in debug mode +#else + return 0; +#endif +} diff --git a/clang/test/Syntax/lex.test b/clang/test/Syntax/lex.test new file mode 100644 --- /dev/null +++ b/clang/test/Syntax/lex.test @@ -0,0 +1,38 @@ +// RUN: clang-pseudo -source %S/Inputs/example.c -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace + SOURCE: int is_debug() { 
+SOURCE-NEXT: #ifndef NDEBUG +SOURCE-NEXT: return 1; // in debug mode +SOURCE-NEXT: #else +SOURCE-NEXT: return 0; +SOURCE-NEXT: #end +SOURCE-NEXT: } +// RUN: clang-pseudo -source %S/Inputs/example.c -print-tokens | FileCheck %s -check-prefix=TOKEN +TOKEN: 0: raw_identifier 0:0 "int" flags=1 +TOKEN-NEXT: raw_identifier 0:0 "is_debug" +TOKEN-NEXT: l_paren 0:0 "(" +TOKEN-NEXT: r_paren 0:0 ")" +TOKEN-NEXT: l_brace 0:0 "{" +TOKEN-NEXT: hash 1:0 "#" flags=1 +TOKEN-NEXT: raw_identifier 1:0 "ifndef" +TOKEN-NEXT: raw_identifier 1:0 "NDEBUG" +TOKEN-NEXT: raw_identifier 2:2 "return" flags=1 +TOKEN-NEXT: numeric_constant 2:2 "1" +TOKEN-NEXT: semi 2:2 ";" +TOKEN-NEXT: comment 2:2 "// in debug mode" +TOKEN-NEXT: hash 3:0 "#" flags=1 +TOKEN-NEXT: raw_identifier 3:0 "else" +TOKEN-NEXT: raw_identifier 4:2 "return" flags=1 +TOKEN-NEXT: numeric_constant 4:2 "0" +TOKEN-NEXT: semi 4:2 ";" +TOKEN-NEXT: hash 5:0 "#" flags=1 +TOKEN-NEXT: raw_identifier 5:0 "endif" +TOKEN-NEXT: r_brace 6:0 "}" flags=1 +// RUN: clang-pseudo -source %S/Inputs/example.c -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace + PPS: code (5 tokens) +PPS-NEXT: #ifndef (3 tokens) +PPS-NEXT: code (4 tokens) +PPS-NEXT: #else (2 tokens) +PPS-NEXT: code (3 tokens) +PPS-NEXT: #endif (2 tokens) +PPS-NEXT: code (1 tokens) + diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp --- a/clang/tools/clang-pseudo/ClangPseudo.cpp +++ b/clang/tools/clang-pseudo/ClangPseudo.cpp @@ -6,7 +6,10 @@ // //===----------------------------------------------------------------------===// +#include "clang/Basic/LangOptions.h" #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/Preprocess.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -21,19 +24,31 @@ CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."), 
init("")); +static opt Source("source", desc("Source file")); +static opt PrintSource("print-source", desc("Print token stream")); +static opt PrintTokens("print-tokens", desc("Print detailed token info")); +static opt + PrintPPStructure("print-pp-structure", + desc("Print directive structure of source code")); + +static std::string readOrDie(llvm::StringRef Path) { + llvm::ErrorOr> Text = + llvm::MemoryBuffer::getFile(Path); + if (std::error_code EC = Text.getError()) { + llvm::errs() << "Error: can't read grammar file '" << CheckGrammar + << "': " << EC.message() << "\n"; + ::exit(1); + } + return Text.get()->getBuffer().str(); +} + int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); if (CheckGrammar.getNumOccurrences()) { - llvm::ErrorOr> Text = - llvm::MemoryBuffer::getFile(CheckGrammar); - if (std::error_code EC = Text.getError()) { - llvm::errs() << "Error: can't read grammar file '" << CheckGrammar - << "': " << EC.message() << "\n"; - return 1; - } + std::string Text = readOrDie(CheckGrammar); std::vector Diags; - auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags); + auto RSpecs = Grammar::parseBNF(Text, Diags); if (!Diags.empty()) { llvm::errs() << llvm::join(Diags, "\n"); @@ -43,5 +58,20 @@ CheckGrammar); return 0; } + + if (Source.getNumOccurrences()) { + std::string Text = readOrDie(Source); + clang::LangOptions LangOpts; // FIXME: use real options. 
+ auto Stream = clang::syntax::pseudo::lex(Text, LangOpts); + auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream); + + if (PrintPPStructure) + llvm::outs() << Structure; + if (PrintSource) + Stream.print(llvm::outs()); + if (PrintTokens) + llvm::outs() << Stream; + } + return 0; } diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -4,6 +4,8 @@ add_clang_unittest(ClangPseudoTests GrammarTest.cpp + PreprocessTest.cpp + TokenTest.cpp ) clang_target_link_libraries(ClangPseudoTests diff --git a/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp new file mode 100644 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp @@ -0,0 +1,121 @@ +//===--- PreprocessTest.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Preprocess.h" + +#include "clang/Basic/LangOptions.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +using testing::_; +using testing::ElementsAre; +using testing::Matcher; +using testing::Pair; +using testing::StrEq; +using Chunk = PPStructure::Chunk; + +MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) { + std::vector Texts; + for (const Token &Tok : TS.tokens(arg.Tokens)) + Texts.push_back(Tok.text()); + return Matcher(StrEq(Tokens)) + .MatchAndExplain(llvm::join(Texts, " "), result_listener); +} + +MATCHER_P(chunkKind, K, "") { return arg.kind() == K; } + +TEST(PPStructure, Parse) { + LangOptions Opts; + std::string Code = R"cpp( + #include + + int main() { + #ifdef HAS_FOO + #if HAS_BAR + foo(bar); + #else + foo(0) + #endif + #elif NEEDS_FOO + #error missing_foo + #endif + } + )cpp"; + + TokenStream S = cook(lex(Code, Opts), Opts); + PPStructure PP = PPStructure::parse(S); + + ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive), + chunkKind(Chunk::K_Code), + chunkKind(Chunk::K_Conditional), + chunkKind(Chunk::K_Code))); + + EXPECT_THAT((const PPStructure::Directive &)PP.Chunks[0], + tokensAre(S, "# include < foo . 
h >")); + EXPECT_THAT((const PPStructure::Code &)PP.Chunks[1], + tokensAre(S, "int main ( ) {")); + EXPECT_THAT((const PPStructure::Code &)PP.Chunks[3], tokensAre(S, "}")); + + const PPStructure::Conditional &Ifdef(PP.Chunks[2]); + EXPECT_THAT(Ifdef.Branches, + ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _), + Pair(tokensAre(S, "# elif NEEDS_FOO"), _))); + EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif")); + + const PPStructure &HasFoo(Ifdef.Branches[0].second); + const PPStructure &NeedsFoo(Ifdef.Branches[1].second); + + EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional))); + const PPStructure::Conditional &If(HasFoo.Chunks[0]); + EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _), + Pair(tokensAre(S, "# else"), _))); + EXPECT_THAT(If.Branches[0].second.Chunks, + ElementsAre(chunkKind(Chunk::K_Code))); + EXPECT_THAT(If.Branches[1].second.Chunks, + ElementsAre(chunkKind(Chunk::K_Code))); + + EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive))); + const PPStructure::Directive &Error(NeedsFoo.Chunks[0]); + EXPECT_THAT(Error, tokensAre(S, "# error missing_foo")); + EXPECT_EQ(Error.Kind, tok::pp_error); +} + +TEST(PPStructure, ParseUgly) { + LangOptions Opts; + std::string Code = R"cpp( + /*A*/ # /*B*/ \ + /*C*/ \ +define \ +BAR /*D*/ +/*E*/ +)cpp"; + TokenStream S = cook(lex(Code, Opts), Opts); + PPStructure PP = PPStructure::parse(S); + + ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code), + chunkKind(Chunk::K_Directive), + chunkKind(Chunk::K_Code))); + EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "/*A*/")); + const PPStructure::Directive &Define(PP.Chunks[1]); + EXPECT_EQ(Define.Kind, tok::pp_define); + EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/")); + EXPECT_THAT((const PPStructure::Code &)PP.Chunks[2], tokensAre(S, "/*E*/")); +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git 
a/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp new file mode 100644 --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp @@ -0,0 +1,175 @@ +//===--- TokenTest.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TokenKinds.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +using testing::AllOf; +using testing::ElementsAre; +using testing::ElementsAreArray; +using testing::Not; + +MATCHER_P2(token, Text, Kind, "") { + return arg.Kind == Kind && arg.text() == Text; +} + +MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); } + +MATCHER_P2(lineIndent, Line, Indent, "") { + return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent; +} + +TEST(TokenTest, Lex) { + LangOptions Opts; + std::string Code = R"cpp( + #include + int main() { + return 42; // the answer + } + )cpp"; + TokenStream Raw = lex(Code, Opts); + ASSERT_TRUE(Raw.isFinalized()); + EXPECT_THAT(Raw.tokens(), + ElementsAreArray({ + // Lexing of directives is weird, especially strings. 
+ token("#", tok::hash), + token("include", tok::raw_identifier), + token("<", tok::less), + token("stdio", tok::raw_identifier), + token(".", tok::period), + token("h", tok::raw_identifier), + token(">", tok::greater), + + token("int", tok::raw_identifier), + token("main", tok::raw_identifier), + token("(", tok::l_paren), + token(")", tok::r_paren), + token("{", tok::l_brace), + token("return", tok::raw_identifier), + token("42", tok::numeric_constant), + token(";", tok::semi), + token("// the answer", tok::comment), + token("}", tok::r_brace), + })); + + TokenStream Cooked = cook(Raw, Opts); + ASSERT_TRUE(Cooked.isFinalized()); + EXPECT_THAT(Cooked.tokens(), + ElementsAreArray({ + // Cooked identifier types in directives are not meaningful. + token("#", tok::hash), + token("include", tok::identifier), + token("<", tok::less), + token("stdio", tok::identifier), + token(".", tok::period), + token("h", tok::identifier), + token(">", tok::greater), + + token("int", tok::kw_int), + token("main", tok::identifier), + token("(", tok::l_paren), + token(")", tok::r_paren), + token("{", tok::l_brace), + token("return", tok::kw_return), + token("42", tok::numeric_constant), + token(";", tok::semi), + token("// the answer", tok::comment), + token("}", tok::r_brace), + })); + // Check raw tokens point back into original source code. 
+ EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]); +} + +TEST(TokenTest, LineContinuation) { + LangOptions Opts; + std::string Code = R"cpp( +one_\ +token +two \ +tokens + )cpp"; + TokenStream Raw = lex(Code, Opts); + EXPECT_THAT(Raw.tokens(), + ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier), + hasFlag(LexFlags::StartsPPLine), + hasFlag(LexFlags::NeedsCleaning)), + AllOf(token("two", tok::raw_identifier), + hasFlag(LexFlags::StartsPPLine), + Not(hasFlag(LexFlags::NeedsCleaning))), + AllOf(token("\\\ntokens", tok::raw_identifier), + Not(hasFlag(LexFlags::StartsPPLine)), + hasFlag(LexFlags::NeedsCleaning)))); + + TokenStream Cooked = cook(Raw, Opts); + EXPECT_THAT(Cooked.tokens(), ElementsAre(token("one_token", tok::identifier), + token("two", tok::identifier), + token("tokens", tok::identifier))); +} + +TEST(TokenTest, EncodedCharacters) { + LangOptions Opts; + Opts.Trigraphs = true; + Opts.Digraphs = true; + Opts.C99 = true; // UCNs + Opts.CXXOperatorNames = true; + std::string Code = R"(and <: ??! '??=' \u00E9)"; + TokenStream Raw = lex(Code, Opts); + EXPECT_THAT( + Raw.tokens(), + ElementsAre( // and is not recognized as && until cook(). + AllOf(token("and", tok::raw_identifier), + Not(hasFlag(LexFlags::NeedsCleaning))), + // Digraphs are just different spellings of tokens. + AllOf(token("<:", tok::l_square), + Not(hasFlag(LexFlags::NeedsCleaning))), + // Trigraps are interpreted, still need text cleaning. + AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)), + // Trigraphs must be substituted inside constants too. + AllOf(token(R"('??=')", tok::char_constant), + hasFlag(LexFlags::NeedsCleaning)), + // UCNs need substitution. 
+ AllOf(token(R"(\u00E9)", tok::raw_identifier), + hasFlag(LexFlags::NeedsCleaning)))); + + TokenStream Cooked = cook(Raw, Opts); + EXPECT_THAT( + Cooked.tokens(), + ElementsAre(token("and", tok::ampamp), // alternate spelling recognized + token("<:", tok::l_square), + token("|", tok::pipe), // trigraph substituted + token("'#'", tok::char_constant), // trigraph substituted + token("é", tok::identifier))); // UCN substituted +} + +TEST(TokenTest, Indentation) { + LangOptions Opts; + std::string Code = R"cpp( hello world +no_indent \ + line_was_continued +)cpp"; + TokenStream Raw = lex(Code, Opts); + EXPECT_THAT(Raw.tokens(), ElementsAreArray({ + lineIndent(0, 3), + lineIndent(0, 3), + lineIndent(1, 0), + lineIndent(2, 0), + })); +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang