diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h @@ -0,0 +1,160 @@ +//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The pseudoparser tries to match a token stream to the C++ grammar. +// Preprocessor #defines and other directives are not part of this grammar, and +// should be removed before the file can be parsed. +// +// Conditional blocks like #if...#else...#endif are particularly tricky, as +// simply stripping the directives may not produce a grammatical result: +// +// return +// #ifndef DEBUG +// 1 +// #else +// 0 +// #endif +// ; +// +// This header supports analyzing and removing the directives in a source file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H +#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H + +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include <utility> +#include <vector> + +namespace clang { +class LangOptions; +namespace syntax { +namespace pseudo { + +/// Describes the structure of a source file, as seen by the preprocessor. +/// +/// The structure is a tree, whose leaves are plain source code and directives, +/// and whose internal nodes are #if...#endif sections.
+/// +/// (root) +/// |-+ Directive #include +/// |-+ Code int main() { +/// | ` printf("hello, "); +/// |-+ Conditional -+ Directive #ifndef NDEBUG +/// | |-+ Code printf("debug\n"); +/// | |-+ Directive #else +/// | |-+ Code printf("production\n"); +/// | `-+ Directive #endif +/// |-+ Code return 0; +/// ` } +/// +/// Unlike the clang preprocessor, we model the full tree explicitly. +/// This class does not recognize macro usage, only directives. +struct PPStructure { + /// A range of code containing no directives. + struct Code { + Token::Range Tokens; + }; + /// A preprocessor directive. + struct Directive { + /// Raw tokens making up the directive, starting with `#`. + Token::Range Tokens; + clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword; + }; + /// A preprocessor conditional section. + /// + /// This starts with an #if, #ifdef, #ifndef etc directive. + /// It covers all #else branches, and spans until the matching #endif. + struct Conditional { + /// The sequence of directives that introduce top-level alternative parses. + /// + /// The first branch will have an #if type directive. + /// Subsequent branches will have #else type directives. + /// Each entry pairs the introducing directive with the structure of the + /// code that follows it, up to the next branch or the end. + std::vector<std::pair<Directive, PPStructure>> Branches; + /// The directive terminating the conditional, should be #endif. + /// Its token range is empty if the file ended before the conditional did. + Directive End; + }; + + /// Some piece of the file. {One of Code, Directive, Conditional}. + class Chunk; // Defined below. + /// The pieces of the file, in source order. + std::vector<Chunk> Chunks; + + /// Extract preprocessor structure by examining the raw tokens. + static PPStructure parse(const TokenStream &); + + /// Determine heuristically a set of conditional branches to take. + /// + /// Current heuristics (in preference order): + /// - respect constants: `#if 1`, `#elif false` etc.
+ /// - avoid paths that reach #error + /// - maximize non-comment tokens seen + /// - maximize number of directives seen + void chooseBranches(const TokenStream &) { + llvm_unreachable("unimplemented"); + } + + /// Produce a derived token stream without directives and not-taken branches. + /// + /// Additionally, raw identifiers are "cooked", converting them to identifiers + /// or keywords according to the LangOptions. + /// + /// The input TokenStream should be the one this structure describes. + TokenStream preprocess(const TokenStream &, + const clang::LangOptions &) const { + llvm_unreachable("unimplemented"); + } +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &); + +// FIXME: This approximates std::variant. +// Switch once we can use C++17. +class PPStructure::Chunk { +public: + enum Kind { K_Empty, K_Code, K_Directive, K_Conditional }; + Kind kind() const { + return CodeVariant ? K_Code + : DirectiveVariant ? K_Directive + : K_Conditional; + } + + Chunk() = delete; + Chunk(const Chunk &) = delete; + Chunk(Chunk &&) = default; + Chunk &operator=(const Chunk &) = delete; + Chunk &operator=(Chunk &&) = default; + ~Chunk() = default; + + // T => Chunk constructor. + Chunk(Code C) : CodeVariant(std::move(C)) {} + Chunk(Directive C) : DirectiveVariant(std::move(C)) {} + Chunk(Conditional C) : ConditionalVariant(std::move(C)) {} + + // Chunk => T& and const T& conversions. +#define CONVERSION(CONST, V) \ + explicit operator CONST V &() CONST { return *V##Variant; } + CONVERSION(const, Code); + CONVERSION(, Code); + CONVERSION(const, Directive); + CONVERSION(, Directive); + CONVERSION(const, Conditional); + CONVERSION(, Conditional); +#undef CONVERSION + +private: + // Wasteful, a union variant would be better! 
+ llvm::Optional<Code> CodeVariant; + llvm::Optional<Directive> DirectiveVariant; + llvm::Optional<Conditional> ConditionalVariant; +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Token.h b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h @@ -0,0 +1,172 @@ +//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Tokens are the first level of abstraction above bytes used in pseudoparsing. +// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor). +// The tokens are wrapped into pseudo::Token, along with line/indent info. +// +// Unlike clang, we make multiple passes over the whole file, out-of-order. +// Therefore we retain the whole token sequence in memory. (This is feasible as +// we process one file at a time). pseudo::TokenStream holds such a stream. +// The initial stream holds the raw tokens read from the file, later passes +// operate on derived TokenStreams (e.g. with directives stripped). +// +// Similar facilities from clang that are *not* used: +// - SourceManager: designed around multiple files and precise macro expansion. +// - clang::Token: coupled to SourceManager, doesn't retain layout info. +// (pseudo::Token is similar, but without SourceLocations). +// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros. +// (pseudo::TokenStream is similar, but a flat token list).
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H +#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H + +#include "clang/Basic/LLVM.h" +#include "clang/Basic/TokenKinds.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdint> +#include <limits> +#include <memory> +#include <type_traits> +#include <vector> + +namespace clang { +class LangOptions; +namespace syntax { +namespace pseudo { + +/// A single C++ or preprocessor token. +/// +/// Unlike clang::Token and syntax::Token, these tokens are not connected to a +/// SourceManager - we are not dealing with multiple files. +struct Token { + /// An Index identifies a token within a stream. + using Index = uint32_t; + /// A sentinel Index indicating no token. + constexpr static Index Invalid = std::numeric_limits<Index>::max(); + struct Range; + + /// The token text. + /// + /// Typically from the original source file, but may have been synthesized. + StringRef text() const { return StringRef(Data, Length); } + // Default to empty text, so that default-constructed tokens (such as the + // eof sentinels of a TokenStream) never expose an indeterminate pointer. + const char *Data = nullptr; + uint32_t Length = 0; + + /// Zero-based line number. + uint32_t Line = 0; + /// Width of whitespace before the first token on this line. + uint8_t Indent = 0; + /// Flags have some meaning defined by the function that produced this stream. + uint8_t Flags = 0; + // Helpers to get/set Flags based on `enum class`. + // Mask is converted to its underlying integer and treated as a bit mask. + template <class T> bool flag(T Mask) const { + return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)}; + } + template <class T> void setFlag(T Mask) { + Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)}; + } + + /// The type of token as determined by clang's lexer. + clang::tok::TokenKind Kind = clang::tok::unknown; + /// If this token is a bracket, the index of the matching bracket.
+ Index Pair = Invalid; + + // The neighbouring tokens. Only meaningful for a Token that is stored + // inside a TokenStream, next to the tokens it is adjacent to. + const Token &next() const { return *(this + 1); } + const Token &prev() const { return *(this - 1); } + Token &next() { return *(this + 1); } + Token &prev() { return *(this - 1); } +}; +static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!"); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &); + +/// A half-open range of tokens within a stream. +struct Token::Range { + Token::Index Begin = 0; + Token::Index End = 0; + + uint32_t size() const { return End - Begin; } + /// An empty range positioned at Index. + static Range empty(unsigned Index) { return Range{Index, Index}; } +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &); + +/// A complete sequence of Tokens representing a source file. +/// +/// This may match a raw file from disk, or be derived from a previous stream. +/// For example, stripping comments from a TokenStream results in a new stream. +/// +/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes: +/// int main ( ) ; +/// eof kw_int ident l_paren r_paren semi eof +/// front() back() +/// 0 1 2 3 4 5 +class TokenStream { +public: + /// Create an empty stream. + /// + /// Initially, the stream is mutable and not finalized. + /// It may only be read after finalize() is called. + /// + /// Payload is an opaque object which will be owned by the stream. + /// e.g. an allocator to hold backing storage for synthesized token text. + explicit TokenStream(std::shared_ptr<void> Payload = nullptr); + + /// Append a token to the stream, which must not be finalized. + void push(Token T) { Storage.push_back(std::move(T)); } + + /// Finalize the token stream, allowing it to be read, but no longer written. + void finalize(); + + /// Returns the index of T within the stream. + /// + /// T must be within the stream or the end sentinel (not the start sentinel).
+ Token::Index index(const Token &T) const { + assert(&T != Storage.data() && "start sentinel"); + assert(&T >= Storage.data() && &T < Storage.data() + Storage.size()); + return &T - Tokens.data(); + } + + /// All tokens in the stream, excluding the sentinels. + MutableArrayRef<Token> tokens() { return Tokens; } + ArrayRef<Token> tokens() const { return Tokens; } + /// The tokens in the half-open range R. + // ArrayRef::slice takes (start, length), not (begin, end). + MutableArrayRef<Token> tokens(Token::Range R) { + return Tokens.slice(R.Begin, R.End - R.Begin); + } + ArrayRef<Token> tokens(Token::Range R) const { + return Tokens.slice(R.Begin, R.End - R.Begin); + } + + /// May return the end sentinel if the stream is empty. + Token &front() { return Storage[1]; } + const Token &front() const { return Storage[1]; } + + /// Print the tokens in this stream to the output stream. + /// + /// The presence of newlines/spaces is preserved, but not the quantity. + void print(llvm::raw_ostream &) const; + +private: + std::shared_ptr<void> Payload; + + // View of Storage without the two sentinels; set by finalize(). + MutableArrayRef<Token> Tokens; + std::vector<Token> Storage; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &); + +/// Extracts a token stream from the source code. +/// +/// The tokens will reference the data of the provided string.
+TokenStream lex(const std::string &, const clang::LangOptions &); +/// Flags set on tokens by lex(). +/// Each enumerator is a distinct bit, as they are combined in Token::Flags +/// (a zero-valued enumerator could never be set or tested). +enum class LexFlags : uint8_t { + /// Set on tokens the lexer reports as being at the start of a line. + StartsPPLine = 1 << 0, + /// Set on tokens the lexer reports as needing cleaning or containing a UCN. + DirtyIdentifier = 1 << 1, +}; + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt --- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -3,7 +3,10 @@ add_clang_library(clangToolingSyntaxPseudo Grammar.cpp GrammarBNF.cpp - + Lex.cpp + Preprocess.cpp + Token.cpp + LINK_LIBS clangBasic clangLex diff --git a/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp @@ -0,0 +1,77 @@ +//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/SourceLocation.h" +#include "clang/Lex/Lexer.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) { + clang::SourceLocation Start; + // Tokenize using clang's lexer in raw mode. + // std::string guarantees null-termination, which the lexer needs.
+ clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(), + Code.data() + Code.size()); + Lexer.SetCommentRetentionState(true); + + TokenStream Result; + clang::Token CT; + unsigned LastOffset = 0; + unsigned Line = 0; + unsigned Indent = 0; + for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof; + Lexer.LexFromRawLexer(CT)) { + unsigned Offset = + CT.getLocation().getRawEncoding() - Start.getRawEncoding(); + + Token Tok; + Tok.Data = &Code[Offset]; + Tok.Length = CT.getLength(); + Tok.Kind = CT.getKind(); + + // Update current line number and indentation from raw source code. + // NewLineStart is the offset just past the last newline seen since the + // previous token, or 0 if there was none. + unsigned NewLineStart = 0; + for (unsigned i = LastOffset; i < Offset; ++i) { + if (Code[i] == '\n') { + NewLineStart = i + 1; + ++Line; + } + } + // Indentation isn't always well defined when lines are continued. + if ((NewLineStart || !LastOffset) && CT.isAtStartOfLine()) { + Indent = 0; + // Count leading whitespace; a tab counts as 8 columns. + for (char c : StringRef(Code).slice(NewLineStart, Offset)) { + if (c == ' ') + ++Indent; + else if (c == '\t') + Indent += 8; + else + break; + } + } + // Token::Indent is a uint8_t: saturate rather than wrap around. + Tok.Indent = Indent > 255 ? 255 : Indent; + Tok.Line = Line; + + if (CT.isAtStartOfLine()) + Tok.setFlag(LexFlags::StartsPPLine); + if (CT.needsCleaning() || CT.hasUCN()) + Tok.setFlag(LexFlags::DirtyIdentifier); + + Result.push(Tok); + LastOffset = Offset; + } + Result.finalize(); + return Result; +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp @@ -0,0 +1,197 @@ +//===--- Preprocess.cpp - Preprocess token streams ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Preprocess.h" +#include "clang/Basic/IdentifierTable.h" +#include "llvm/Support/FormatVariadic.h" + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +class Parser { +public: + explicit Parser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {} + void parse(PPStructure *result) { parse(result, /*TopLevel=*/true); } + +private: + // Roles that a directive might take within a conditional block. + enum class Cond { None, If, Else, End }; + static Cond classifyDirective(tok::PPKeywordKind kind) { + switch (kind) { + case clang::tok::pp_if: + case clang::tok::pp_ifdef: + case clang::tok::pp_ifndef: + return Cond::If; + case clang::tok::pp_elif: + case clang::tok::pp_elifdef: + case clang::tok::pp_elifndef: + case clang::tok::pp_else: + return Cond::Else; + case clang::tok::pp_endif: + return Cond::End; + default: + return Cond::None; + } + } + + // Parses tokens starting at Tok into PP. + // If we reach an #endif or #else-type directive that ends PP, returns it. + // If TopLevel is true, then we do not expect #endif and always return None. + llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) { + auto StartsDirective = + [&, AllowDirectiveAt((const Token *)nullptr)]() mutable { + if (Tok->flag(LexFlags::StartsPPLine)) { + // If we considered a comment at the start of a PP-line, it doesn't + // start a directive but the directive can still start after it.
+ if (Tok->Kind == tok::comment) + AllowDirectiveAt = Tok + 1; + return Tok->Kind == tok::hash; + } + return Tok->Kind == tok::hash && AllowDirectiveAt == Tok; + }; + while (Tok->Kind != tok::eof) { + while (StartsDirective()) { + PPStructure::Directive Directive; + parseDirective(&Directive); + Cond Kind = classifyDirective(Directive.Kind); + if (Kind == Cond::If) { + PPStructure::Conditional Conditional; + Conditional.Branches.emplace_back(); + Conditional.Branches.back().first = std::move(Directive); + parseConditional(&Conditional); + PP->Chunks.push_back(std::move(Conditional)); + continue; + } + // Unexpected #else or #endif at top level; parse as normal directives. + if (Kind == Cond::None || TopLevel) { + PP->Chunks.push_back(std::move(Directive)); + continue; + } + // Inside a conditional, an #else-type or #endif directive ends the + // current branch: return it so parseConditional() can act on it. + assert(Kind == Cond::Else || Kind == Cond::End); + return std::move(Directive); + } + // Everything up to the next directive (or eof) forms one Code chunk. + const Token *Start = Tok; + while (Tok->Kind != tok::eof && !StartsDirective()) + ++Tok; + if (Tok != Start) + PP->Chunks.push_back(PPStructure::Code{ + Token::Range{Code.index(*Start), Code.index(*Tok)}}); + } + return None; + } + + // Parse the rest of a conditional section, after seeing the #if directive. + // Returns after consuming the #endif directive, or on reaching eof.
+ void parseConditional(PPStructure::Conditional *C) { + assert(C->Branches.size() == 1 && + C->Branches.front().second.Chunks.empty() && + "Should be ready to parse first branch body"); + // Loop unconditionally: parse() returns None only at eof, and that case + // is handled below. Guarding the loop on eof instead would skip it when + // eof directly follows an #if or #else directive, leaving C->End + // pointing at the start of the file. + while (true) { + auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false); + if (!Terminator) { + // Unterminated conditional: record an empty End range at eof. + assert(Tok->Kind == tok::eof && "gave up parsing before eof?"); + C->End.Tokens = Token::Range::empty(Code.index(*Tok)); + return; + } + if (classifyDirective(Terminator->Kind) == Cond::End) { + C->End = std::move(*Terminator); + return; + } + // An #else-type directive starts the next branch. + assert(classifyDirective(Terminator->Kind) == Cond::Else && + "ended branch unexpectedly"); + C->Branches.emplace_back(); + C->Branches.back().first = std::move(*Terminator); + } + } + + // Parse a directive. Tok is the hash. + void parseDirective(PPStructure::Directive *D) { + assert(Tok->Kind == tok::hash); + D->Tokens.Begin = Code.index(*Tok); + do { + ++Tok; + } while (Tok->Kind == tok::comment && !Tok->flag(LexFlags::StartsPPLine)); + // Technically directive names can be spelled with UCNs or split over lines. + // In practice, this never happens.
+ // Only accept a name on the directive's own line. After a bare `#`, Tok + // is already the first token of the next line, which is not part of + // this directive and must not determine its kind. + if (Tok->Kind == tok::raw_identifier && + !Tok->flag(LexFlags::StartsPPLine)) + D->Kind = Idents.get(Tok->text()).getPPKeywordID(); + // The directive extends to the start of the next line, or eof. + while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine)) + ++Tok; + D->Tokens.End = Code.index(*Tok); + } + + const TokenStream &Code; + const Token *Tok; + clang::IdentifierTable Idents; +}; + +} // namespace + +PPStructure PPStructure::parse(const TokenStream &Code) { + PPStructure Result; + Parser(Code).parse(&Result); + return Result; +} + +static llvm::StringLiteral ppKeywordName(tok::PPKeywordKind kind) { + switch (kind) { +#define PPKEYWORD(x) \ + case tok::pp_##x: \ + return #x; +#include "clang/Basic/TokenKinds.def" + default: + return "unknown"; + } +} + +// Prints one line per chunk of PP, indented by Indent spaces. +// The branches of a conditional are printed recursively, two spaces deeper. +static void dump(llvm::raw_ostream &OS, const PPStructure &PP, + unsigned Indent) { + auto DumpDirective = [&](const PPStructure::Directive &Directive) { + OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n", + ppKeywordName(Directive.Kind), + Directive.Tokens.size()); + }; + + for (const auto &Chunk : PP.Chunks) { + switch (Chunk.kind()) { + case PPStructure::Chunk::K_Empty: + llvm_unreachable("invalid chunk"); + case PPStructure::Chunk::K_Code: { + const PPStructure::Code &Code(Chunk); + OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", + Code.Tokens.size()); + break; + } + case PPStructure::Chunk::K_Directive: { + const PPStructure::Directive &Directive(Chunk); + DumpDirective(Directive); + break; + } + case PPStructure::Chunk::K_Conditional: { + const PPStructure::Conditional &Conditional(Chunk); + for (const auto &Branch : Conditional.Branches) { + DumpDirective(Branch.first); + dump(OS, Branch.second, Indent + 2); + } + DumpDirective(Conditional.End); + break; + } + } + } +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const PPStructure &PP) { + dump(OS, PP, 0); + return OS; +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/Token.cpp b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp new file mode 100644 ---
/dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp @@ -0,0 +1,94 @@ +//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/Token.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +// Prints kind, line:indent and escaped text, then the matching-bracket index +// and the flags (in hex) when they are set. +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) { + OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line, + T.Indent); + OS << '"'; + llvm::printEscapedString(T.text(), OS); + OS << '"'; + if (T.Pair != Token::Invalid) + OS << " pair=" << T.Pair; + if (T.Flags) + OS << llvm::format(" flags=%x", T.Flags); + return OS; +} + +// Prints a header line, then one line per token (sentinels excluded). +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) { + OS << "Index Kind Line Text\n"; + for (const auto &T : TS.tokens()) { + OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(T), + clang::tok::getTokenName(T.Kind), T.Line, T.Indent); + OS << '"'; + llvm::printEscapedString(T.text(), OS); + OS << '"'; + if (T.Pair != Token::Invalid) + OS << " pair=" << T.Pair; + if (T.Flags) + OS << llvm::format(" flags=%x", T.Flags); + OS << '\n'; + } + return OS; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) { + OS << llvm::formatv("[{0},{1})", R.Begin, R.End); + return OS; +} + +// A new stream holds only the start sentinel. +TokenStream::TokenStream(std::shared_ptr<void> Payload) + : Payload(std::move(Payload)) { + Storage.emplace_back(); + Storage.back().Kind = clang::tok::eof; +} + +void TokenStream::finalize() { + unsigned LastLine = Storage.back().Line; + Storage.emplace_back(); + Storage.back().Kind = tok::eof; + Storage.back().Line =
LastLine + 1; + + // Expose everything between the two sentinels. + Tokens = Storage; + Tokens = Tokens.drop_front().drop_back(); +} + +void TokenStream::print(llvm::raw_ostream &OS) const { + bool FirstToken = true; + unsigned LastLine = -1; + StringRef LastText; + for (const auto &T : tokens()) { + StringRef Text = T.text(); + if (FirstToken) { + FirstToken = false; + } else if (T.Line == LastLine) { + // Separate tokens unless their text is adjacent in memory. + if (LastText.data() + LastText.size() != Text.data()) + OS << ' '; + } else { + OS << '\n'; + OS.indent(T.Indent); + } + OS << Text; + LastLine = T.Line; + LastText = Text; + } + if (!FirstToken) + OS << '\n'; +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/test/Syntax/Inputs/example.c b/clang/test/Syntax/Inputs/example.c new file mode 100644 --- /dev/null +++ b/clang/test/Syntax/Inputs/example.c @@ -0,0 +1,7 @@ +int is_debug() { +#ifndef NDEBUG + return 1; // in debug mode +#else + return 0; +#endif +} diff --git a/clang/test/Syntax/lex.test b/clang/test/Syntax/lex.test new file mode 100644 --- /dev/null +++ b/clang/test/Syntax/lex.test @@ -0,0 +1,38 @@ +// RUN: clang-pseudo -source %S/Inputs/example.c -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace + SOURCE: int is_debug() { +SOURCE-NEXT: #ifndef NDEBUG +SOURCE-NEXT: return 1; // in debug mode +SOURCE-NEXT: #else +SOURCE-NEXT: return 0; +SOURCE-NEXT: #endif +SOURCE-NEXT: } +// RUN: clang-pseudo -source %S/Inputs/example.c -print-tokens | FileCheck %s -check-prefix=TOKEN +TOKEN: 0: raw_identifier 0:0 "int" flags=1 +TOKEN-NEXT: raw_identifier 0:0 "is_debug" +TOKEN-NEXT: l_paren 0:0 "(" +TOKEN-NEXT: r_paren 0:0 ")" +TOKEN-NEXT: l_brace 0:0 "{" +TOKEN-NEXT: hash 1:0 "#" flags=1 +TOKEN-NEXT: raw_identifier 1:0 "ifndef" +TOKEN-NEXT: raw_identifier 1:0 "NDEBUG" +TOKEN-NEXT: raw_identifier 2:2 "return" flags=1 +TOKEN-NEXT: numeric_constant 2:2 "1" +TOKEN-NEXT: semi 2:2 ";" +TOKEN-NEXT: comment 2:2 "// in debug mode" +TOKEN-NEXT: hash 3:0 "#" flags=1 +TOKEN-NEXT: raw_identifier 3:0 "else" +TOKEN-NEXT:
raw_identifier 4:2 "return" flags=1 +TOKEN-NEXT: numeric_constant 4:2 "0" +TOKEN-NEXT: semi 4:2 ";" +TOKEN-NEXT: hash 5:0 "#" flags=1 +TOKEN-NEXT: raw_identifier 5:0 "endif" +TOKEN-NEXT: r_brace 6:0 "}" flags=1 +// RUN: clang-pseudo -source %S/Inputs/example.c -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace + PPS: code (5 tokens) +PPS-NEXT: #ifndef (3 tokens) +PPS-NEXT: code (4 tokens) +PPS-NEXT: #else (2 tokens) +PPS-NEXT: code (3 tokens) +PPS-NEXT: #endif (2 tokens) +PPS-NEXT: code (1 tokens) + diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp --- a/clang/tools/clang-pseudo/ClangPseudo.cpp +++ b/clang/tools/clang-pseudo/ClangPseudo.cpp @@ -6,7 +6,10 @@ // //===----------------------------------------------------------------------===// +#include "clang/Basic/LangOptions.h" #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/Preprocess.h" +#include "clang/Tooling/Syntax/Pseudo/Token.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -21,19 +24,31 @@ CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."), init("")); +static opt Source("source", desc("Source file")); +static opt PrintSource("print-source", desc("Print token stream")); +static opt PrintTokens("print-tokens", desc("Print detailed token info")); +static opt + PrintPPStructure("print-pp-structure", + desc("Print directive structure of source code")); + +static std::string readOrDie(llvm::StringRef Path) { + llvm::ErrorOr> Text = + llvm::MemoryBuffer::getFile(Path); + if (std::error_code EC = Text.getError()) { + llvm::errs() << "Error: can't read grammar file '" << CheckGrammar + << "': " << EC.message() << "\n"; + ::exit(1); + } + return Text.get()->getBuffer().str(); +} + int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); if (CheckGrammar.getNumOccurrences()) { - 
llvm::ErrorOr> Text = - llvm::MemoryBuffer::getFile(CheckGrammar); - if (std::error_code EC = Text.getError()) { - llvm::errs() << "Error: can't read grammar file '" << CheckGrammar - << "': " << EC.message() << "\n"; - return 1; - } + std::string Text = readOrDie(CheckGrammar); std::vector Diags; - auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags); + auto RSpecs = Grammar::parseBNF(Text, Diags); if (!Diags.empty()) { llvm::errs() << llvm::join(Diags, "\n"); @@ -43,5 +58,20 @@ CheckGrammar); return 0; } + + if (Source.getNumOccurrences()) { + std::string Text = readOrDie(Source); + clang::LangOptions LangOpts; // FIXME: use real options. + auto Stream = clang::syntax::pseudo::lex(Text, LangOpts); + auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream); + + if (PrintPPStructure) + llvm::outs() << Structure; + if (PrintSource) + Stream.print(llvm::outs()); + if (PrintTokens) + llvm::outs() << Stream; + } + return 0; }