Index: include/clang/Basic/DiagnosticFrontendKinds.td =================================================================== --- include/clang/Basic/DiagnosticFrontendKinds.td +++ include/clang/Basic/DiagnosticFrontendKinds.td @@ -258,6 +258,9 @@ "as the %select{aliasee|resolver}2">, InGroup; +def err_minimize_source_to_dependency_directives_failed : Error< + "dependency directives minimization failed for given source">; + let CategoryName = "Instrumentation Issue" in { def warn_profile_data_out_of_date : Warning< "profile data may be out of date: of %0 function%s0, %1 %plural{1:has|:have}1" Index: include/clang/Driver/CC1Options.td =================================================================== --- include/clang/Driver/CC1Options.td +++ include/clang/Driver/CC1Options.td @@ -585,6 +585,9 @@ HelpText<"Migrate source code">; def compiler_options_dump : Flag<["-"], "compiler-options-dump">, HelpText<"Dump the compiler configuration options">; +def print_dependency_directives_minimized_source : Flag<["-"], + "print-dependency-directives-minimized-source">, + HelpText<"Print the output of the dependency directives source minimizer">; } def emit_llvm_uselists : Flag<["-"], "emit-llvm-uselists">, Index: include/clang/Frontend/FrontendActions.h =================================================================== --- include/clang/Frontend/FrontendActions.h +++ include/clang/Frontend/FrontendActions.h @@ -240,6 +240,17 @@ bool usesPreprocessorOnly() const override { return true; } }; +class PrintDependencyDirectivesSourceMinimizerAction : public FrontendAction { +protected: + void ExecuteAction() override; + std::unique_ptr CreateASTConsumer(CompilerInstance &, + StringRef) override { + return nullptr; + } + + bool usesPreprocessorOnly() const override { return true; } +}; + //===----------------------------------------------------------------------===// // Preprocessor Actions //===----------------------------------------------------------------------===// Index: 
include/clang/Frontend/FrontendOptions.h =================================================================== --- include/clang/Frontend/FrontendOptions.h +++ include/clang/Frontend/FrontendOptions.h @@ -127,7 +127,10 @@ MigrateSource, /// Just lex, no output. - RunPreprocessorOnly + RunPreprocessorOnly, + + /// Print the output of the dependency directives source minimizer. + PrintDependencyDirectivesSourceMinimizerOutput }; } // namespace frontend Index: include/clang/Lex/DependencyDirectivesSourceMinimizer.h =================================================================== --- /dev/null +++ include/clang/Lex/DependencyDirectivesSourceMinimizer.h @@ -0,0 +1,80 @@ +//===- clang/Lex/DependencyDirectivesSourceMinimizer.h - ----------*- C++ -*-// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This is the interface for minimizing header and source files to the +/// minimum necessary preprocessor directives for evaluating includes. It +/// reduces the source down to #define, #include, #import, @import, and any +/// conditional preprocessor logic that contains one of those. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H +#define LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" + +namespace clang { +namespace minimize_source_to_dependency_directives { + +/// Represents the kind of preprocessor directive that is tracked by the source +/// minimizer in its token output. 
+enum TokenKind { + pp_none, + pp_include, + pp___include_macros, + pp_define, + pp_undef, + pp_import, + pp_at_import, + pp_pragma_import, + pp_include_next, + pp_if, + pp_ifdef, + pp_ifndef, + pp_elif, + pp_else, + pp_endif, + pp_eof, +}; + +/// Represents a simplified token that's lexed as part of the source +/// minimization. It's used to track the location of various preprocessor +/// directives that could potentially have an effect on the dependencies. +struct Token { + /// The kind of token. + TokenKind K = pp_none; + + /// Offset into the output byte stream of where the directive begins. + int Offset = -1; + + Token(TokenKind K, int Offset) : K(K), Offset(Offset) {} +}; + +} // end namespace minimize_source_to_dependency_directives + +/// Minimize the input down to the preprocessor directives that might have +/// an effect on the dependencies for a compilation unit. +/// +/// This function deletes all non-preprocessor code, and strips anything that +/// can't affect what gets included. It canonicalizes whitespace where +/// convenient to stabilize the output against formatting changes in the input. +/// +/// Clears the output vectors at the beginning of the call. +/// +/// \returns false on success, true on error.
+bool minimizeSourceToDependencyDirectives( + llvm::StringRef Input, llvm::SmallVectorImpl &Output, + llvm::SmallVectorImpl + &Tokens); + +} // end namespace clang + +#endif // LLVM_CLANG_LEX_DEPENDENCY_DIRECTIVES_SOURCE_MINIMIZER_H Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -1670,6 +1670,10 @@ Opts.ProgramAction = frontend::MigrateSource; break; case OPT_Eonly: Opts.ProgramAction = frontend::RunPreprocessorOnly; break; + case OPT_print_dependency_directives_minimized_source: + Opts.ProgramAction = + frontend::PrintDependencyDirectivesSourceMinimizerOutput; + break; } } @@ -3079,6 +3083,7 @@ case frontend::PrintPreprocessedInput: case frontend::RewriteMacros: case frontend::RunPreprocessorOnly: + case frontend::PrintDependencyDirectivesSourceMinimizerOutput: return true; } llvm_unreachable("invalid frontend action"); Index: lib/Frontend/FrontendActions.cpp =================================================================== --- lib/Frontend/FrontendActions.cpp +++ lib/Frontend/FrontendActions.cpp @@ -14,6 +14,7 @@ #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/MultiplexConsumer.h" #include "clang/Frontend/Utils.h" +#include "clang/Lex/DependencyDirectivesSourceMinimizer.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" @@ -909,3 +910,19 @@ OS << "}"; } + +void PrintDependencyDirectivesSourceMinimizerAction::ExecuteAction() { + CompilerInstance &CI = getCompilerInstance(); + auto Buffer = CI.getFileManager().getBufferForFile(getCurrentFile()); + if (!Buffer) + return; + llvm::SmallString<1024> Output; + llvm::SmallVector Toks; + if (minimizeSourceToDependencyDirectives((*Buffer)->getBuffer(), Output, + Toks)) { + CI.getDiagnostics().Report( + diag::err_minimize_source_to_dependency_directives_failed); + return; + } + 
llvm::outs() << Output; +} Index: lib/FrontendTool/ExecuteCompilerInvocation.cpp =================================================================== --- lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -116,6 +116,8 @@ case RunAnalysis: Action = "RunAnalysis"; break; #endif case RunPreprocessorOnly: return llvm::make_unique(); + case PrintDependencyDirectivesSourceMinimizerOutput: + return llvm::make_unique(); } #if !CLANG_ENABLE_ARCMT || !CLANG_ENABLE_STATIC_ANALYZER \ Index: lib/Lex/CMakeLists.txt =================================================================== --- lib/Lex/CMakeLists.txt +++ lib/Lex/CMakeLists.txt @@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS support) add_clang_library(clangLex + DependencyDirectivesSourceMinimizer.cpp HeaderMap.cpp HeaderSearch.cpp Lexer.cpp Index: lib/Lex/DependencyDirectivesSourceMinimizer.cpp =================================================================== --- /dev/null +++ lib/Lex/DependencyDirectivesSourceMinimizer.cpp @@ -0,0 +1,708 @@ +//===- DependencyDirectivesSourceMinimizer.cpp - -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This is the implementation for minimizing header and source files to the +/// minimum necessary preprocessor directives for evaluating includes. It +/// reduces the source down to #define, #include, #import, @import, and any +/// conditional preprocessor logic that contains one of those. 
+/// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/DependencyDirectivesSourceMinimizer.h" +#include "clang/Basic/CharInfo.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace clang; +using namespace clang::minimize_source_to_dependency_directives; + +namespace { + +struct Lexer { + SmallVectorImpl &Out; + SmallVectorImpl &Tokens; + + Lexer(SmallVectorImpl &Out, SmallVectorImpl &Tokens) + : Out(Out), Tokens(Tokens) {} + + bool lex(StringRef Bytes); + + StringMap SplitIds; + +private: + struct IdInfo { + const char *Last; + StringRef Name; + }; + + /// Lex an identifier. + /// + /// \pre First points at a valid identifier head. + LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End); + LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First, + const char *const End); + LLVM_NODISCARD bool lexImpl(const char *First, const char *const End); + LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End); + LLVM_NODISCARD bool lexAt(const char *&First, const char *const End); + LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End); + LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End); + LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End); + LLVM_NODISCARD bool lexDefault(TokenKind Kind, StringRef Directive, + const char *&First, const char *const End); + Token &makeToken(TokenKind K) { + Tokens.emplace_back(K, Out.size()); + return Tokens.back(); + } + void popToken() { + Out.resize(Tokens.back().Offset); + Tokens.pop_back(); + } + TokenKind top() const { return Tokens.empty() ? 
pp_none : Tokens.back().K; } + + Lexer &put(char Byte) { + Out.push_back(Byte); + return *this; + } + Lexer &append(StringRef S) { return append(S.begin(), S.end()); } + Lexer &append(const char *First, const char *Last) { + Out.append(First, Last); + return *this; + } + + void printToNewline(const char *&First, const char *const End); + void printAdjacentModuleNameParts(const char *&First, const char *const End); + LLVM_NODISCARD bool printAtImportBody(const char *&First, + const char *const End); + void printDirectiveBody(const char *&First, const char *const End); + void printAdjacentMacroArgs(const char *&First, const char *const End); + LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End); +}; + +} // end anonymous namespace + +static void skipOverSpaces(const char *&First, const char *const End) { + while (First != End && isHorizontalWhitespace(*First)) + ++First; +} + +/// Returns true if the double quote at \p Current opens a raw string literal, +/// i.e. it is directly preceded by an "R" carrying an optional "u", "U", "L" +/// or "u8" prefix that is not itself the tail of a longer identifier. +/// +/// \p First bounds how far back the scan is allowed to look. +LLVM_NODISCARD static bool isRawStringLiteral(const char *First, + const char *Current) { + assert(First <= Current); + + // Check if we can even back up. + if (*Current != '\"' || First == Current) + return false; + + // Check for an "R". + --Current; + if (*Current != 'R') + return false; + if (First == Current || !isIdentifierBody(*--Current)) + return true; + + // Check for a prefix of "u", "U", or "L". + if (*Current == 'u' || *Current == 'U' || *Current == 'L') + return First == Current || !isIdentifierBody(*--Current); + + // Check for a prefix of "u8". Pre-decrement so that the byte in front of + // the '8' is the one compared against 'u'; a post-decrement would compare + // the '8' itself and reject every u8R"..." literal. + if (*Current != '8' || First == Current || *--Current != 'u') + return false; + return First == Current || !isIdentifierBody(*--Current); +} + +static void skipRawString(const char *&First, const char *const End) { + assert(First[0] == '\"'); + assert(First[-1] == 'R'); + + const char *Last = ++First; + while (Last != End && *Last != '(') + ++Last; + if (Last == End) { + First = Last; // Hit the end... just give up.
+ return; + } + + StringRef Terminator(First, Last - First); + for (;;) { + // Move First to just past the next ")". + First = Last; + while (First != End && *First != ')') + ++First; + if (First == End) + return; + ++First; + + // Look ahead for the terminator sequence. + Last = First; + while (Last != End && size_t(Last - First) < Terminator.size() && + Terminator[Last - First] == *Last) + ++Last; + + // Check if we hit it (or the end of the file). + if (Last == End) { + First = Last; + return; + } + if (size_t(Last - First) < Terminator.size()) + continue; + if (*Last != '\"') + continue; + First = Last + 1; + return; + } +} + +static void skipString(const char *&First, const char *const End) { + assert(*First == '\'' || *First == '\"'); + const char Terminator = *First; + for (++First; First != End && *First != Terminator; ++First) + if (*First == '\\') + if (++First == End) + return; + if (First != End) + ++First; // Finish off the string. +} + +static void skipNewline(const char *&First, const char *End) { + assert(isVerticalWhitespace(*First)); + ++First; + if (First == End) + return; + + // Check for "\n\r" and "\r\n". + if (LLVM_UNLIKELY(isVerticalWhitespace(*First) && First[-1] != First[0])) + ++First; +} + +static void skipToNewlineRaw(const char *&First, const char *const End) { + for (;;) { + if (First == End) + return; + + if (isVerticalWhitespace(*First)) + return; + + while (!isVerticalWhitespace(*First)) + if (++First == End) + return; + + if (First[-1] != '\\') + return; + + ++First; // Keep going... 
+ } +} + +static const char *reverseOverSpaces(const char *First, const char *Last) { + while (First != Last && isHorizontalWhitespace(Last[-1])) + --Last; + return Last; +} + +static void skipLineComment(const char *&First, const char *const End) { + assert(First[0] == '/' && First[1] == '/'); + First += 2; + skipToNewlineRaw(First, End); +} + +static void skipBlockComment(const char *&First, const char *const End) { + assert(First[0] == '/' && First[1] == '*'); + if (End - First < 4) { + First = End; + return; + } + for (First += 3; First != End; ++First) + if (First[-1] == '*' && First[0] == '/') { + ++First; + return; + } +} + +static void skipLine(const char *&First, const char *const End) { + do { + assert(First <= End); + if (First == End) + return; + + if (isVerticalWhitespace(*First)) { + skipNewline(First, End); + return; + } + const char *Start = First; + while (First != End && !isVerticalWhitespace(*First)) { + // Iterate over strings correctly to avoid comments and newlines. + if (*First == '\"' || *First == '\'') { + if (isRawStringLiteral(Start, First)) + skipRawString(First, End); + else + skipString(First, End); + continue; + } + + // Iterate over comments correctly. + if (*First != '/' || End - First < 2) { + ++First; + continue; + } + + if (First[1] == '/') { + // "//...". + skipLineComment(First, End); + continue; + } + + if (First[1] != '*') { + ++First; + continue; + } + + // "/*...*/". + skipBlockComment(First, End); + } + if (First == End) + return; + + // Skip over the newline. + assert(isVerticalWhitespace(*First)); + skipNewline(First, End); + } while (First[-2] == '\\'); // Continue past line-continuations. +} + +static void skipDirective(StringRef Name, const char *&First, + const char *const End) { + if (llvm::StringSwitch(Name) + .Case("warning", true) + .Case("error", true) + .Default(false)) + // Do not process quotes or comments. 
+ skipToNewlineRaw(First, End); + else + skipLine(First, End); +} + +void Lexer::printToNewline(const char *&First, const char *const End) { + while (First != End && !isVerticalWhitespace(*First)) { + const char *Last = First; + do { + // Iterate over strings correctly to avoid comments and newlines. + if (*Last == '\"' || *Last == '\'') { + if (LLVM_UNLIKELY(isRawStringLiteral(First, Last))) + skipRawString(Last, End); + else + skipString(Last, End); + continue; + } + if (*Last != '/' || End - Last < 2) { + ++Last; + continue; // Gather the rest up to print verbatim. + } + + if (Last[1] != '/' && Last[1] != '*') { + ++Last; + continue; + } + + // Deal with "//..." and "/*...*/". + append(First, reverseOverSpaces(First, Last)); + First = Last; + + if (Last[1] == '/') { + skipLineComment(First, End); + return; + } + + put(' '); + skipBlockComment(First, End); + skipOverSpaces(First, End); + Last = First; + } while (Last != End && !isVerticalWhitespace(*Last)); + + // Print out the string. + if (Last == End || Last == First || Last[-1] != '\\') { + append(First, reverseOverSpaces(First, Last)); + return; + } + + // Print up to the backslash, backing up over spaces. + append(First, reverseOverSpaces(First, Last - 1)); + + First = Last; + skipNewline(First, End); + skipOverSpaces(First, End); + } +} + +static void skipWhitespace(const char *&First, const char *const End) { + for (;;) { + assert(First <= End); + skipOverSpaces(First, End); + + if (End - First < 2) + return; + + if (First[0] == '\\' && isVerticalWhitespace(First[1])) { + skipNewline(++First, End); + continue; + } + + // Check for a non-comment character. + if (First[0] != '/') + return; + + // "// ...". + if (First[1] == '/') { + skipLineComment(First, End); + return; + } + + // Cannot be a comment. + if (First[1] != '*') + return; + + // "/*...*/". 
+ skipBlockComment(First, End); + } +} + +void Lexer::printAdjacentModuleNameParts(const char *&First, + const char *const End) { + // Skip over parts of the body. + const char *Last = First; + do + ++Last; + while (Last != End && (isIdentifierBody(*Last) || *Last == '.')); + append(First, Last); + First = Last; +} + +bool Lexer::printAtImportBody(const char *&First, const char *const End) { + for (;;) { + skipWhitespace(First, End); + if (First == End) + return true; + + if (isVerticalWhitespace(*First)) { + skipNewline(First, End); + continue; + } + + // Found a semicolon. + if (*First == ';') { + put(*First++).put('\n'); + return false; + } + + // Don't handle macro expansions inside @import for now. + if (!isIdentifierBody(*First) && *First != '.') + return true; + + printAdjacentModuleNameParts(First, End); + } +} + +void Lexer::printDirectiveBody(const char *&First, const char *const End) { + skipWhitespace(First, End); // Skip initial whitespace. + printToNewline(First, End); + while (Out.back() == ' ') + Out.pop_back(); + put('\n'); +} + +LLVM_NODISCARD static const char *lexRawIdentifier(const char *First, + const char *const End) { + assert(isIdentifierBody(*First) && "invalid identifer"); + const char *Last = First + 1; + while (Last != End && isIdentifierBody(*Last)) + ++Last; + return Last; +} + +LLVM_NODISCARD static const char * +getIdentifierContinuation(const char *First, const char *const End) { + if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1])) + return nullptr; + + ++First; + skipNewline(First, End); + if (First == End) + return nullptr; + return isIdentifierBody(First[0]) ? First : nullptr; +} + +Lexer::IdInfo Lexer::lexIdentifier(const char *First, const char *const End) { + const char *Last = lexRawIdentifier(First, End); + const char *Next = getIdentifierContinuation(Last, End); + if (LLVM_LIKELY(!Next)) + return IdInfo{Last, StringRef(First, Last - First)}; + + // Slow path, where identifiers are split over lines. 
+ SmallVector Id(First, Last); + while (Next) { + Last = lexRawIdentifier(Next, End); + Id.append(Next, Last); + Next = getIdentifierContinuation(Last, End); + } + return IdInfo{ + Last, + SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()}; +} + +void Lexer::printAdjacentMacroArgs(const char *&First, const char *const End) { + // Skip over parts of the body. + const char *Last = First; + do + ++Last; + while (Last != End && + (isIdentifierBody(*Last) || *Last == '.' || *Last == ',')); + append(First, Last); + First = Last; +} + +bool Lexer::printMacroArgs(const char *&First, const char *const End) { + assert(*First == '('); + put(*First++); + for (;;) { + skipWhitespace(First, End); + if (First == End) + return true; + + if (*First == ')') { + put(*First++); + return false; + } + + // This is intentionally fairly liberal. + if (!(isIdentifierBody(*First) || *First == '.' || *First == ',')) + return true; + + printAdjacentMacroArgs(First, End); + } +} + +/// Looks for an identifier starting from First. +/// +/// Updates "First" to just past the next identifier, if any. Returns true iff +/// the identifier matches "Id". +bool Lexer::isNextIdentifier(StringRef Id, const char *&First, + const char *const End) { + skipWhitespace(First, End); + if (First == End || !isIdentifierHead(*First)) + return false; + + IdInfo FoundId = lexIdentifier(First, End); + First = FoundId.Last; + return FoundId.Name == Id; +} + +bool Lexer::lexAt(const char *&First, const char *const End) { + // Handle "@import". + ++First; + if (!isNextIdentifier("import", First, End)) { + skipLine(First, End); + return false; + } + makeToken(pp_at_import); + append("@import "); + if (printAtImportBody(First, End)) + return true; // Error: Could not find semi-colon. + skipWhitespace(First, End); + if (First == End) + return false; + if (!isVerticalWhitespace(*First)) + return true; // Error: Nothing expected after semi-colon.
+ + skipNewline(First, End); + return false; +} + +bool Lexer::lexDefine(const char *&First, const char *const End) { + makeToken(pp_define); + append("#define "); + skipWhitespace(First, End); + + if (!isIdentifierHead(*First)) + return true; // Error: Don't understand this #define. + + IdInfo Id = lexIdentifier(First, End); + const char *Last = Id.Last; + append(Id.Name); + if (Last == End) + return false; + if (*Last == '(') { + size_t Size = Out.size(); + if (printMacroArgs(Last, End)) { + // Be robust to bad macro arguments, since they can show up in disabled + // code. + Out.resize(Size); + append("(/* invalid */\n"); + skipLine(Last, End); + return false; + } + } + skipWhitespace(Last, End); + if (Last == End) + return false; + if (!isVerticalWhitespace(*Last)) + put(' '); + printDirectiveBody(Last, End); + First = Last; + return false; +} + +bool Lexer::lexPragma(const char *&First, const char *const End) { + // #pragma. + if (!isNextIdentifier("clang", First, End)) { + skipLine(First, End); + return false; + } + + // #pragma clang. + if (!isNextIdentifier("module", First, End)) { + skipLine(First, End); + return false; + } + + // #pragma clang module. + if (!isNextIdentifier("import", First, End)) { + skipLine(First, End); + return false; + } + + // #pragma clang module import. + makeToken(pp_pragma_import); + append("#pragma clang module import "); + printDirectiveBody(First, End); + return false; +} + +bool Lexer::lexEndif(const char *&First, const char *const End) { + // Strip out "#else" if it's empty. + if (top() == pp_else) + popToken(); + + // Strip out "#elif" if they're empty. + while (top() == pp_elif) + popToken(); + + // If "#if" is empty, strip it and skip the "#endif". 
+ if (top() == pp_if || top() == pp_ifdef || top() == pp_ifndef) { + popToken(); + skipLine(First, End); + return false; + } + + return lexDefault(pp_endif, "endif", First, End); +} + +bool Lexer::lexDefault(TokenKind Kind, StringRef Directive, const char *&First, + const char *const End) { + makeToken(Kind); + put('#').append(Directive).put(' '); + printDirectiveBody(First, End); + return false; +} + +bool Lexer::lexPPLine(const char *&First, const char *const End) { + assert(First != End); + + skipWhitespace(First, End); + assert(First <= End); + if (First == End) + return false; + + if (*First != '#' && *First != '@') { + skipLine(First, End); + assert(First <= End); + return false; + } + + // Handle "@import". + if (*First == '@') + return lexAt(First, End); + + // Handle preprocessing directives. + ++First; // Skip over '#'. + skipWhitespace(First, End); + + if (First == End) + return true; // Error: Invalid preprocessor directive. + + if (!isIdentifierHead(*First)) { + skipLine(First, End); + return false; + } + + // Figure out the token. + IdInfo Id = lexIdentifier(First, End); + First = Id.Last; + auto Kind = llvm::StringSwitch(Id.Name) + .Case("include", pp_include) + .Case("__include_macros", pp___include_macros) + .Case("define", pp_define) + .Case("undef", pp_undef) + .Case("import", pp_import) + .Case("include_next", pp_include_next) + .Case("if", pp_if) + .Case("ifdef", pp_ifdef) + .Case("ifndef", pp_ifndef) + .Case("elif", pp_elif) + .Case("else", pp_else) + .Case("endif", pp_endif) + .Case("pragma", pp_pragma_import) + .Default(pp_none); + if (Kind == pp_none) { + skipDirective(Id.Name, First, End); + return false; + } + + if (Kind == pp_endif) + return lexEndif(First, End); + + if (Kind == pp_define) + return lexDefine(First, End); + + if (Kind == pp_pragma_import) + return lexPragma(First, End); + + // Everything else. 
+ return lexDefault(Kind, Id.Name, First, End); +} + +bool Lexer::lexImpl(const char *First, const char *const End) { + while (First != End) + if (lexPPLine(First, End)) + return true; + return false; +} + +bool Lexer::lex(StringRef Bytes) { + bool Error = lexImpl(Bytes.begin(), Bytes.end()); + + if (!Error) { + // Add a trailing newline and an EOF on success. + if (!Out.empty() && Out.back() != '\n') + Out.push_back('\n'); + makeToken(pp_eof); + } + + // Null-terminate the output. This way the memory buffer that's passed to + // Clang will not have to worry about the terminating '\0'. + Out.push_back(0); + Out.pop_back(); + return Error; +} + +bool clang::minimizeSourceToDependencyDirectives( + StringRef Input, SmallVectorImpl &Output, + SmallVectorImpl &Tokens) { + Output.clear(); + Tokens.clear(); + return Lexer(Output, Tokens).lex(Input); +} Index: test/Frontend/minimize_source_to_dependency_directives.c =================================================================== --- /dev/null +++ test/Frontend/minimize_source_to_dependency_directives.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -print-dependency-directives-minimized-source %s > %t +// RUN: echo END. >> %t +// RUN: FileCheck < %t %s + +#ifdef FOO +#include "a.h" +#else +void skipThisCode(); +#endif + +// CHECK: #ifdef FOO +// CHECK-NEXT: #include "a.h" +// CHECK-NEXT: #endif +// CHECK-NEXT: END. 
Index: test/Frontend/minimize_source_to_dependency_directives_error.c =================================================================== --- /dev/null +++ test/Frontend/minimize_source_to_dependency_directives_error.c @@ -0,0 +1,4 @@ +// RUN: not %clang_cc1 -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s + +#define 0 0 +// CHECK: dependency directives minimization failed for given source Index: unittests/Lex/CMakeLists.txt =================================================================== --- unittests/Lex/CMakeLists.txt +++ unittests/Lex/CMakeLists.txt @@ -3,6 +3,7 @@ ) add_clang_unittest(LexTests + DependencyDirectivesSourceMinimizerTest.cpp HeaderMapTest.cpp HeaderSearchTest.cpp LexerTest.cpp Index: unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp =================================================================== --- /dev/null +++ unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp @@ -0,0 +1,494 @@ +//===- unittests/Lex/DependencyDirectivesSourceMinimizer.cpp - -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/DependencyDirectivesSourceMinimizer.h" +#include "llvm/ADT/SmallString.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace clang; +using namespace clang::minimize_source_to_dependency_directives; + +namespace clang { + +bool minimizeSourceToDependencyDirectives(StringRef Input, + SmallVectorImpl &Out) { + SmallVector Tokens; + return minimizeSourceToDependencyDirectives(Input, Out, Tokens); +} + +} // end namespace clang + +namespace { + +TEST(MinimizeSourceToDependencyDirectivesTest, Empty) { + SmallVector Out; + SmallVector Tokens; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("", Out, Tokens)); + EXPECT_TRUE(Out.empty()); + ASSERT_EQ(1u, Tokens.size()); + ASSERT_EQ(pp_eof, Tokens.back().K); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Tokens)); + EXPECT_TRUE(Out.empty()); + ASSERT_EQ(1u, Tokens.size()); + ASSERT_EQ(pp_eof, Tokens.back().K); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, AllTokens) { + SmallVector Out; + SmallVector Tokens; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define A\n" + "#undef A\n" + "#endif\n" + "#if A\n" + "#ifdef A\n" + "#ifndef A\n" + "#elif A\n" + "#else\n" + "#include \n" + "#include_next \n" + "#__include_macros \n" + "#import \n" + "@import A;\n" + "#pragma clang module import A\n", + Out, Tokens)); + EXPECT_EQ(pp_define, Tokens[0].K); + EXPECT_EQ(pp_undef, Tokens[1].K); + EXPECT_EQ(pp_endif, Tokens[2].K); + EXPECT_EQ(pp_if, Tokens[3].K); + EXPECT_EQ(pp_ifdef, Tokens[4].K); + EXPECT_EQ(pp_ifndef, Tokens[5].K); + EXPECT_EQ(pp_elif, Tokens[6].K); + EXPECT_EQ(pp_else, Tokens[7].K); + EXPECT_EQ(pp_include, Tokens[8].K); + EXPECT_EQ(pp_include_next, Tokens[9].K); + EXPECT_EQ(pp___include_macros, Tokens[10].K); + EXPECT_EQ(pp_import, Tokens[11].K); + EXPECT_EQ(pp_at_import, 
Tokens[12].K); + EXPECT_EQ(pp_pragma_import, Tokens[13].K); + EXPECT_EQ(pp_eof, Tokens[14].K); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, Define) { + SmallVector Out; + SmallVector Tokens; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO", Out, Tokens)); + EXPECT_STREQ("#define MACRO\n", Out.data()); + ASSERT_EQ(2u, Tokens.size()); + ASSERT_EQ(pp_define, Tokens.front().K); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, DefineSpacing) { + SmallVector Out; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO\n\n\n", Out)); + EXPECT_STREQ("#define MACRO\n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO \n\n\n", Out)); + EXPECT_STREQ("#define MACRO\n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO a \n\n\n", Out)); + EXPECT_STREQ("#define MACRO a\n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO\n\n\n", Out)); + EXPECT_STREQ("#define MACRO\n", Out.data()); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, DefineMacroArguments) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO()", Out)); + EXPECT_STREQ("#define MACRO()\n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO(a, b...)", Out)); + EXPECT_STREQ("#define MACRO(a,b...)\n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO content", Out)); + EXPECT_STREQ("#define MACRO content\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#define MACRO con tent ", Out)); + EXPECT_STREQ("#define MACRO con tent\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#define MACRO() con tent ", Out)); + EXPECT_STREQ("#define MACRO() con tent\n", Out.data()); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, DefineInvalidMacroArguments) { + SmallVector Out; + + 
ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO((a))", Out)); + EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO(", Out)); + EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO(a * b)", Out)); + EXPECT_STREQ("#define MACRO(/* invalid */\n", Out.data()); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, DefineHorizontalWhitespace) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#define MACRO(\t)\tcon \t tent\t", Out)); + EXPECT_STREQ("#define MACRO() con \t tent\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#define MACRO(\f)\fcon \f tent\f", Out)); + EXPECT_STREQ("#define MACRO() con \f tent\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#define MACRO(\v)\vcon \v tent\v", Out)); + EXPECT_STREQ("#define MACRO() con \v tent\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#define MACRO \t\v\f\v\t con\f\t\vtent\v\f \v", Out)); + EXPECT_STREQ("#define MACRO con\f\t\vtent\n", Out.data()); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, DefineMultilineArgs) { + SmallVector Out; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO(a \\\n" + " )", + Out)); + EXPECT_STREQ("#define MACRO(a)\n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO(a, \\\n" + " b) \\\n" + " call((a), \\\n" + " (b))", + Out)); + EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data()); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, + DefineMultilineArgsCarriageReturn) { + SmallVector Out; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO(a, \\\r" + " b) \\\r" + " call((a), \\\r" + " (b))", + Out)); + EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data()); +} + +TEST(MinimizeSourceToDependencyDirectivesTest, + 
DefineMultilineArgsCarriageReturnNewline) { + SmallVector Out; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO(a, \\\r\n" + " b) \\\r\n" + " call((a), \\\r\n" + " (b))", + Out)); + EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data()); +} + /* DefineMultilineArgsNewlineCarriageReturn: same joining when the backslash is followed by newline plus carriage return. */ +TEST(MinimizeSourceToDependencyDirectivesTest, + DefineMultilineArgsNewlineCarriageReturn) { + SmallVector Out; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO(a, \\\n\r" + " b) \\\n\r" + " call((a), \\\n\r" + " (b))", + Out)); + EXPECT_STREQ("#define MACRO(a,b) call((a),(b))\n", Out.data()); +} + /* DefineNumber: a define whose name position holds a number is asserted to return true (presumably the error result - confirm against the minimizer interface). */ +TEST(MinimizeSourceToDependencyDirectivesTest, DefineNumber) { + SmallVector Out; + + ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define 0\n", Out)); +} + /* DefineNoName: a define followed by punctuation instead of a name is likewise asserted to return true. */ +TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoName) { + SmallVector Out; + + ASSERT_TRUE(minimizeSourceToDependencyDirectives("#define &\n", Out)); +} + /* DefineNoWhitespace: when the body follows the macro name directly, or after a line continuation, the expected output separates them with one space. */ +TEST(MinimizeSourceToDependencyDirectivesTest, DefineNoWhitespace) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND&\n", Out)); + EXPECT_STREQ("#define AND &\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define AND\\\n" + "&\n", + Out)); + EXPECT_STREQ("#define AND &\n", Out.data()); +} + /* MultilineComment: block comments are expected to be removed, including one that spans several lines and hides a define, which must not appear in the output. */ +TEST(MinimizeSourceToDependencyDirectivesTest, MultilineComment) { + SmallVector Out; + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#define MACRO a/*\n" + " /*\n" + "#define MISSING abc\n" + " /*\n" + " /* something */ \n" + "#include /* \"def\" */ \n", + Out)); + EXPECT_STREQ("#define MACRO a\n" + "#include \n", + Out.data()); +} + /* MultilineCommentInStrings: comment delimiters inside string literals are expected to pass through unchanged. */ +TEST(MinimizeSourceToDependencyDirectivesTest, MultilineCommentInStrings) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO1 \"/*\"\n" + "#define MACRO2 \"*/\"\n", + Out)); + EXPECT_STREQ("#define MACRO1 \"/*\"\n" + "#define MACRO2 \"*/\"\n", + Out.data()); +} + /* Ifdef: conditional blocks whose branches contain defines are expected to be reproduced unchanged. */ +TEST(MinimizeSourceToDependencyDirectivesTest, Ifdef) { + 
SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n" + "#define B\n" + "#endif\n", + Out)); + EXPECT_STREQ("#ifdef A\n" + "#define B\n" + "#endif\n", + Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n" + "#define B\n" + "#elif B\n" + "#define C\n" + "#elif C\n" + "#define D\n" + "#else\n" + "#define E\n" + "#endif\n", + Out)); + EXPECT_STREQ("#ifdef A\n" + "#define B\n" + "#elif B\n" + "#define C\n" + "#elif C\n" + "#define D\n" + "#else\n" + "#define E\n" + "#endif\n", + Out.data()); +} + /* EmptyIfdef: a conditional chain with no directives inside its branches is expected to minimize to an empty string. */ +TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIfdef) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n" + "#elif B\n" + "#elif C\n" + "#else D\n" + "#endif\n", + Out)); + EXPECT_STREQ("", Out.data()); +} + /* Pragma: only the full pragma clang module import is expected to be kept; other pragmas and truncated spellings produce empty output. */ +TEST(MinimizeSourceToDependencyDirectivesTest, Pragma) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#pragma A\n", Out)); + EXPECT_STREQ("", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#pragma clang\n", Out)); + EXPECT_STREQ("", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#pragma clang module\n", Out)); + EXPECT_STREQ("", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#pragma clang module impor\n", Out)); + EXPECT_STREQ("", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#pragma clang module import\n", Out)); + EXPECT_STREQ("#pragma clang module import\n", Out.data()); +} + /* Include: each include-style directive is expected to be copied to the output unchanged. NOTE(review): several cases show no target after the directive name; angle-bracket targets were presumably lost when this patch text was extracted - confirm against the original patch. */ +TEST(MinimizeSourceToDependencyDirectivesTest, Include) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#include \"A\"\n", Out)); + EXPECT_STREQ("#include \"A\"\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#include \n", Out)); + EXPECT_STREQ("#include \n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#include_next \n", Out)); + EXPECT_STREQ("#include_next \n", Out.data()); + + 
ASSERT_FALSE(minimizeSourceToDependencyDirectives("#import \n", Out)); + EXPECT_STREQ("#import \n", Out.data()); + + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("#__include_macros \n", Out)); + EXPECT_STREQ("#__include_macros \n", Out.data()); +} + /* AtImport: at-import declarations are expected to be normalized to a single line, with extra spaces, interleaved block comments and a newline before the semicolon removed. */ +TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import A;\n", Out)); + EXPECT_STREQ("@import A;\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives(" @ import A;\n", Out)); + EXPECT_STREQ("@import A;\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import A\n;", Out)); + EXPECT_STREQ("@import A;\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import A.B;\n", Out)); + EXPECT_STREQ("@import A.B;\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "@import /*x*/ A /*x*/ . /*x*/ B /*x*/ \n /*x*/ ; /*x*/", Out)); + EXPECT_STREQ("@import A.B;\n", Out.data()); +} + /* AtImportFailures: an at-import without a semicolon, with a macro call as module name, or with a string as module name is asserted to return true. */ +TEST(MinimizeSourceToDependencyDirectivesTest, AtImportFailures) { + SmallVector Out; + + ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import A\n", Out)); + ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import MACRO(A);\n", Out)); + ASSERT_TRUE(minimizeSourceToDependencyDirectives("@import \" \";\n", Out)); +} + /* RawStringLiteral: raw string literals between directives, including ones full of punctuation, are expected to be skipped so that only the guard directives remain. */ +TEST(MinimizeSourceToDependencyDirectivesTest, RawStringLiteral) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifndef GUARD\n" + "#define GUARD\n" + "R\"()\"\n" + "#endif\n", + Out)); + EXPECT_STREQ("#ifndef GUARD\n" + "#define GUARD\n" + "#endif\n", + Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#ifndef GUARD\n" + "#define GUARD\n" + R"raw(static constexpr char bytes[] = R"(-?:\,[]{}#&*!|>'"%@`)";)raw" + "\n" + "#endif\n", + Out)); + EXPECT_STREQ("#ifndef GUARD\n" + "#define GUARD\n" + "#endif\n", + Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives( + "#ifndef GUARD\n" + "#define GUARD\n" + 
R"raw(static constexpr char bytes[] = R"abc(-?:\,[]{}#&*!|>'"%@`)abc";)raw" + "\n" + "#endif\n", + Out)); + EXPECT_STREQ("#ifndef GUARD\n" + "#define GUARD\n" + "#endif\n", + Out.data()); +} + /* SplitIdentifier: a line continuation in the middle of a directive name or macro name is expected to be spliced into one identifier; when the continued line starts with a space the two parts stay separate. */ +TEST(MinimizeSourceToDependencyDirectivesTest, SplitIdentifier) { + SmallVector Out; + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#if\\\n" + "ndef GUARD\n" + "#define GUARD\n" + "#endif\n", + Out)); + EXPECT_STREQ("#ifndef GUARD\n" + "#define GUARD\n" + "#endif\n", + Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n" + "RD\n", + Out)); + EXPECT_STREQ("#define GUARD\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\r" + "RD\n", + Out)); + EXPECT_STREQ("#define GUARD\n", Out.data()); + + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define GUA\\\n" + " RD\n", + Out)); + EXPECT_STREQ("#define GUA RD\n", Out.data()); +} + /* PoundWarningAndError: the text of warning and error directives may contain unterminated quotes or comment openers; the first loop expects the following include to survive on its own, the second loop expects empty output. */ +TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) { + SmallVector Out; + + for (auto Source : { + "#warning '\n#include \n", + "#warning \"\n#include \n", + "#warning /*\n#include \n", + "#warning \\\n#include \n#include \n", + "#error '\n#include \n", + "#error \"\n#include \n", + "#error /*\n#include \n", + "#error \\\n#include \n#include \n", + }) { + ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out)); + EXPECT_STREQ("#include \n", Out.data()); + } + + for (auto Source : { + "#warning \\\n#include \n", + "#error \\\n#include \n", + "#if MACRO\n#warning '\n#endif\n", + "#if MACRO\n#warning \"\n#endif\n", + "#if MACRO\n#warning /*\n#endif\n", + "#if MACRO\n#error '\n#endif\n", + "#if MACRO\n#error \"\n#endif\n", + "#if MACRO\n#error /*\n#endif\n", + }) { + ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out)); + EXPECT_STREQ("", Out.data()); + } +} + +} // end anonymous namespace