diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -16,6 +16,7 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/FunctionExtras.h"
 #include "clang/Lex/DependencyDirectivesScanner.h"
 #include "clang/Lex/PreprocessorLexer.h"
 #include "clang/Lex/Token.h"
@@ -71,6 +72,11 @@
       : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
 };
 
+/// TryGrowLexerBuffer - Callback that gets called by Lexer when it reaches eof
+/// to try getting newly grown buffer and continue the lexing. If this callback
+/// returns nullopt, the Lexer will stop lexing and process eof.
+using TryGrowLexerBuffer = llvm::unique_function<std::optional<llvm::MemoryBufferRef>()>;
+
 /// Lexer - This provides a simple interface that turns a text buffer into a
 /// stream of tokens. This provides no support for file reading or buffering,
 /// or buffering/seeking of tokens, only forward lexing is supported. It relies
@@ -157,16 +163,20 @@
   /// next token to use from the current dependency directive.
   unsigned NextDepDirectiveTokenIndex = 0;
 
+  // TryGrowBuffer - The TryGrowLexerBuffer callback to grow the buffer if possible.
+  TryGrowLexerBuffer TryGrowBuffer;
+
   void InitLexer(const char *BufStart, unsigned BufferOffset,
                  unsigned BufferSize);
 
 public:
+
   /// Lexer constructor - Create a new lexer object for the specified buffer
   /// with the specified preprocessor managing the lexing process. This lexer
   /// assumes that the associated file buffer and Preprocessor objects will
   /// outlive it, so it doesn't take ownership of either of them.
   Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
-        bool IsFirstIncludeOfFile = true);
+        bool IsFirstIncludeOfFile = true, TryGrowLexerBuffer TryGrowBuffer = nullptr);
 
   /// Lexer constructor - Create a new raw lexer object. This object is only
   /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
@@ -611,9 +621,9 @@
   bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, unsigned CurOffset);
 
   /// FormTokenWithChars - When we lex a token, we have identified a span
-  /// starting at BufferPtr, going to TokEnd that forms the token. This method
+  /// starting at BufferOffset, going to TokEnd that forms the token. This method
   /// takes that range and assigns it to the token as its location and size. In
-  /// addition, since tokens cannot overlap, this also updates BufferPtr to be
+  /// addition, since tokens cannot overlap, this also updates BufferOffset to be
   /// TokEnd.
   void FormTokenWithChars(Token &Result, unsigned TokEnd, tok::TokenKind Kind) {
     unsigned TokLen = TokEnd - BufferOffset;
@@ -623,6 +633,13 @@
     BufferOffset = TokEnd;
   }
 
+  /// SetTokLiteralData - If the buffer can be grown, it's unsafe to set the original
+  /// string pointer from the buffer to literal data of a token. When the buffer
+  /// has a possibility of growing, this method will copy the string into scratch
+  /// buffer and set it to literal data of the token. Otherwise, it will just set
+  /// the passed string pointer to literal data of the token as it is.
+  void SetTokLiteralData(Token& Tok, const char* Str);
+
   /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
   /// tok::l_paren token, 0 if it is something else and 2 if there are no more
   /// tokens in the buffer controlled by this lexer.
@@ -804,8 +821,12 @@
   /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
   /// character was lexed, \c false otherwise.
   bool tryConsumeIdentifierUTF8Char(unsigned &CurOffset);
-};
+  /// Try to grow the buffer if possible by calling TryGrowLexerBuffer callback.
+  /// \return \c true if it has grown the buffer so that Lexer should continue on
+  /// lexing, \c false otherwise.
+  bool tryGrowLexerBuffer();
+};
 } // namespace clang
 
 #endif // LLVM_CLANG_LEX_LEXER_H
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -44,6 +44,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/Registry.h"
 #include <cstddef>
 #include <cstdint>
@@ -78,6 +79,7 @@
 class PreprocessorLexer;
 class PreprocessorOptions;
 class ScratchBuffer;
+class SourceFileGrower;
 class TargetInfo;
 
 namespace Builtin {
@@ -282,6 +284,8 @@
   /// Empty line handler.
   EmptylineHandler *Emptyline = nullptr;
 
+  /// File grower.
+  SourceFileGrower *FileGrower = nullptr;
 public:
   /// The kind of translation unit we are processing.
   const TranslationUnitKind TUKind;
@@ -1783,6 +1787,11 @@
     const_cast<LangOptions &>(getLangOpts()).IncrementalExtensions = value;
   }
 
+  void setSourceFileGrower(SourceFileGrower *Val) {
+    FileGrower = Val;
+  }
+
+  SourceFileGrower *getSourceFileGrower() const { return FileGrower; }
   /// Specify the point at which code-completion will be performed.
   ///
   /// \param File the file in which code completion should occur. If
@@ -2265,6 +2274,8 @@
   void EnterSubmodule(Module *M, SourceLocation ImportLoc, bool ForPragma);
   Module *LeaveSubmodule(bool ForPragma);
 
+  /// Try growing file by using SourceFileGrower.
+  std::optional<llvm::MemoryBufferRef> TryGrowFile();
 private:
   friend void TokenLexer::ExpandFunctionArguments();
 
@@ -2711,6 +2722,16 @@
   virtual void HandleEmptyline(SourceRange Range) = 0;
 };
 
+/// Abstract base class that will receive the ID of source
+/// file that reached eof by Lexer and grow that file if possible.
+class SourceFileGrower {
+public:
+  virtual ~SourceFileGrower();
+
+  // This method should return true if it has grown the specified file.
+  virtual bool TryGrowFile(FileID FID) = 0;
+};
+
 /// Registry of pragma handlers added by plugins
 using PragmaHandlerRegistry = llvm::Registry<PragmaHandler>;
 
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -133,12 +133,14 @@
 /// assumes that the associated file buffer and Preprocessor objects will
 /// outlive it, so it doesn't take ownership of either of them.
 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
-             Preprocessor &PP, bool IsFirstIncludeOfFile)
+             Preprocessor &PP, bool IsFirstIncludeOfFile, TryGrowLexerBuffer TryGrowBuffer)
     : PreprocessorLexer(&PP, FID),
       FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
       LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
-      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
-  InitLexer(InputFile.getBufferStart(), 0, InputFile.getBufferSize());
+      IsFirstTimeLexingFile(IsFirstIncludeOfFile),
+      TryGrowBuffer(std::move(TryGrowBuffer)) {
+  InitLexer(InputFile.getBufferStart(), 0,
+            InputFile.getBufferSize());
 
   resetExtendedTokenMode();
 }
@@ -444,6 +446,19 @@
   return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char *>(Buffer));
 }
 
+bool Lexer::tryGrowLexerBuffer() {
+  if (!TryGrowBuffer)
+    return false;
+
+  auto NewBuffer = TryGrowBuffer();
+  if (!NewBuffer)
+    return false;
+
+  BufferStart = NewBuffer->getBufferStart();
+  BufferSize = NewBuffer->getBufferSize();
+  return true;
+}
+
 /// MeasureTokenLength - Relex the token at the specified location and return
 /// its length in bytes in the input file. If the token needs cleaning (e.g.
 /// includes a trigraph or an escaped newline) then this count includes bytes
@@ -1363,6 +1378,9 @@
   Size += EscapedNewLineSize;
   Offset += EscapedNewLineSize;
 
+  if (BufferStart[Offset] == 0 && Offset == BufferSize)
+    tryGrowLexerBuffer();
+
   // Use slow version to accumulate a correct size field.
return getCharAndSizeSlow(Offset, Size, Tok); } @@ -1861,7 +1882,7 @@ const char *IdStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, tok::raw_identifier); - Result.setRawIdentifierData(IdStart); + SetTokLiteralData(Result, IdStart); // If we are in raw mode, return this identifier raw. There is no need to // look up identifier information or attempt to macro expand it. @@ -1982,7 +2003,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, tok::numeric_constant); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2138,7 +2159,7 @@ // Update the location of the token as well as the BufferPtr instance var. const char *TokStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, Kind); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2221,7 +2242,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = &BufferStart[BufferOffset]; FormTokenWithChars(Result, CurOffset, Kind); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2266,7 +2287,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = &BufferStart[BufferOffset]; FormTokenWithChars(Result, CurOffset, tok::header_name); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2330,8 +2351,8 @@ if (C == '\\') C = getAndAdvanceChar(CurOffset, Result); - if (C == '\n' || C == '\r' || // Newline. - (C == 0 && CurOffset - 1 == BufferSize)) { // End of file. + if (C == '\n' || C == '\r' || // Newline. + (C == 0 && CurOffset-1 == BufferSize && !tryGrowLexerBuffer())) { // End of file. 
if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) Diag(BufferOffset, diag::ext_unterminated_char_or_string) << 0; FormTokenWithChars(Result, CurOffset - 1, tok::unknown); @@ -2362,7 +2383,7 @@ // Update the location of token as well as BufferPtr. const char *TokStart = BufferStart + BufferOffset; FormTokenWithChars(Result, CurOffset, Kind); - Result.setLiteralData(TokStart); + SetTokLiteralData(Result, TokStart); return true; } @@ -2392,6 +2413,11 @@ while (isHorizontalWhitespace(Char)) Char = BufferStart[++CurOffset]; + if (Char == 0 && CurOffset == BufferSize+1 && tryGrowLexerBuffer()) { + --CurOffset; + continue; + } + // Otherwise if we have something other than whitespace, we're done. if (!isVerticalWhitespace(Char)) break; @@ -2478,10 +2504,14 @@ while (true) { C = BufferStart[CurOffset]; // Skip over characters in the fast loop. - while (isASCII(C) && C != 0 && // Potentially EOF. - C != '\n' && C != '\r') { // Newline or DOS-style newline. - C = BufferStart[++CurOffset]; - UnicodeDecodingAlreadyDiagnosed = false; + while (true) { + while (isASCII(C) && C != 0 && // Potentially EOF. + C != '\n' && C != '\r') { // Newline or DOS-style newline. 
+ C = BufferStart[++CurOffset]; + UnicodeDecodingAlreadyDiagnosed = false; + } + if (C != 0 || CurOffset != BufferSize + 1 || !tryGrowLexerBuffer()) break; + C = BufferStart[CurOffset]; } if (!isASCII(C)) { @@ -2568,7 +2598,15 @@ } } - if (C == '\r' || C == '\n' || CurOffset == BufferSize + 1) { + if (CurOffset == BufferSize + 1) { + if (!tryGrowLexerBuffer()) { + --CurOffset; + break; + } + continue; + } + + if (C == '\r' || C == '\n') { --CurOffset; break; } @@ -2750,7 +2788,7 @@ unsigned CharSize; unsigned char C = getCharAndSize(CurOffset, CharSize); CurOffset += CharSize; - if (C == 0 && CurOffset == BufferSize + 1) { + if (C == 0 && CurOffset == BufferSize+1 && !tryGrowLexerBuffer()) { if (!isLexingRawMode()) Diag(BufferOffset, diag::err_unterminated_block_comment); --CurOffset; @@ -2778,6 +2816,9 @@ bool UnicodeDecodingAlreadyDiagnosed = false; while (true) { + if (CurOffset + 24 >= BufferSize) { + tryGrowLexerBuffer(); + } // Skip over all non-interesting characters until we find end of buffer or a // (probably ending) '/' character. 
if (CurOffset + 24 < BufferSize && @@ -2820,14 +2861,14 @@ '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/' }; - while (CurPtr + 16 < BufferEnd) { + while (CurOffset + 16 < BufferSize) { if (LLVM_UNLIKELY( - vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) + vec_any_ge(*(const __vector unsigned char *)(BufferStart + CurOffset), LongUTF))) goto MultiByteUTF8; - if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { + if (vec_any_eq(*(const __vector unsigned char *)(BufferStart + CurOffset), Slashes)) { break; } - CurPtr += 16; + CurOffset += 16; } #else @@ -2900,7 +2941,10 @@ if (!isLexingRawMode()) Diag(CurOffset - 1, diag::warn_nested_block_comment); } - } else if (C == 0 && CurOffset == BufferSize + 1) { + } else if (C == 0 && CurOffset == BufferSize+1 && tryGrowLexerBuffer()) { + --CurOffset; + continue; + } else if (C == 0 && CurOffset == BufferSize+1) { if (!isLexingRawMode()) Diag(BufferOffset, diag::err_unterminated_block_comment); // Note: the user probably forgot a */. We could continue immediately @@ -2978,8 +3022,8 @@ break; case 0: // Null. // Found end of file? 
- if (CurOffset - 1 != BufferSize) { - if (isCodeCompletionPoint(CurOffset - 1)) { + if (CurOffset-1 != BufferSize && !tryGrowLexerBuffer()) { + if (isCodeCompletionPoint(CurOffset-1)) { PP->CodeCompleteNaturalLanguage(); cutOffLexing(); return; @@ -3264,8 +3308,9 @@ "Not a placeholder!"); if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) return false; - const char *End = - findPlaceholderEnd(BufferStart + CurOffset + 1, BufferStart + BufferSize); + if (CurOffset + 1 == BufferSize) + tryGrowLexerBuffer(); + const char *End = findPlaceholderEnd(BufferStart + CurOffset + 1, BufferStart + BufferSize); if (!End) return false; const char *Start = BufferStart + CurOffset - 1; @@ -3273,7 +3318,7 @@ Diag(CurOffset - 1, diag::err_placeholder_in_source); Result.startToken(); FormTokenWithChars(Result, End - BufferStart, tok::raw_identifier); - Result.setRawIdentifierData(Start); + SetTokLiteralData(Result, Start); PP->LookUpIdentifierInfo(Result); Result.setFlag(Token::IsEditorPlaceholder); BufferOffset = End - BufferStart; @@ -3628,8 +3673,10 @@ // Small amounts of horizontal whitespace is very common between tokens. if (isHorizontalWhitespace(BufferStart[CurOffset])) { do { - ++CurOffset; - } while (isHorizontalWhitespace(BufferStart[CurOffset])); + do { + ++CurOffset; + } while (isHorizontalWhitespace(BufferStart[CurOffset])); + } while (BufferStart[CurOffset] == 0 && CurOffset == BufferSize && tryGrowLexerBuffer()); // If we are keeping whitespace and other tokens, just return what we just // skipped. The next lexer invocation will return the token after the @@ -3656,8 +3703,12 @@ switch (Char) { case 0: // Null. // Found end of file? - if (CurOffset - 1 == BufferSize) - return LexEndOfFile(Result, CurOffset - 1); + if (CurOffset-1 == BufferSize) { + if (tryGrowLexerBuffer()) { + goto LexNextToken; + } + return LexEndOfFile(Result, CurOffset-1); + } // Check if we are performing code completion. 
     if (isCodeCompletionPoint(CurOffset - 1)) {
@@ -4487,7 +4538,7 @@
     return true;
   }
   if (Result.isLiteral()) {
-    Result.setLiteralData(TokPtr);
+    SetTokLiteralData(Result, TokPtr);
     return true;
   }
   if (Result.is(tok::colon) &&
@@ -4571,3 +4622,18 @@
   convertDependencyDirectiveToken(DDTok, Result);
   return false;
 }
+
+void Lexer::SetTokLiteralData(Token& Tok, const char* Str) {
+  if (TryGrowBuffer) {
+    assert(PP);
+    SourceLocation Loc = Tok.getLocation();
+    PP->CreateString(StringRef(Str, Tok.getLength()), Tok);
+    Tok.setLocation(Loc);
+    return;
+  }
+
+  if (Tok.is(tok::raw_identifier))
+    Tok.setRawIdentifierData(Str);
+  else
+    Tok.setLiteralData(Str);
+}
diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp
--- a/clang/lib/Lex/PPLexerChange.cpp
+++ b/clang/lib/Lex/PPLexerChange.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/LexDiagnostic.h"
+#include "clang/Lex/Lexer.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
@@ -92,7 +93,12 @@
         CodeCompletionFileLoc.getLocWithOffset(CodeCompletionOffset);
   }
 
-  Lexer *TheLexer = new Lexer(FID, *InputFile, *this, IsFirstIncludeOfFile);
+  TryGrowLexerBuffer TryGrowBuffer;
+  if (FileGrower)
+    TryGrowBuffer = [this]() {
+      return this->TryGrowFile();
+    };
+  Lexer *TheLexer = new Lexer(FID, *InputFile, *this, IsFirstIncludeOfFile, std::move(TryGrowBuffer));
   if (getPreprocessorOpts().DependencyDirectivesForFile &&
       FID != PredefinesFileID) {
     if (OptionalFileEntryRef File = SourceMgr.getFileEntryRefForID(FID)) {
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -181,6 +181,15 @@
   delete &HeaderInfo;
 }
 
+std::optional<llvm::MemoryBufferRef> Preprocessor::TryGrowFile() {
+  assert(FileGrower && "FileGrower must be set when TryGrowFile is called");
+  if (!FileGrower->TryGrowFile(CurLexer->getFileID()))
+    return std::nullopt;
+  const FileEntry *Entry = SourceMgr.getFileEntryForID(CurLexer->getFileID());
+  assert(Entry && "TryGrowFile must be only called inside Lexer on File Entry");
+  return SourceMgr.getMemoryBufferForFileOrNone(Entry);
+}
+
 void Preprocessor::Initialize(const TargetInfo &Target,
                               const TargetInfo *AuxTarget) {
   assert((!this->Target || this->Target == &Target) &&
@@ -1470,6 +1479,8 @@
 
 CodeCompletionHandler::~CodeCompletionHandler() = default;
 
+SourceFileGrower::~SourceFileGrower() = default;
+
 void Preprocessor::createPreprocessingRecord() {
   if (Record)
     return;
diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -26,9 +26,12 @@
 #include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
+#include <deque>
 #include <memory>
+#include <string>
 #include <vector>
 
 namespace {
@@ -49,12 +52,7 @@
     Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts);
   }
 
-  std::unique_ptr<Preprocessor> CreatePP(StringRef Source,
-                                         TrivialModuleLoader &ModLoader) {
-    std::unique_ptr<llvm::MemoryBuffer> Buf =
-        llvm::MemoryBuffer::getMemBuffer(Source);
-    SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
-
+  std::unique_ptr<Preprocessor> CreatePP(TrivialModuleLoader &ModLoader) {
     HeaderSearch HeaderInfo(std::make_shared<HeaderSearchOptions>(), SourceMgr,
                             Diags, LangOpts, Target.get());
     std::unique_ptr<Preprocessor> PP = std::make_unique<Preprocessor>(
@@ -63,6 +61,14 @@
         /*IILookup =*/nullptr,
         /*OwnsHeaderSearch =*/false);
     PP->Initialize(*Target);
+    return PP;
+  }
+
+  std::unique_ptr<Preprocessor> CreatePP(StringRef Source, TrivialModuleLoader &ModLoader) {
+    std::unique_ptr<llvm::MemoryBuffer> Buf =
+        llvm::MemoryBuffer::getMemBuffer(Source);
+    SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
+    std::unique_ptr<Preprocessor> PP = CreatePP(ModLoader);
     PP->EnterMainSourceFile();
     return PP;
   }
@@ -660,4 +666,59 @@
   }
   EXPECT_TRUE(ToksView.empty());
 }
+
+TEST_F(LexerTest, BasicSourceFileGrower) {
+  std::deque<std::string> SourceLines = {
+    "int main() {",
+    "  return 0;",
+    "}"
+  };
+
+  TrivialModuleLoader ModLoader;
+  PP = CreatePP(ModLoader);
+  auto &SM = PP->getSourceManager();
+
+  struct FileGrower : public SourceFileGrower {
+    FileGrower(SourceManager &SM, std::deque<std::string> SourceLines) : SM(SM), SourceLines(SourceLines) {
+      FE = SM.getFileManager().getVirtualFile("main.cpp", 1024, 0);
+      CurFileID = SM.createFileID(FE, SourceLocation(), SrcMgr::C_User);
+      SM.overrideFileContents(FE, llvm::MemoryBufferRef("", ""));
+    }
+    ~FileGrower() = default;
+
+    bool TryGrowFile(FileID FID) override {
+      if (FID != CurFileID)
+        return false;
+      if (SourceLines.empty())
+        return false;
+      CurStr += SourceLines.front();
+      CurStr.push_back('\n');
+      SourceLines.pop_front();
+      CurBuf = llvm::MemoryBuffer::getMemBuffer(CurStr);
+      SM.overrideFileContents(FE, CurBuf->getMemBufferRef());
+      return true;
+    }
+    SourceManager &SM;
+    std::deque<std::string> SourceLines;
+    std::string CurStr;
+    std::unique_ptr<llvm::MemoryBuffer> CurBuf;
+    FileID CurFileID;
+    const FileEntry *FE;
+  } FG(SM, SourceLines);
+
+  PP->setSourceFileGrower(&FG);
+  PP->EnterSourceFile(FG.CurFileID, nullptr, SourceLocation());
+
+  std::vector<std::string> Toks;
+  while (1) {
+    Token tok;
+    PP->Lex(tok);
+    if (tok.is(tok::eof))
+      break;
+    Toks.push_back(getSourceText(tok, tok));
+  }
+
+  EXPECT_THAT(Toks, ElementsAre("int", "main", "(", ")", "{",
+                                "return", "0", ";", "}"));
+}
 } // anonymous namespace