diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -16,6 +16,7 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/DependencyDirectivesScanner.h"
 #include "clang/Lex/PreprocessorLexer.h"
 #include "clang/Lex/Token.h"
+#include "llvm/ADT/FunctionExtras.h"
@@ -146,6 +147,8 @@
   // NewLineOffset - A offset to new line character '\n' being lexed. For
   // '\r\n', it also points to '\n.'
   Optional<unsigned> NewLineOffset;
+  using GrowBufferCallback = llvm::unique_function<Optional<llvm::MemoryBufferRef>()>;
+  GrowBufferCallback GrowBuffer;
 
   // CurrentConflictMarkerState - The kind of conflict marker we are handling.
   ConflictMarkerKind CurrentConflictMarkerState;
@@ -166,7 +169,7 @@
   /// assumes that the associated file buffer and Preprocessor objects will
   /// outlive it, so it doesn't take ownership of either of them.
   Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
-        bool IsFirstIncludeOfFile = true);
+        bool IsFirstIncludeOfFile = true, GrowBufferCallback GrowBuffer = nullptr);
 
   /// Lexer constructor - Create a new raw lexer object. This object is only
   /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
@@ -185,6 +188,8 @@
   Lexer(const Lexer &) = delete;
   Lexer &operator=(const Lexer &) = delete;
 
+  bool TryExpandBuffer();
+
   /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
   /// _Pragma expansion. This has a variety of magic semantics that this method
   /// sets up. It returns a new'd Lexer that must be delete'd when done.
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -133,12 +133,14 @@
 /// assumes that the associated file buffer and Preprocessor objects will
 /// outlive it, so it doesn't take ownership of either of them.
 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
-             Preprocessor &PP, bool IsFirstIncludeOfFile)
+             Preprocessor &PP, bool IsFirstIncludeOfFile, GrowBufferCallback GrowBuffer)
     : PreprocessorLexer(&PP, FID),
       FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
       LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
-      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
-  InitLexer(InputFile.getBufferStart(), 0, InputFile.getBufferSize());
+      IsFirstTimeLexingFile(IsFirstIncludeOfFile),
+      GrowBuffer(std::move(GrowBuffer)) {
+  InitLexer(InputFile.getBufferStart(), 0,
+            InputFile.getBufferSize());
   resetExtendedTokenMode();
 }
 
@@ -444,6 +446,19 @@
   return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char *>(Buffer));
 }
 
+bool Lexer::TryExpandBuffer() {
+  if (!GrowBuffer)
+    return false;
+
+  auto NewBuffer = GrowBuffer();
+  if (!NewBuffer)
+    return false;
+
+  BufferStart = NewBuffer->getBufferStart();
+  BufferSize = NewBuffer->getBufferSize();
+  return true;
+}
+
 /// MeasureTokenLength - Relex the token at the specified location and return
 /// its length in bytes in the input file. If the token needs cleaning (e.g.
 /// includes a trigraph or an escaped newline) then this count includes bytes
@@ -1363,6 +1378,9 @@
     Size += EscapedNewLineSize;
     Offset += EscapedNewLineSize;
 
+    if (BufferStart[Offset] == 0 && Offset == BufferSize)
+      TryExpandBuffer();
+
     // Use slow version to accumulate a correct size field.
     return getCharAndSizeSlow(Offset, Size, Tok);
   }
@@ -2330,8 +2351,8 @@
     if (C == '\\')
       C = getAndAdvanceChar(CurOffset, Result);
 
-    if (C == '\n' || C == '\r' ||                 // Newline.
-        (C == 0 && CurOffset - 1 == BufferSize)) {  // End of file.
+    if (C == '\n' || C == '\r' ||                 // Newline.
+        (C == 0 && CurOffset-1 == BufferSize && !TryExpandBuffer())) { // End of file.
       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
         Diag(BufferOffset, diag::ext_unterminated_char_or_string) << 0;
       FormTokenWithChars(Result, CurOffset - 1, tok::unknown);
@@ -2392,6 +2413,11 @@
     while (isHorizontalWhitespace(Char))
       Char = BufferStart[++CurOffset];
 
+    if (Char == 0 && CurOffset == BufferSize+1 && TryExpandBuffer()) {
+      --CurOffset;
+      continue;
+    }
+
     // Otherwise if we have something other than whitespace, we're done.
     if (!isVerticalWhitespace(Char))
       break;
@@ -2478,10 +2504,14 @@
   while (true) {
     C = BufferStart[CurOffset];
     // Skip over characters in the fast loop.
-    while (isASCII(C) && C != 0 &&   // Potentially EOF.
-           C != '\n' && C != '\r') { // Newline or DOS-style newline.
-      C = BufferStart[++CurOffset];
-      UnicodeDecodingAlreadyDiagnosed = false;
+    while (true) {
+      while (isASCII(C) && C != 0 &&   // Potentially EOF.
+             C != '\n' && C != '\r') { // Newline or DOS-style newline.
+        C = BufferStart[++CurOffset];
+        UnicodeDecodingAlreadyDiagnosed = false;
+      }
+      if (C != 0 || CurOffset != BufferSize + 1 || !TryExpandBuffer()) break;
+      C = BufferStart[CurOffset];
     }
 
     if (!isASCII(C)) {
@@ -2568,7 +2598,15 @@
       }
     }
 
-    if (C == '\r' || C == '\n' || CurOffset == BufferSize + 1) {
+    if (CurOffset == BufferSize + 1) {
+      if (!TryExpandBuffer()) {
+        --CurOffset;
+        break;
+      }
+      continue;
+    }
+
+    if (C == '\r' || C == '\n') {
       --CurOffset;
       break;
     }
@@ -2750,7 +2788,7 @@
     unsigned CharSize;
     unsigned char C = getCharAndSize(CurOffset, CharSize);
     CurOffset += CharSize;
-    if (C == 0 && CurOffset == BufferSize + 1) {
+    if (C == 0 && CurOffset == BufferSize+1 && !TryExpandBuffer()) {
       if (!isLexingRawMode())
         Diag(BufferOffset, diag::err_unterminated_block_comment);
       --CurOffset;
@@ -2778,6 +2816,9 @@
   bool UnicodeDecodingAlreadyDiagnosed = false;
 
   while (true) {
+    if (CurOffset + 24 >= BufferSize) {
+      TryExpandBuffer();
+    }
     // Skip over all non-interesting characters until we find end of buffer or a
     // (probably ending) '/' character.
     if (CurOffset + 24 < BufferSize &&
@@ -2900,7 +2941,10 @@
       if (!isLexingRawMode())
         Diag(CurOffset - 1, diag::warn_nested_block_comment);
     }
-  } else if (C == 0 && CurOffset == BufferSize + 1) {
+  } else if (C == 0 && CurOffset == BufferSize+1 && TryExpandBuffer()) {
+    --CurOffset;
+    continue;
+  } else if (C == 0 && CurOffset == BufferSize+1) {
     if (!isLexingRawMode())
       Diag(BufferOffset, diag::err_unterminated_block_comment);
     // Note: the user probably forgot a */. We could continue immediately
@@ -2978,8 +3022,8 @@
       break;
     case 0: // Null.
       // Found end of file?
-      if (CurOffset - 1 != BufferSize) {
-        if (isCodeCompletionPoint(CurOffset - 1)) {
+      if (CurOffset-1 != BufferSize && !TryExpandBuffer()) {
+        if (isCodeCompletionPoint(CurOffset-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
@@ -3264,8 +3308,9 @@
          "Not a placeholder!");
   if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
     return false;
-  const char *End =
-      findPlaceholderEnd(BufferStart + CurOffset + 1, BufferStart + BufferSize);
+  if (CurOffset + 1 == BufferSize)
+    TryExpandBuffer();
+  const char *End = findPlaceholderEnd(BufferStart + CurOffset + 1, BufferStart + BufferSize);
   if (!End)
     return false;
   const char *Start = BufferStart + CurOffset - 1;
@@ -3622,8 +3667,10 @@
   // Small amounts of horizontal whitespace is very common between tokens.
   if (isHorizontalWhitespace(BufferStart[CurOffset])) {
     do {
-      ++CurOffset;
-    } while (isHorizontalWhitespace(BufferStart[CurOffset]));
+      do {
+        ++CurOffset;
+      } while (isHorizontalWhitespace(BufferStart[CurOffset]));
+    } while (BufferStart[CurOffset] == 0 && CurOffset == BufferSize && TryExpandBuffer());
 
     // If we are keeping whitespace and other tokens, just return what we just
     // skipped. The next lexer invocation will return the token after the
@@ -3650,8 +3697,12 @@
   switch (Char) {
   case 0: // Null.
     // Found end of file?
-    if (CurOffset - 1 == BufferSize)
-      return LexEndOfFile(Result, CurOffset - 1);
+    if (CurOffset-1 == BufferSize) {
+      if (TryExpandBuffer()) {
+        goto LexNextToken;
+      }
+      return LexEndOfFile(Result, CurOffset-1);
+    }
 
     // Check if we are performing code completion.
     if (isCodeCompletionPoint(CurOffset - 1)) {