diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -214,6 +214,8 @@ (`#64987 `_) - Support MSVC predefined macro expressions in constant expressions and in local structs. +- Correctly parse non-ASCII identifiers that appear immediately after a line splice + (`#65156 `_) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h --- a/clang/include/clang/Lex/Lexer.h +++ b/clang/include/clang/Lex/Lexer.h @@ -805,9 +805,10 @@ /// Try to consume an identifier character encoded in UTF-8. /// \param CurPtr Points to the start of the (potential) UTF-8 code unit /// sequence. On success, updated to point past the end of it. + /// \param Result The token being formed. /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier /// character was lexed, \c false otherwise. - bool tryConsumeIdentifierUTF8Char(const char *&CurPtr); + bool tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result); }; } // namespace clang diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1750,15 +1750,21 @@ return true; } -bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { - const char *UnicodePtr = CurPtr; +bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) { llvm::UTF32 CodePoint; - llvm::ConversionResult Result = - llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, - (const llvm::UTF8 *)BufferEnd, - &CodePoint, - llvm::strictConversion); - if (Result != llvm::conversionOK) + + // If a UTF-8 codepoint appears immediately after an escaped new line, + // CurPtr may point to the splicing \ on the preceding line, + // so we need to skip it. 
+ unsigned FirstCodeUnitSize; + getCharAndSize(CurPtr, FirstCodeUnitSize); + const char *CharStart = CurPtr + FirstCodeUnitSize - 1; + const char *UnicodePtr = CharStart; + + llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence( + (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd, + &CodePoint, llvm::strictConversion); + if (ConvResult != llvm::conversionOK) return false; bool IsExtension = false; @@ -1771,21 +1777,26 @@ !PP->isPreprocessedOutput()) diagnoseInvalidUnicodeCodepointInIdentifier( PP->getDiagnostics(), LangOpts, CodePoint, - makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); + makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false); // We got a unicode codepoint that is neither a space nor a // a valid identifier part. Carry on as if the codepoint was // valid for recovery purposes. } else if (!isLexingRawMode()) { if (IsExtension) - diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, - makeCharRange(*this, CurPtr, UnicodePtr)); + diagnoseExtensionInIdentifier( + PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CharStart, UnicodePtr)); maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, - makeCharRange(*this, CurPtr, UnicodePtr), + makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false); maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, - makeCharRange(*this, CurPtr, UnicodePtr)); + makeCharRange(*this, CharStart, UnicodePtr)); } + // Once we successfully parsed some UTF-8, + // calling ConsumeChar ensures the NeedsCleaning flag is set on the token + // being lexed, and that warnings about trailing spaces are emitted. 
+ ConsumeChar(CurPtr, FirstCodeUnitSize, Result); CurPtr = UnicodePtr; return true; } @@ -1865,7 +1876,7 @@ } if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) continue; - if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) + if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) continue; // Neither an expected Unicode codepoint nor a UCN. break; @@ -1985,7 +1996,7 @@ // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) return LexNumericConstant(Result, CurPtr); - if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) + if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) return LexNumericConstant(Result, CurPtr); // Update the location of token as well as BufferPtr. @@ -2009,7 +2020,7 @@ if (!isAsciiIdentifierStart(C)) { if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) Consumed = true; - else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) + else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) Consumed = true; else return CurPtr; @@ -2079,7 +2090,7 @@ if (isAsciiIdentifierContinue(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { - } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { + } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) { } else break; } diff --git a/clang/test/Lexer/escape_newline_unicode.c b/clang/test/Lexer/escape_newline_unicode.c new file mode 100644 --- /dev/null +++ b/clang/test/Lexer/escape_newline_unicode.c @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -verify=expected,c -x c -Wunused %s +// RUN: %clang_cc1 -verify=expected,cpp -x c++ -Wunused %s + +void gh65156(void) { + +int a\ +ス = 42; +// expected-warning@-2 {{unused variable 'aス'}} + +int b\ +\ +ス = 42; +// expected-warning@-2 {{backslash and newline separated by space}} +// expected-warning@-4 {{backslash and newline 
separated by space}} +// expected-warning@-5 {{unused variable 'bス'}} + +int ス\ +ス = 42; +// expected-warning@-2 {{unused variable 'スス'}} + +int \ +ス = 42; +// expected-warning@-2 {{unused variable 'ス'}} + +} + +void gh65156_err(void) { + +int \ +❌ = 0; +// cpp-error@-2 {{expected unqualified-id}} +// c-error@-3 {{expected identifier}} + + +int a\ +❌ = 0; +// expected-error@-1 {{character not allowed in an identifier}} +}