diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -705,6 +705,7 @@ - Implemented "char8_t Compatibility and Portability Fix" (`P2513R3 `_). This change was applied to C++20 as a Defect Report. - Implemented "Permitting static constexpr variables in constexpr functions" (`P2647R1 _`). +- Implemented `CWG2640 Allow more characters in an n-char sequence _`. CUDA/HIP Language Changes in Clang ---------------------------------- diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h --- a/clang/include/clang/Lex/Lexer.h +++ b/clang/include/clang/Lex/Lexer.h @@ -772,7 +772,7 @@ llvm::Optional tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result); llvm::Optional tryReadNamedUCN(const char *&StartPtr, - Token *Result); + const char *SlashLoc, Token *Result); /// Read a universal character name. /// diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1194,15 +1194,16 @@ /// whether trigraphs are enabled or not. static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { char Res = GetTrigraphCharForLetter(*CP); - if (!Res || !L) return Res; + if (!Res) + return Res; if (!Trigraphs) { - if (!L->isLexingRawMode()) + if (L && !L->isLexingRawMode()) L->Diag(CP-2, diag::trigraph_ignored); return 0; } - if (!L->isLexingRawMode()) + if (L && !L->isLexingRawMode()) L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); return Res; } @@ -3241,7 +3242,7 @@ if (!Delimited) break; if (Diagnose) - Diag(BufferPtr, diag::warn_delimited_ucn_incomplete) + Diag(SlashLoc, diag::warn_delimited_ucn_incomplete) << StringRef(KindLoc, 1); return std::nullopt; } @@ -3260,7 +3261,7 @@ if (Count == 0) { if (Diagnose) - Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty + Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty : diag::warn_ucn_escape_no_digits) << StringRef(KindLoc, 1); return std::nullopt; @@ -3268,13 +3269,13 @@ if (Delimited && Kind == 'U') { if (Diagnose) - Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); + Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); return std::nullopt; } if (!Delimited && Count != NumHexDigits) { if (Diagnose) { - Diag(BufferPtr, diag::warn_ucn_escape_incomplete); + Diag(SlashLoc, diag::warn_ucn_escape_incomplete); // If the user wrote \U1234, suggest a fixit to \u. if (Count == 4 && NumHexDigits == 8) { CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); @@ -3286,15 +3287,18 @@ } if (Delimited && PP) { - Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b - ? diag::warn_cxx2b_delimited_escape_sequence - : diag::ext_delimited_escape_sequence) + Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b + ? diag::warn_cxx2b_delimited_escape_sequence + : diag::ext_delimited_escape_sequence) << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0); } if (Result) { Result->setFlag(Token::HasUCN); - if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0))) + // If the UCN contains either a trigraph or a line splicing, + // we need to call getAndAdvanceChar again to set the appropriate flags + // on Result. + if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) StartPtr = CurPtr; else while (StartPtr != CurPtr) @@ -3306,6 +3310,7 @@ } llvm::Optional Lexer::tryReadNamedUCN(const char *&StartPtr, + const char *SlashLoc, Token *Result) { unsigned CharSize; bool Diagnose = Result && !isLexingRawMode(); @@ -3319,7 +3324,7 @@ C = getCharAndSize(CurPtr, CharSize); if (C != '{') { if (Diagnose) - Diag(StartPtr, diag::warn_ucn_escape_incomplete); + Diag(SlashLoc, diag::warn_ucn_escape_incomplete); return std::nullopt; } CurPtr += CharSize; @@ -3334,28 +3339,29 @@ break; } - if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ') + if (isVerticalWhitespace(C)) break; Buffer.push_back(C); } if (!FoundEndDelimiter || Buffer.empty()) { if (Diagnose) - Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty + Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty : diag::warn_delimited_ucn_incomplete) << StringRef(KindLoc, 1); return std::nullopt; } StringRef Name(Buffer.data(), Buffer.size()); - llvm::Optional Res = + llvm::Optional Match = llvm::sys::unicode::nameToCodepointStrict(Name); llvm::Optional LooseMatch; - if (!Res) { - if (!isLexingRawMode()) { - Diag(StartPtr, diag::err_invalid_ucn_name) - << StringRef(Buffer.data(), Buffer.size()); - LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); + if (!Match) { + LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); + if (Diagnose) { + Diag(StartName, diag::err_invalid_ucn_name) + << StringRef(Buffer.data(), Buffer.size()) + << makeCharRange(*this, StartName, CurPtr - CharSize); if (LooseMatch) { Diag(StartName, diag::note_invalid_ucn_name_loose_matching) << FixItHint::CreateReplacement( @@ -3363,27 +3369,30 @@ LooseMatch->Name); } } - // When finding a match using Unicode loose matching rules - // recover after having emitted a diagnostic. - if (!LooseMatch) - return std::nullopt; // We do not offer misspelled character names suggestions here // as the set of what would be a valid suggestion depends on context, // and we should not make invalid suggestions. } - if (Diagnose && PP && !LooseMatch) - Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b - ? diag::warn_cxx2b_delimited_escape_sequence - : diag::ext_delimited_escape_sequence) + if (Diagnose && Match) + Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b + ? diag::warn_cxx2b_delimited_escape_sequence + : diag::ext_delimited_escape_sequence) << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); - if (LooseMatch) - Res = LooseMatch->CodePoint; + // If no diagnostic has been emitted yet, likely because we are doing a + // tentative lexing, we do not want to recover here to make sure the token + // will not be incorrectly considered valid. This function will be called + // again and a diagnostic emitted then. + if (LooseMatch && Diagnose) + Match = LooseMatch->CodePoint; if (Result) { Result->setFlag(Token::HasUCN); - if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4)) + // If the UCN contains either a trigraph or a line splicing, + // we need to call getAndAdvanceChar again to set the appropriate flags + // on Result. + if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) StartPtr = CurPtr; else while (StartPtr != CurPtr) @@ -3391,7 +3400,7 @@ } else { StartPtr = CurPtr; } - return *Res; + return Match ? llvm::Optional(*Match) : std::nullopt; } uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, @@ -3403,7 +3412,7 @@ if (Kind == 'u' || Kind == 'U') CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); else if (Kind == 'N') - CodePointOpt = tryReadNamedUCN(StartPtr, Result); + CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result); if (!CodePointOpt) return 0; diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -548,11 +548,10 @@ return false; } ThisTokBuf++; - const char *ClosingBrace = - std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) { - return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-'; - }); - bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}'; + const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) { + return C == '}' || isVerticalWhitespace(C); + }); + bool Incomplete = ClosingBrace == ThisTokEnd; bool Empty = ClosingBrace == ThisTokBuf; if (Incomplete || Empty) { if (Diags) { diff --git a/clang/test/CXX/drs/dr26xx.cpp b/clang/test/CXX/drs/dr26xx.cpp --- a/clang/test/CXX/drs/dr26xx.cpp +++ b/clang/test/CXX/drs/dr26xx.cpp @@ -59,6 +59,21 @@ // dr2636: na +namespace dr2640 { // dr2640: 16 + +int \N{Λ} = 0; //expected-error {{'Λ' is not a valid Unicode character name}} \ + //expected-error {{expected unqualified-id}} +const char* emoji = "\N{🤡}"; // expected-error {{'🤡' is not a valid Unicode character name}} \ + // expected-note 5{{did you mean}} + +#define z(x) 0 +#define dr2640_a z( +int x = dr2640_a\N{abc}); // expected-error {{'abc' is not a valid Unicode character name}} +int y = dr2640_a\N{LOTUS}); // expected-error {{character not allowed in an identifier}} \ + // expected-error {{use of undeclared identifier 'dr2640_a🪷'}} \ + // expected-error {{extraneous ')' before ';'}} +} + // dr2642: na namespace dr2644 { // dr2644: yes diff --git a/clang/test/Lexer/char-escapes-delimited.c b/clang/test/Lexer/char-escapes-delimited.c --- a/clang/test/Lexer/char-escapes-delimited.c +++ b/clang/test/Lexer/char-escapes-delimited.c @@ -96,6 +96,11 @@ unsigned i = u'\N{GREEK CAPITAL LETTER DELTA}'; // ext-warning {{extension}} cxx2b-warning {{C++2b}} char j = '\NN'; // expected-error {{expected '{' after '\N' escape sequence}} expected-warning {{multi-character character constant}} unsigned k = u'\N{LOTUS'; // expected-error {{incomplete universal character name}} + + const char* emoji = "\N{🤡}"; // expected-error {{'🤡' is not a valid Unicode character name}} \ + // expected-note 5{{did you mean}} + const char* nested = "\N{\N{SPARKLE}}"; // expected-error {{'\N{SPARKLE' is not a valid Unicode character name}} \ + // expected-note 5{{did you mean}} } void separators(void) { diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c --- a/clang/test/Lexer/unicode.c +++ b/clang/test/Lexer/unicode.c @@ -43,6 +43,7 @@ extern int \U0001E4D0; // 𞓐 NAG MUNDARI LETTER O - Added in Unicode 15 extern int _\N{TANGSA LETTER GA}; extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \ + // expected-error {{expected ';' after top level declarator}} \ // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}} diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c --- a/clang/test/Preprocessor/ucn-pp-identifier.c +++ b/clang/test/Preprocessor/ucn-pp-identifier.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef -// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -// RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++2b -pedantic -ftrigraphs -verify=expected,cxx2b -Wundef -Wpre-c++2b-compat +// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1 +// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -fno-trigraphs +// RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++2b -pedantic -ftrigraphs -DTRIGRAPHS=1 -verify=expected,cxx2b -Wundef -Wpre-c++2b-compat // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -ftrigraphs -DTRIGRAPHS=1 // RUN: not %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -Wundef 2>&1 | FileCheck -strict-whitespace %s @@ -40,7 +40,6 @@ // ext-warning {{extension}} cxx2b-warning {{before C++2b}} #define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \ // ext-warning {{extension}} cxx2b-warning {{before C++2b}} - #define a\u0024 #if \u0110 // expected-warning {{is not defined, evaluates to 0}} @@ -121,20 +120,39 @@ #define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}} #define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}} #define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}} -#define \N{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}} expected-error {{macro name must be an identifier}} +#define \N{ +// expected-warning@-1 {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}} +// expected-error@-2 {{macro name must be an identifier}} #define \N{} // expected-warning {{empty delimited universal character name; treating as '\' 'N' '{' '}'}} expected-error {{macro name must be an identifier}} #define \N{NOTATHING} // expected-error {{'NOTATHING' is not a valid Unicode character name}} \ // expected-error {{macro name must be an identifier}} #define \NN // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}} #define \N{GREEK_SMALL-LETTERALPHA} // expected-error {{'GREEK_SMALL-LETTERALPHA' is not a valid Unicode character name}} \ // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}} +#define \N{🤡} // expected-error {{'🤡' is not a valid Unicode character name}} \ + // expected-error {{macro name must be an identifier}} #define CONCAT(A, B) A##B -int CONCAT(\N{GREEK, CAPITALLETTERALPHA}); // expected-error{{expected}} \ - // expected-warning {{incomplete delimited universal character name}} +int CONCAT(\N{GREEK +, CAPITALLETTERALPHA}); +// expected-error@-2 {{expected}} \ +// expected-warning@-2 {{incomplete delimited universal character name}} + +int \N{\ +LATIN CAPITAL LETTER A WITH GRAVE}; +//ext-warning@-2 {{extension}} cxx2b-warning@-2 {{before C++2b}} #ifdef TRIGRAPHS -int \N?? = 0; // expected-warning{{extension}} cxx2b-warning {{before C++2b}} \ +int \N?? = 0; // cxx2b-warning {{before C++2b}} \ + //ext-warning {{extension}}\ // expected-warning 2{{trigraph converted}} +int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>; // expected-warning {{trigraph converted}} +#endif + +#ifndef TRIGRAPHS +int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>; +// expected-warning@-1 {{trigraph ignored}}\ +// expected-warning@-1 {{incomplete}}\ +// expected-error@-1 {{expected ';' after top level declarator}} #endif diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -15647,7 +15647,7 @@ 2640 accepted Allow more characters in an n-char sequence - Unknown + Clang 16 2641