diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -205,6 +205,9 @@ bool b = nullptr; // Was incorrectly rejected by Clang, is now accepted. +- Implemented `WG14 N3124 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3124.pdf>`_, + which allows any universal character name to appear in character and string literals. + Non-comprehensive list of changes in this release ------------------------------------------------- @@ -585,6 +588,9 @@ - Correcly diagnose jumps into statement expressions. This ensures the behavior of Clang is consistent with GCC. (`#63682 <https://github.com/llvm/llvm-project/issues/63682>`_) + (`#38717 <https://github.com/llvm/llvm-project/issues/38717>`_). +- Fix an assertion when using ``\u0024`` (``$``) as an identifier, by disallowing + that construct (`#62133 <https://github.com/llvm/llvm-project/issues/62133>`_). Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -197,6 +197,14 @@ def warn_cxx98_compat_literal_ucn_control_character : Warning< "universal character name referring to a control character " "is incompatible with C++98">, InGroup, DefaultIgnore; +def warn_c2x_compat_literal_ucn_escape_basic_scs : Warning< + "specifying character '%0' with a universal character name is " + "incompatible with C standards before C2x">, + InGroup, DefaultIgnore; +def warn_c2x_compat_literal_ucn_control_character : Warning< + "universal character name referring to a control character " + "is incompatible with C standards before C2x">, + InGroup, DefaultIgnore; def warn_ucn_not_valid_in_c89 : Warning< "universal character names are only valid in C99 or C++; " "treating as '\\' followed by identifier">, InGroup; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -3484,9 +3484,14 @@ if (LangOpts.AsmPreprocessor) return CodePoint; - // C99 6.4.3p2: A universal character 
name shall not specify a character whose - // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or - // 0060 (`), nor one in the range D800 through DFFF inclusive.) + // C2x 6.4.3p2: A universal character name shall not designate a code point + // where the hexadecimal value is: + // - in the range D800 through DFFF inclusive; or + // - greater than 10FFFF. + // A universal-character-name outside the c-char-sequence of a character + // constant, or the s-char-sequence of a string-literal shall not designate + // a control character or a character in the basic character set. + // C++11 [lex.charset]p2: If the hexadecimal value for a // universal-character-name corresponds to a surrogate code point (in the // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, @@ -3496,9 +3501,6 @@ // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the // basic source character set, the program is ill-formed. if (CodePoint < 0xA0) { - if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) - return CodePoint; - // We don't use isLexingRawMode() here because we need to warn about bad // UCNs even when skipping preprocessing tokens in a #if block. 
if (Result && PP) { diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -640,22 +640,28 @@ return false; } - // C++11 allows UCNs that refer to control characters and basic source - // characters inside character and string literals + // C2x and C++11 allow UCNs that refer to control characters + // and basic source characters inside character and string literals if (UcnVal < 0xa0 && - (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, ` - bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal); + // $, @, ` are allowed in all language modes + (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { + bool IsError = + (!(Features.CPlusPlus11 || Features.C2x) || !in_char_string_literal); if (Diags) { char BasicSCSChar = UcnVal; if (UcnVal >= 0x20 && UcnVal < 0x7f) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, - IsError ? diag::err_ucn_escape_basic_scs : - diag::warn_cxx98_compat_literal_ucn_escape_basic_scs) + IsError ? diag::err_ucn_escape_basic_scs + : Features.CPlusPlus + ? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs + : diag::warn_c2x_compat_literal_ucn_escape_basic_scs) << StringRef(&BasicSCSChar, 1); else Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, - IsError ? diag::err_ucn_control_character : - diag::warn_cxx98_compat_literal_ucn_control_character); + IsError ? diag::err_ucn_control_character + : Features.CPlusPlus + ? 
diag::warn_cxx98_compat_literal_ucn_control_character + : diag::warn_c2x_compat_literal_ucn_control_character); } if (IsError) return false; diff --git a/clang/test/Lexer/char-literal.cpp b/clang/test/Lexer/char-literal.cpp --- a/clang/test/Lexer/char-literal.cpp +++ b/clang/test/Lexer/char-literal.cpp @@ -1,8 +1,9 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++03 -Wfour-char-constants -fsyntax-only -verify=cxx03,expected %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -Wfour-char-constants -fsyntax-only -verify=cxx,expected %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++17 -Wfour-char-constants -fsyntax-only -verify=cxx,expected %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++20 -Wfour-char-constants -fsyntax-only -verify=cxx,expected %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -Wfour-char-constants -fsyntax-only -verify=c,expected %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c2x -x c -Wfour-char-constants -fsyntax-only -verify=c,expected %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -Wfour-char-constants -fsyntax-only -verify=c11,expected %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c2x -x c -Wfour-char-constants -fsyntax-only -verify=c2x,expected %s #ifndef __cplusplus typedef __WCHAR_TYPE__ wchar_t; @@ -17,6 +18,7 @@ char d = '⌘'; // expected-error {{character too large for enclosing character literal type}} char e = '\u2318'; // expected-error {{character too large for enclosing character literal type}} +#if !defined(__cplusplus) || __cplusplus > 201100L #ifdef __cplusplus auto f = '\xE2\x8C\x98'; // expected-warning {{multi-character character constant}} #endif @@ -44,18 +46,19 @@ // UTF-8 character literal code point ranges. 
#if __cplusplus >= 201703L || __STDC_VERSION__ >= 201710L -_Static_assert(u8'\U00000000' == 0x00, ""); // c-error {{universal character name refers to a control character}} -_Static_assert(u8'\U0000007F' == 0x7F, ""); // c-error {{universal character name refers to a control character}} -_Static_assert(u8'\U00000080', ""); // c-error {{universal character name refers to a control character}} +_Static_assert(u8'\U00000000' == 0x00, ""); // c11-error {{universal character name refers to a control character}} +_Static_assert(u8'\U0000007F' == 0x7F, ""); // c11-error {{universal character name refers to a control character}} +_Static_assert(u8'\U00000080', ""); // c11-error {{universal character name refers to a control character}} // cxx-error@-1 {{character too large for enclosing character literal type}} + // c2x-error@-2 {{character too large for enclosing character literal type}} _Static_assert((unsigned char)u8'\xFF' == (unsigned char)0xFF, ""); #endif // UTF-8 string literal code point ranges. 
-_Static_assert(u8"\U00000000"[0] == 0x00, ""); // c-error {{universal character name refers to a control character}} -_Static_assert(u8"\U0000007F"[0] == 0x7F, ""); // c-error {{universal character name refers to a control character}} -_Static_assert((unsigned char)u8"\U00000080"[0] == (unsigned char)0xC2, ""); // c-error {{universal character name refers to a control character}} -_Static_assert((unsigned char)u8"\U00000080"[1] == (unsigned char)0x80, ""); // c-error {{universal character name refers to a control character}} +_Static_assert(u8"\U00000000"[0] == 0x00, ""); // c11-error {{universal character name refers to a control character}} +_Static_assert(u8"\U0000007F"[0] == 0x7F, ""); // c11-error {{universal character name refers to a control character}} +_Static_assert((unsigned char)u8"\U00000080"[0] == (unsigned char)0xC2, ""); // c11-error {{universal character name refers to a control character}} +_Static_assert((unsigned char)u8"\U00000080"[1] == (unsigned char)0x80, ""); // c11-error {{universal character name refers to a control character}} _Static_assert((unsigned char)u8"\U000007FF"[0] == (unsigned char)0xDF, ""); _Static_assert((unsigned char)u8"\U000007FF"[1] == (unsigned char)0xBF, ""); _Static_assert((unsigned char)u8"\U00000800"[0] == (unsigned char)0xE0, ""); @@ -84,14 +87,14 @@ #endif // UTF-16 character literal code point ranges. -_Static_assert(u'\U00000000' == 0x0000, ""); // c-error {{universal character name refers to a control character}} +_Static_assert(u'\U00000000' == 0x0000, ""); // c11-error {{universal character name refers to a control character}} _Static_assert(u'\U0000D800', ""); // expected-error {{invalid universal character}} _Static_assert(u'\U0000DFFF', ""); // expected-error {{invalid universal character}} _Static_assert(u'\U0000FFFF' == 0xFFFF, ""); _Static_assert(u'\U00010000', ""); // expected-error {{character too large for enclosing character literal type}} // UTF-16 string literal code point ranges. 
-_Static_assert(u"\U00000000"[0] == 0x0000, ""); // c-error {{universal character name refers to a control character}} +_Static_assert(u"\U00000000"[0] == 0x0000, ""); // c11-error {{universal character name refers to a control character}} _Static_assert(u"\U0000D800"[0], ""); // expected-error {{invalid universal character}} _Static_assert(u"\U0000DFFF"[0], ""); // expected-error {{invalid universal character}} _Static_assert(u"\U0000FFFF"[0] == 0xFFFF, ""); @@ -109,13 +112,24 @@ #endif // UTF-32 character literal code point ranges. -_Static_assert(U'\U00000000' == 0x00000000, ""); // c-error {{universal character name refers to a control character}} +_Static_assert(U'\U00000000' == 0x00000000, ""); // c11-error {{universal character name refers to a control character}} _Static_assert(U'\U0010FFFF' == 0x0010FFFF, ""); _Static_assert(U'\U00110000', ""); // expected-error {{invalid universal character}} // UTF-32 string literal code point ranges. -_Static_assert(U"\U00000000"[0] == 0x00000000, ""); // c-error {{universal character name refers to a control character}} +_Static_assert(U"\U00000000"[0] == 0x00000000, ""); // c11-error {{universal character name refers to a control character}} _Static_assert(U"\U0000D800"[0], ""); // expected-error {{invalid universal character}} _Static_assert(U"\U0000DFFF"[0], ""); // expected-error {{invalid universal character}} _Static_assert(U"\U0010FFFF"[0] == 0x0010FFFF, ""); _Static_assert(U"\U00110000"[0], ""); // expected-error {{invalid universal character}} + +#endif // !defined(__cplusplus) || __cplusplus > 201100L + +_Static_assert('\u0024' == '$', ""); +_Static_assert('\u0040' == '@', ""); +_Static_assert('\u0060' == '`', ""); + +_Static_assert('\u0061' == 'a', ""); // c11-error {{character 'a' cannot be specified by a universal character name}} \ + // cxx03-error {{character 'a' cannot be specified by a universal character name}} +_Static_assert('\u0000' == '\0', ""); // c11-error {{universal character name refers to a 
control character}} \ + // cxx03-error {{universal character name refers to a control character}} diff --git a/clang/test/Lexer/utf8-char-literal.cpp b/clang/test/Lexer/utf8-char-literal.cpp --- a/clang/test/Lexer/utf8-char-literal.cpp +++ b/clang/test/Lexer/utf8-char-literal.cpp @@ -19,7 +19,7 @@ #elif __STDC_VERSION__ >= 202000L char a = u8'ñ'; // expected-error {{character too large for enclosing character literal type}} char b = u8'\x80'; // ok -char c = u8'\u0080'; // expected-error {{universal character name refers to a control character}} +char c = u8'\u0000'; // ok char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}} char e = u8'ሴ'; // expected-error {{character too large for enclosing character literal type}} char f = u8'ab'; // expected-error {{Unicode character literals may not contain multiple characters}} diff --git a/clang/test/Preprocessor/ucn-allowed-chars.c b/clang/test/Preprocessor/ucn-allowed-chars.c --- a/clang/test/Preprocessor/ucn-allowed-chars.c +++ b/clang/test/Preprocessor/ucn-allowed-chars.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -fsyntax-only -std=c99 -verify +// RUN: %clang_cc1 %s -fsyntax-only -std=c2x -Wc99-compat -verify // RUN: %clang_cc1 %s -fsyntax-only -std=c11 -Wc99-compat -verify // RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++03 -Wc++11-compat -verify // RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++11 -Wc++98-compat -verify @@ -13,7 +14,6 @@ - // Identifier initial characters extern char \u0E50; // C++03, C11, C++11 extern char \u0300; // disallowed in C99/C++03 @@ -38,8 +38,8 @@ #if __cplusplus -// expected-error@9 {{character not allowed in an identifier}} -// expected-error@11 {{character not allowed in an identifier}} +// expected-error@10 {{character not allowed in an identifier}} +// expected-error@12 {{character not allowed in an identifier}} // expected-error@18 {{expected unqualified-id}} # if __cplusplus >= 201103L // C++11 @@ -53,23 +53,49 @@ # endif #else -# if 
__STDC_VERSION__ >= 201112L +# if __STDC_VERSION__ >= 201800L +// C2X +// expected-warning@8 {{using this character in an identifier is incompatible with C99}} +// expected-error@10 {{character not allowed in an identifier}} +// expected-error@12 {{character not allowed in an identifier}} +// expected-error@18 {{expected identifier}} +// expected-error@19 {{expected identifier}} +// expected-error@33 {{invalid universal character}} +# elif __STDC_VERSION__ >= 201112L // C11 -// expected-warning@7 {{using this character in an identifier is incompatible with C99}} -// expected-warning@9 {{using this character in an identifier is incompatible with C99}} -// expected-error@11 {{character not allowed in an identifier}} +// expected-warning@8 {{using this character in an identifier is incompatible with C99}} +// expected-warning@10 {{using this character in an identifier is incompatible with C99}} +// expected-error@12 {{character not allowed in an identifier}} // expected-warning@18 {{starting an identifier with this character is incompatible with C99}} // expected-error@19 {{expected identifier}} // expected-error@33 {{invalid universal character}} # else // C99 -// expected-error@7 {{not allowed in an identifier}} -// expected-error@9 {{not allowed in an identifier}} -// expected-error@11 {{not allowed in an identifier}} +// expected-error@8 {{not allowed in an identifier}} +// expected-error@10 {{not allowed in an identifier}} +// expected-error@12 {{not allowed in an identifier}} // expected-error@18 {{expected identifier}} // expected-error@19 {{expected identifier}} // expected-error@33 {{invalid universal character}} # endif #endif + +#define AAA\u0024 // expected-error {{character '$' cannot be specified by a universal character name}} \ + // expected-warning {{whitespace}} +#define AAB\u0040 // expected-error {{character '@' cannot be specified by a universal character name}} \ + // expected-warning {{whitespace}} +#define AAC\u0060 // expected-error 
{{character '`' cannot be specified by a universal character name}} \ + // expected-warning {{whitespace}} + +#define ABA \u0024 // expected-error {{character '$' cannot be specified by a universal character name}} +#define ABB \u0040 // expected-error {{character '@' cannot be specified by a universal character name}} +#define ABC \u0060 // expected-error {{character '`' cannot be specified by a universal character name}} + +int GH62133_a\u0024; // expected-error {{character '$' cannot be specified by a universal character name}} \ + // expected-error {{}} +int GH62133_b\u0040; // expected-error {{character '@' cannot be specified by a universal character name}} \ + // expected-error {{}} +int GH62133_c\u0060; // expected-error {{character '`' cannot be specified by a universal character name}} \ + // expected-error {{}} diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c --- a/clang/test/Preprocessor/ucn-pp-identifier.c +++ b/clang/test/Preprocessor/ucn-pp-identifier.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1 +// RUN: %clang_cc1 %s -fsyntax-only -std=c2x -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -fno-trigraphs // RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++23 -pedantic -ftrigraphs -DTRIGRAPHS=1 -verify=expected,cxx23 -Wundef -Wpre-c++23-compat // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -ftrigraphs -DTRIGRAPHS=1 @@ -40,7 +41,8 @@ // ext-warning {{extension}} cxx23-warning {{before C++23}} #define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \ // ext-warning {{extension}} cxx23-warning {{before C++23}} -#define a\u0024 +#define a\u0024a // expected-error {{character '$' cannot be specified by a universal character name}} \ + // expected-warning {{requires whitespace after the macro 
name}} #if \u0110 // expected-warning {{is not defined, evaluates to 0}} #endif @@ -112,7 +114,7 @@ #define capital_u_\U00FC // expected-warning@-1 {{incomplete universal character name}} expected-note@-1 {{did you mean to use '\u'?}} expected-warning@-1 {{whitespace}} // CHECK: note: did you mean to use '\u'? -// CHECK-NEXT: {{^ 112 | #define capital_u_\U00FC}} +// CHECK-NEXT: {{^ .* | #define capital_u_\U00FC}} // CHECK-NEXT: {{^ | \^}} // CHECK-NEXT: {{^ | u}} @@ -155,5 +157,5 @@ int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>; // expected-warning@-1 {{trigraph ignored}}\ // expected-warning@-1 {{incomplete}}\ -// expected-error@-1 {{expected ';' after top level declarator}} +// expected-error@-1 {{expected unqualified-id}} #endif