Index: clang/docs/ReleaseNotes.rst =================================================================== --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -321,6 +321,8 @@ template parameter, to conform to the Itanium C++ ABI and be compatible with GCC. This breaks binary compatibility with code compiled with earlier versions of clang; use the ``-fclang-abi-compat=14`` option to get the old mangling. +- Preprocessor character literals with a ``u8`` prefix are now correctly treated as + unsigned character literals. This fixes `Issue 54886 `_. C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ Index: clang/lib/Lex/PPExpressions.cpp =================================================================== --- clang/lib/Lex/PPExpressions.cpp +++ clang/lib/Lex/PPExpressions.cpp @@ -408,9 +408,18 @@ // Set the value. Val = Literal.getValue(); // Set the signedness. UTF-16 and UTF-32 are always unsigned + // UTF-8 is unsigned if -fchar8_t is specified. if (Literal.isWide()) Val.setIsUnsigned(!TargetInfo::isTypeSigned(TI.getWCharType())); - else if (!Literal.isUTF16() && !Literal.isUTF32()) + else if (Literal.isUTF16() || Literal.isUTF32()) + Val.setIsUnsigned(true); + else if (Literal.isUTF8()) { + if (PP.getLangOpts().CPlusPlus) + Val.setIsUnsigned( + PP.getLangOpts().Char8 ? true : !PP.getLangOpts().CharIsSigned); + else + Val.setIsUnsigned(true); + } else Val.setIsUnsigned(!PP.getLangOpts().CharIsSigned); if (Result.Val.getBitWidth() > Val.getBitWidth()) { Index: clang/test/Lexer/utf8-char-literal.cpp =================================================================== --- clang/test/Lexer/utf8-char-literal.cpp +++ clang/test/Lexer/utf8-char-literal.cpp @@ -1,7 +1,10 @@ -// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -fsyntax-only -verify %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -fsyntax-only -verify %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c2x -x c -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++1z -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++17 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++17 -fsyntax-only -fchar8_t -verify %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++20 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++20 -fsyntax-only -fno-char8_t -verify %s int array0[u'ñ' == u'\xf1'? 1 : -1]; int array1['\xF1' != u'\xf1'? 1 : -1]; @@ -13,7 +16,7 @@ char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}} char e = u8'ሴ'; // expected-error {{character too large for enclosing character literal type}} char f = u8'ab'; // expected-error {{Unicode character literals may not contain multiple characters}} -#elif __STDC_VERSION__ > 202000L +#elif __STDC_VERSION__ >= 202000L char a = u8'ñ'; // expected-error {{character too large for enclosing character literal type}} char b = u8'\x80'; // ok char c = u8'\u0080'; // expected-error {{universal character name refers to a control character}} @@ -26,3 +29,29 @@ unsigned char : 1), "Surprise!"); #endif + + +// UTF-8 character literals are enabled in C++17 and later. If `-fchar8_t` is not enabled +// (as is the case in C++17), then UTF-8 character literals may produce signed or +// unsigned values depending on whether char is a signed type. If `-fchar8_t` is enabled +// (which is the default behavior for C++20), then UTF-8 character literals always +// produce unsigned values. The tests below depend on the target having a signed +// 8-bit char so that '\xff' produces a negative value. +#if __cplusplus >= 201703L +# if !defined(__cpp_char8_t) +# if !(u8'\xff' == '\xff') +# error UTF-8 character value did not match ordinary character literal; this is unexpected +# endif +# else +# if u8'\xff' == '\xff' // expected-warning {{right side of operator converted from negative value to unsigned}} +# error UTF-8 character value matched ordinary character literal; this is unexpected +# endif +# endif +#endif + +/// In C2x, u8 char literals are always unsigned. +#if __STDC_VERSION__ >= 202000L +# if !(u8'\xff' == 0xff) +# error u8 char literal is not unsigned +# endif +#endif