diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -125,10 +125,11 @@ "identifier contains Unicode character that is invisible in " "some environments">, InGroup>; -def ext_delimited_escape_sequence : Extension<"delimited escape sequence is a clang extension">, - InGroup>; +def ext_delimited_escape_sequence : Extension< + "delimited escape sequences are a clang extension">, + InGroup>; def err_delimited_escape_empty : Error< - "empty escape sequence">; + "delimited escape sequence cannot be empty">; def err_delimited_escape_missing_brace: Error< "expected '{' after '\\%0' escape sequence">; def err_delimited_escape_invalid : Error< @@ -140,6 +141,12 @@ "treating as '\\' followed by identifier">, InGroup; def err_ucn_escape_incomplete : Error< "incomplete universal character name">; +def warn_delimited_ucn_incomplete : Warning< + "incomplete delimited universal character name; " + "treating as '\\' 'u' '{' identifier">, InGroup; +def warn_delimited_ucn_empty : Warning< + "empty delimited universal character name; " + "treating as '\\' 'u' '{' '}'">, InGroup; def warn_ucn_escape_incomplete : Warning< "incomplete universal character name; " "treating as '\\' followed by identifier">, InGroup; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -3009,6 +3009,10 @@ Token *Result) { unsigned CharSize; char Kind = getCharAndSize(StartPtr, CharSize); + bool Delimited = false; + bool FoundEndDelimiter = false; + unsigned Count = 0; + bool Diagnose = Result && !isLexingRawMode(); unsigned NumHexDigits; if (Kind == 'u') @@ -3019,7 +3023,7 @@ return 0; if (!LangOpts.CPlusPlus && !LangOpts.C99) { - if (Result && !isLexingRawMode()) + if (Diagnose) Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); return 0; } @@ -3028,39 +3032,70 @@ const 
char *KindLoc = &CurPtr[-1]; uint32_t CodePoint = 0; - for (unsigned i = 0; i < NumHexDigits; ++i) { + while (Count != NumHexDigits || Delimited) { char C = getCharAndSize(CurPtr, CharSize); + if (!Delimited && Count == 0 && C == '{') { + Delimited = true; + CurPtr += CharSize; + continue; + } + + if (Delimited && C == '}') { + CurPtr += CharSize; + FoundEndDelimiter = true; + break; + } unsigned Value = llvm::hexDigitValue(C); if (Value == -1U) { - if (Result && !isLexingRawMode()) { - if (i == 0) { - Diag(BufferPtr, diag::warn_ucn_escape_no_digits) - << StringRef(KindLoc, 1); - } else { - Diag(BufferPtr, diag::warn_ucn_escape_incomplete); - - // If the user wrote \U1234, suggest a fixit to \u. - if (i == 4 && NumHexDigits == 8) { - CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); - Diag(KindLoc, diag::note_ucn_four_not_eight) - << FixItHint::CreateReplacement(URange, "u"); - } - } - } + if (!Delimited) + break; + if (Diagnose) + Diag(BufferPtr, diag::warn_delimited_ucn_incomplete) + << StringRef(&C, 1); + return 0; + } + if (CodePoint & 0xF000'0000) { + if (Diagnose) + Diag(KindLoc, diag::err_escape_too_large) << 0; return 0; } CodePoint <<= 4; - CodePoint += Value; - + CodePoint |= Value; CurPtr += CharSize; + Count++; + } + + if (Count == 0) { + if (Diagnose) + Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty + : diag::warn_ucn_escape_no_digits) + << StringRef(KindLoc, 1); + return 0; + } + + if (!Delimited && Count != NumHexDigits) { + if (Diagnose) { + Diag(BufferPtr, diag::warn_ucn_escape_incomplete); + // If the user wrote \U1234, suggest a fixit to \u. 
+ if (Count == 4 && NumHexDigits == 8) { + CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); + Diag(KindLoc, diag::note_ucn_four_not_eight) + << FixItHint::CreateReplacement(URange, "u"); + } + } + return 0; + } + + if (Delimited && PP) { + Diag(BufferPtr, diag::ext_delimited_escape_sequence); } if (Result) { Result->setFlag(Token::HasUCN); - if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2) + if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0))) StartPtr = CurPtr; else while (StartPtr != CurPtr) diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -170,6 +170,7 @@ } int CharVal = llvm::hexDigitValue(*ThisTokBuf); if (CharVal == -1) { + // Non-delimited hex escape sequences stop at the first non-hex digit. if (!Delimited) break; HadError = true; @@ -333,18 +334,32 @@ } ++I; - assert(*I == 'u' || *I == 'U'); + const char Type = *I; + ++I; + + assert(Type == 'u' || Type == 'U'); + uint32_t CodePoint = 0; + + if (Type == 'u' && *I == '{') { + for (++I; *I != '}'; ++I) { + unsigned Value = llvm::hexDigitValue(*I); + assert(Value != -1U); + CodePoint <<= 4; + CodePoint += Value; + } + appendCodePoint(CodePoint, Buf); + continue; + } unsigned NumHexDigits; - if (*I == 'u') + if (Type == 'u') NumHexDigits = 4; else NumHexDigits = 8; assert(I + NumHexDigits <= E); - uint32_t CodePoint = 0; - for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { + for (; NumHexDigits != 0; ++I, --NumHexDigits) { unsigned Value = llvm::hexDigitValue(*I); assert(Value != -1U); @@ -476,15 +491,13 @@ return false; } - if (!Features.CPlusPlus && !Features.C99 && Diags) { + if (!Features.CPlusPlus && !Features.C99 && Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::warn_ucn_not_valid_in_c89_literal); - } - if (Delimited && Diags) { + if (Delimited && Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 
diag::ext_delimited_escape_sequence); - } return true; } diff --git a/clang/test/Lexer/char-escapes-delimited.c b/clang/test/Lexer/char-escapes-delimited.c --- a/clang/test/Lexer/char-escapes-delimited.c +++ b/clang/test/Lexer/char-escapes-delimited.c @@ -17,7 +17,7 @@ void ucn() { char a = '\u{1234}'; // expected-error {{character too large for enclosing character literal type}} - // expected-warning@-1 {{delimited escape sequence is a clang extension}} + // expected-warning@-1 {{delimited escape sequences are a clang extension}} unsigned b = U'\u{1234}'; // expected-warning {{extension}} @@ -36,7 +36,7 @@ char b = '\x{abcdegggggabc}'; // expected-error 5{{invalid digit 'g' in escape sequence}} char c = '\x{ff1}'; // expected-error {{hex escape sequence out of range}} -#if WCHAR_MAX == 0xFFFFFFFF +#if __WCHAR_MAX__ > 0xFFFF unsigned d = L'\x{FFFFFFFF}'; // expected-warning {{extension}} unsigned e = L'\x{100000000}'; // expected-error {{hex escape sequence out of range}} #else @@ -51,7 +51,7 @@ char a = '\o{1}'; // expected-warning {{extension}} char b = '\o{12345678881238}'; // expected-error 4{{invalid digit '8' in escape sequence}} char c = '\o{777}'; // //expected-error {{octal escape sequence out of range}} -#if WCHAR_MAX == 0xFFFFFFFF +#if __WCHAR_MAX__ > 0xFFFF unsigned d = L'\o{37777777777}'; // expected-warning {{extension}} unsigned e = L'\o{40000000000}'; // expected-error {{octal escape sequence out of range}} #else @@ -61,12 +61,9 @@ } void concat() { - (void)"\x{" - "12}"; // expected-error {{expected '}'}} - (void)"\u{" - "12}"; // expected-error {{expected '}'}} - (void)"\o{" - "12}"; // expected-error {{expected '}'}} + (void)"\x{" "12}"; // expected-error {{expected '}'}} + (void)"\u{" "12}"; // expected-error {{expected '}'}} + (void)"\o{" "12}"; // expected-error {{expected '}'}} (void)"\x{12" "}"; // expected-error {{expected '}'}} (void)"\u{12" "}"; // expected-error {{expected '}'}} diff --git 
a/clang/test/Parser/cxx11-user-defined-literals.cpp b/clang/test/Parser/cxx11-user-defined-literals.cpp --- a/clang/test/Parser/cxx11-user-defined-literals.cpp +++ b/clang/test/Parser/cxx11-user-defined-literals.cpp @@ -129,6 +129,9 @@ int operator""_℮""_\u212e""_\U0000212e""(const char*, size_t); int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t); int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t); + +int operator""_\u{212f}(char); + int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e""; void operator""_℮""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}} diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c --- a/clang/test/Preprocessor/ucn-pp-identifier.c +++ b/clang/test/Preprocessor/ucn-pp-identifier.c @@ -16,6 +16,10 @@ #error "This should never happen" #endif +#if a\u{FD}() //expected-warning {{clang extension}} +#error "This should never happen" +#endif + #if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}} #endif #if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}} @@ -27,6 +31,7 @@ #define \ufffe // expected-error {{macro name must be an identifier}} #define \U10000000 // expected-error {{macro name must be an identifier}} #define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}} +#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{clang extension}} #define a\u0024 @@ -103,3 +108,8 @@ // CHECK-NEXT: #define capital_u_\U00FC // CHECK-NEXT: {{^ \^}} // CHECK-NEXT: {{^ u}} + +#define \u{} // expected-warning {{empty delimited 
universal character name; treating as '\' 'u' '{' '}'}} expected-error {{macro name must be an identifier}} +#define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}} +#define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}} +#define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}} diff --git a/clang/test/Sema/ucn-identifiers.c b/clang/test/Sema/ucn-identifiers.c --- a/clang/test/Sema/ucn-identifiers.c +++ b/clang/test/Sema/ucn-identifiers.c @@ -17,6 +17,7 @@ \u00fcber(1); über(2); \U000000FCber(3); + \u{FC}ber(4); // expected-warning {{clang extension}} } void badCalls() { @@ -24,7 +25,7 @@ \u00fcber = 0; // expected-error{{non-object type 'void (int)' is not assignable}} über(1, 2); - \U000000FCber(); + \U000000FCber(); #ifdef __cplusplus // expected-error@-3 {{no matching function}} // expected-error@-3 {{no matching function}}