diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -127,6 +127,15 @@ "identifier contains Unicode character <U+%0> that is invisible in " "some environments">, InGroup<DiagGroup<"unicode-zero-width">>; +def ext_delimited_escape_sequence : Extension< + "delimited escape sequences are a Clang extension">, + InGroup<DiagGroup<"delimited-escape-sequence-extension">>; +def err_delimited_escape_empty : Error< + "delimited escape sequence cannot be empty">; +def err_delimited_escape_missing_brace: Error< + "expected '{' after '\\%0' escape sequence">; +def err_delimited_escape_invalid : Error< + "invalid digit '%0' in escape sequence">; def err_hex_escape_no_digits : Error< "\\%0 used with no following hex digits">; def warn_ucn_escape_no_digits : Warning< @@ -134,6 +143,12 @@ "treating as '\\' followed by identifier">, InGroup<Unicode>; def err_ucn_escape_incomplete : Error< "incomplete universal character name">; +def warn_delimited_ucn_incomplete : Warning< + "incomplete delimited universal character name; " + "treating as '\\' 'u' '{' identifier">, InGroup<Unicode>; +def warn_delimited_ucn_empty : Warning< + "empty delimited universal character name; " + "treating as '\\' 'u' '{' '}'">, InGroup<Unicode>; def warn_ucn_escape_incomplete : Warning< "incomplete universal character name; " "treating as '\\' followed by identifier">, InGroup<Unicode>; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -3085,6 +3085,10 @@ Token *Result) { unsigned CharSize; char Kind = getCharAndSize(StartPtr, CharSize); + bool Delimited = false; + bool FoundEndDelimiter = false; + unsigned Count = 0; + bool Diagnose = Result && !isLexingRawMode(); unsigned NumHexDigits; if (Kind == 'u') @@ -3095,7 +3099,7 @@ return 0; if (!LangOpts.CPlusPlus && !LangOpts.C99) { - if (Result && !isLexingRawMode()) + if (Diagnose) Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); return 0; 
} @@ -3104,39 +3108,70 @@ const char *KindLoc = &CurPtr[-1]; uint32_t CodePoint = 0; - for (unsigned i = 0; i < NumHexDigits; ++i) { + while (Count != NumHexDigits || Delimited) { char C = getCharAndSize(CurPtr, CharSize); + if (!Delimited && C == '{') { + Delimited = true; + CurPtr += CharSize; + continue; + } + + if (Delimited && C == '}') { + CurPtr += CharSize; + FoundEndDelimiter = true; + break; + } unsigned Value = llvm::hexDigitValue(C); if (Value == -1U) { - if (Result && !isLexingRawMode()) { - if (i == 0) { - Diag(BufferPtr, diag::warn_ucn_escape_no_digits) - << StringRef(KindLoc, 1); - } else { - Diag(BufferPtr, diag::warn_ucn_escape_incomplete); - - // If the user wrote \U1234, suggest a fixit to \u. - if (i == 4 && NumHexDigits == 8) { - CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); - Diag(KindLoc, diag::note_ucn_four_not_eight) - << FixItHint::CreateReplacement(URange, "u"); - } - } - } + if (!Delimited) + break; + if (Diagnose) + Diag(BufferPtr, diag::warn_delimited_ucn_incomplete) + << StringRef(&C, 1); + return 0; + } + if (CodePoint & 0xF000'0000) { + if (Diagnose) + Diag(KindLoc, diag::err_escape_too_large) << 0; return 0; } CodePoint <<= 4; - CodePoint += Value; - + CodePoint |= Value; CurPtr += CharSize; + Count++; + } + + if (Count == 0) { + if (Diagnose) + Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty + : diag::warn_ucn_escape_no_digits) + << StringRef(KindLoc, 1); + return 0; + } + + if (!Delimited && Count != NumHexDigits) { + if (Diagnose) { + Diag(BufferPtr, diag::warn_ucn_escape_incomplete); + // If the user wrote \U1234, suggest a fixit to \u. 
+ if (Count == 4 && NumHexDigits == 8) { + CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); + Diag(KindLoc, diag::note_ucn_four_not_eight) + << FixItHint::CreateReplacement(URange, "u"); + } + } + return 0; + } + + if (Delimited && PP) { + Diag(BufferPtr, diag::ext_delimited_escape_sequence); } if (Result) { Result->setFlag(Token::HasUCN); - if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2) + if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0))) StartPtr = CurPtr; else while (StartPtr != CurPtr) diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -95,6 +95,8 @@ DiagnosticsEngine *Diags, const LangOptions &Features) { const char *EscapeBegin = ThisTokBuf; + bool Delimited = false; + bool EndDelimiterFound = false; // Skip the '\' char. ++ThisTokBuf; @@ -143,26 +145,47 @@ break; case 'x': { // Hex escape. ResultChar = 0; - if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { + if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { + Delimited = true; + ThisTokBuf++; + if (*ThisTokBuf == '}') { + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_delimited_escape_empty); + return ResultChar; + } + } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::err_hex_escape_no_digits) << "x"; - HadError = true; - break; + return ResultChar; } // Hex escapes are a maximal series of hex digits. bool Overflow = false; for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { - int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); - if (CharVal == -1) break; + if (Delimited && *ThisTokBuf == '}') { + ThisTokBuf++; + EndDelimiterFound = true; + break; + } + int CharVal = llvm::hexDigitValue(*ThisTokBuf); + if (CharVal == -1) { + // Non delimited hex escape sequences stop at the first non-hex digit. 
+ if (!Delimited) + break; + HadError = true; + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_delimited_escape_invalid) + << StringRef(ThisTokBuf, 1); + continue; + } // About to shift out a digit? if (ResultChar & 0xF0000000) Overflow = true; ResultChar <<= 4; ResultChar |= CharVal; } - // See if any bits will be truncated when evaluated as a character. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { Overflow = true; @@ -170,9 +193,13 @@ } // Check for overflow. - if (Overflow && Diags) // Too many digits to fit in - Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, - diag::err_escape_too_large) << 0; + if (!HadError && Overflow) { // Too many digits to fit in + HadError = true; + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_escape_too_large) + << 0; + } break; } case '0': case '1': case '2': case '3': @@ -200,7 +227,58 @@ } break; } + case 'o': { + bool Overflow = false; + if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') { + HadError = true; + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_delimited_escape_missing_brace); + + break; + } + ResultChar = 0; + Delimited = true; + ++ThisTokBuf; + if (*ThisTokBuf == '}') { + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_delimited_escape_empty); + return ResultChar; + } + + while (ThisTokBuf != ThisTokEnd) { + if (*ThisTokBuf == '}') { + EndDelimiterFound = true; + ThisTokBuf++; + break; + } + if (*ThisTokBuf < '0' || *ThisTokBuf > '7') { + HadError = true; + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_delimited_escape_invalid) + << StringRef(ThisTokBuf, 1); + ThisTokBuf++; + continue; + } + if (ResultChar & 0x020000000) + Overflow = true; + ResultChar <<= 3; + ResultChar |= *ThisTokBuf++ - '0'; + } + // Check for overflow. Reject '\777', but not L'\777'. 
+ if (!HadError && + (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) { + HadError = true; + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_escape_too_large) + << 1; + ResultChar &= ~0U >> (32 - CharWidth); + } + break; + } // Otherwise, these are not valid escapes. case '(': case '{': case '[': case '%': // GCC accepts these as extensions. We warn about them as such though. @@ -224,6 +302,17 @@ break; } + if (Delimited && Diags) { + if (!EndDelimiterFound) + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_expected) + << tok::r_brace; + else if (!HadError) { + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_delimited_escape_sequence); + } + } + return ResultChar; } @@ -245,18 +334,32 @@ } ++I; - assert(*I == 'u' || *I == 'U'); + char Kind = *I; + ++I; + + assert(Kind == 'u' || Kind == 'U'); + uint32_t CodePoint = 0; + + if (Kind == 'u' && *I == '{') { + for (++I; *I != '}'; ++I) { + unsigned Value = llvm::hexDigitValue(*I); + assert(Value != -1U); + CodePoint <<= 4; + CodePoint += Value; + } + appendCodePoint(CodePoint, Buf); + continue; + } unsigned NumHexDigits; - if (*I == 'u') + if (Kind == 'u') NumHexDigits = 4; else NumHexDigits = 8; assert(I + NumHexDigits <= E); - uint32_t CodePoint = 0; - for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { + for (; NumHexDigits != 0; ++I, --NumHexDigits) { unsigned Value = llvm::hexDigitValue(*I); assert(Value != -1U); @@ -282,28 +385,82 @@ // Skip the '\u' char's. 
ThisTokBuf += 2; - if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { + bool Delimited = false; + bool EndDelimiterFound = false; + bool HasError = false; + + if (UcnBegin[1] == 'u' && in_char_string_literal && + ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { + Delimited = true; + ThisTokBuf++; + } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1); return false; } UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); - unsigned short UcnLenSave = UcnLen; - for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { - int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); - if (CharVal == -1) break; + + bool Overflow = false; + unsigned short Count = 0; + for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen); + ++ThisTokBuf) { + if (Delimited && *ThisTokBuf == '}') { + ++ThisTokBuf; + EndDelimiterFound = true; + break; + } + int CharVal = llvm::hexDigitValue(*ThisTokBuf); + if (CharVal == -1) { + HasError = true; + if (!Delimited) + break; + if (Diags) { + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_delimited_escape_invalid) + << StringRef(ThisTokBuf, 1); + } + Count++; + continue; + } + if (UcnVal & 0xF0000000) { + Overflow = true; + continue; + } UcnVal <<= 4; UcnVal |= CharVal; + Count++; } + + if (Overflow) { + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_escape_too_large) + << 0; + return false; + } + + if (Delimited && !EndDelimiterFound) { + if (Diags) { + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_expected) + << tok::r_brace; + } + return false; + } + // If we didn't consume the proper number of digits, there is a problem. 
- if (UcnLenSave) { + if (Count == 0 || (!Delimited && Count != UcnLen)) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, - diag::err_ucn_escape_incomplete); + Delimited ? diag::err_delimited_escape_empty + : diag::err_ucn_escape_incomplete); return false; } + if (HasError) + return false; + // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2] if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints UcnVal > 0x10FFFF) { // maximum legal UTF32 value @@ -338,6 +495,10 @@ Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::warn_ucn_not_valid_in_c89_literal); + if (Delimited && Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::ext_delimited_escape_sequence); + return true; } diff --git a/clang/test/Lexer/char-escapes-delimited.c b/clang/test/Lexer/char-escapes-delimited.c new file mode 100644 --- /dev/null +++ b/clang/test/Lexer/char-escapes-delimited.c @@ -0,0 +1,81 @@ +// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s +// RUN: %clang_cc1 -x c -fsyntax-only -pedantic -verify %s +// RUN: %clang_cc1 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s +// RUN: %clang_cc1 -x c -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s + +const char *errors = + "\u{}" //expected-error {{delimited escape sequence cannot be empty}} + "\u{" //expected-error {{expected '}'}} + "\u{h}" //expected-error {{invalid digit 'h' in escape sequence}} + "\x{}" //expected-error {{delimited escape sequence cannot be empty}} + "\x{" //expected-error {{expected '}'}} + "\x{h}" //expected-error {{invalid digit 'h' in escape sequence}} + "\o{}" //expected-error {{delimited escape sequence cannot be empty}} + "\o{" //expected-error {{expected '}'}} + "\o{8}" //expected-error {{invalid digit '8' in escape sequence}} + ; + +void ucn() { + char a = '\u{1234}'; // expected-error {{character too large for enclosing character literal type}} + // expected-warning@-1 
{{delimited escape sequences are a Clang extension}} + + unsigned b = U'\u{1234}'; // expected-warning {{extension}} + +#ifdef __cplusplus + unsigned b2 = U'\u{1}'; // expected-warning {{extension}} +#else + unsigned b2 = U'\u{1}'; //expected-error {{universal character name refers to a control character}} +#endif + + unsigned c = U'\u{000000000001234}'; // expected-warning {{extension}} + unsigned d = U'\u{111111111}'; //expected-error {{hex escape sequence out of range}} +} + +void hex() { + char a = '\x{1}'; // expected-warning {{extension}} + char b = '\x{abcdegggggabc}'; // expected-error 5{{invalid digit 'g' in escape sequence}} + char c = '\x{ff1}'; // expected-error {{hex escape sequence out of range}} + +#if __WCHAR_MAX__ > 0xFFFF + unsigned d = L'\x{FFFFFFFF}'; // expected-warning {{extension}} + unsigned e = L'\x{100000000}'; // expected-error {{hex escape sequence out of range}} +#else + unsigned f = L'\x{FFFF}'; // expected-warning {{extension}} + unsigned g = L'\x{10000}'; // expected-error {{hex escape sequence out of range}} +#endif + unsigned h = U'\x{FFFFFFFF}'; // expected-warning {{extension}} + unsigned i = U'\x{100000000}'; // expected-error {{hex escape sequence out of range}} +} + +void octal() { + char a = '\o{1}'; // expected-warning {{extension}} + char b = '\o{12345678881238}'; // expected-error 4{{invalid digit '8' in escape sequence}} + char c = '\o{777}'; // //expected-error {{octal escape sequence out of range}} +#if __WCHAR_MAX__ > 0xFFFF + unsigned d = L'\o{37777777777}'; // expected-warning {{extension}} + unsigned e = L'\o{40000000000}'; // expected-error {{octal escape sequence out of range}} +#else + unsigned d = L'\o{177777}'; // expected-warning {{extension}} + unsigned e = L'\o{200000}'; // expected-error {{octal escape sequence out of range}} +#endif +} + +void concat() { + (void)"\x{" "12}"; // expected-error {{expected '}'}} + (void)"\u{" "12}"; // expected-error {{expected '}'}} + (void)"\o{" "12}"; // expected-error 
{{expected '}'}} + + (void)"\x{12" "}"; // expected-error {{expected '}'}} + (void)"\u{12" "}"; // expected-error {{expected '}'}} + (void)"\o{12" "}"; // expected-error {{expected '}'}} +} + +void separators() { + (void)"\x{12'3}"; // expected-error {{invalid digit ''' in escape sequence}} + (void)"\u{12'3}"; // expected-error {{invalid digit ''' in escape sequence}} + (void)"\o{12'3}"; // expected-error {{invalid digit ''' in escape sequence}} + + '\x{12'3'}'; // expected-error {{expected '}'}} + // expected-error@-1 2{{expected ';'}} + // expected-warning@-2 3{{expression result unused}} +} diff --git a/clang/test/Parser/cxx11-user-defined-literals.cpp b/clang/test/Parser/cxx11-user-defined-literals.cpp --- a/clang/test/Parser/cxx11-user-defined-literals.cpp +++ b/clang/test/Parser/cxx11-user-defined-literals.cpp @@ -129,6 +129,9 @@ int operator""_℮""_\u212e""_\U0000212e""(const char*, size_t); int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t); int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t); + +int operator""_\u{212f}(char); + int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e""; void operator""_℮""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}} diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c --- a/clang/test/Preprocessor/ucn-pp-identifier.c +++ b/clang/test/Preprocessor/ucn-pp-identifier.c @@ -16,6 +16,10 @@ #error "This should never happen" #endif +#if a\u{FD}() //expected-warning {{Clang extension}} +#error "This should never happen" +#endif + #if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}} #endif #if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor 
expression}} @@ -27,6 +31,7 @@ #define \ufffe // expected-error {{macro name must be an identifier}} #define \U10000000 // expected-error {{macro name must be an identifier}} #define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro name must be an identifier}} +#define \u{fffe} // expected-error {{macro name must be an identifier}} expected-warning {{Clang extension}} #define a\u0024 @@ -103,3 +108,8 @@ // CHECK-NEXT: #define capital_u_\U00FC // CHECK-NEXT: {{^ \^}} // CHECK-NEXT: {{^ u}} + +#define \u{} // expected-warning {{empty delimited universal character name; treating as '\' 'u' '{' '}'}} expected-error {{macro name must be an identifier}} +#define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}} +#define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}} +#define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}} diff --git a/clang/test/Sema/ucn-identifiers.c b/clang/test/Sema/ucn-identifiers.c --- a/clang/test/Sema/ucn-identifiers.c +++ b/clang/test/Sema/ucn-identifiers.c @@ -17,6 +17,7 @@ \u00fcber(1); über(2); \U000000FCber(3); + \u{FC}ber(4); // expected-warning {{Clang extension}} } void badCalls() { @@ -24,7 +25,7 @@ \u00fcber = 0; // expected-error{{non-object type 'void (int)' is not assignable}} über(1, 2); - \U000000FCber(); + \U000000FCber(); #ifdef __cplusplus // expected-error@-3 {{no matching function}} // expected-error@-3 {{no matching function}}