diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -125,6 +125,15 @@
   "identifier contains Unicode character <U+%0> that is invisible in "
   "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
 
+def ext_delimited_escape_sequence : Extension<
+  "delimited escape sequences are a clang extension">,
+  InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
+def err_delimited_escape_empty : Error<
+  "empty escape sequence">;
+def err_delimited_escape_missing_brace: Error<
+  "expected '{' after '\\%0' escape sequence">;
+def err_delimited_escape_invalid : Error<
+  "invalid digit '%0' in escape sequence">;
 def err_hex_escape_no_digits : Error<
   "\\%0 used with no following hex digits">;
 def warn_ucn_escape_no_digits : Warning<
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -95,6 +95,10 @@
                                   DiagnosticsEngine *Diags,
                                   const LangOptions &Features) {
   const char *EscapeBegin = ThisTokBuf;
+  // Set when the escape uses the delimited form, e.g. \x{...} or \o{...}.
+  bool Delimited = false;
+  // Set once the closing '}' of a delimited escape has been consumed.
+  bool EndDelimiterFound = false;
 
   // Skip the '\' char.
   ++ThisTokBuf;
@@ -143,26 +147,51 @@
     break;
   case 'x': { // Hex escape.
     ResultChar = 0;
-    if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
+    if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
+      Delimited = true;
+      ThisTokBuf++;
+      // Do not peek past the end of the token when looking for an empty '{}'.
+      if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '}') {
+        HadError = true;
+        if (Diags)
+          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+               diag::err_delimited_escape_empty);
+        return ResultChar;
+      }
+    } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
       if (Diags)
         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
              diag::err_hex_escape_no_digits) << "x";
       HadError = true;
       break;
     }
 
     // Hex escapes are a maximal series of hex digits.
     bool Overflow = false;
     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
-      int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
-      if (CharVal == -1) break;
+      if (Delimited && *ThisTokBuf == '}') {
+        ThisTokBuf++;
+        EndDelimiterFound = true;
+        break;
+      }
+      int CharVal = llvm::hexDigitValue(*ThisTokBuf);
+      if (CharVal == -1) {
+        // Non-delimited hex escape sequences stop at the first non-hex digit.
+        if (!Delimited)
+          break;
+        HadError = true;
+        if (Diags)
+          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+               diag::err_delimited_escape_invalid)
+              << StringRef(ThisTokBuf, 1);
+        continue;
+      }
       // About to shift out a digit?
       if (ResultChar & 0xF0000000)
         Overflow = true;
       ResultChar <<= 4;
       ResultChar |= CharVal;
     }
-
     // See if any bits will be truncated when evaluated as a character.
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       Overflow = true;
@@ -170,9 +199,13 @@
     }
 
     // Check for overflow.
-    if (Overflow && Diags)   // Too many digits to fit in
-      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
-           diag::err_escape_too_large) << 0;
+    if (!HadError && Overflow) { // Too many digits to fit in
+      HadError = true;
+      if (Diags)
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_escape_too_large)
+            << 0;
+    }
     break;
   }
   case '0': case '1': case '2': case '3':
@@ -200,7 +233,63 @@
     }
     break;
   }
+  case 'o': { // Delimited octal escape: \o{...}.
+    bool Overflow = false;
+    if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
+      HadError = true;
+      if (Diags)
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_delimited_escape_missing_brace)
+            << "o";
+      break;
+    }
+    ResultChar = 0;
+    Delimited = true;
+    ++ThisTokBuf;
+    // Do not peek past the end of the token when looking for an empty '{}'.
+    if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '}') {
+      HadError = true;
+      if (Diags)
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_delimited_escape_empty);
+      return ResultChar;
+    }
+
+    while (ThisTokBuf != ThisTokEnd) {
+      if (*ThisTokBuf == '}') {
+        EndDelimiterFound = true;
+        ThisTokBuf++;
+        break;
+      }
+      if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
+        HadError = true;
+        if (Diags)
+          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+               diag::err_delimited_escape_invalid)
+              << StringRef(ThisTokBuf, 1);
+        ThisTokBuf++;
+        continue;
+      }
+      // Any of the top three bits would be shifted out by the '<<= 3' below.
+      if (ResultChar & 0xE0000000)
+        Overflow = true;
+
+      ResultChar <<= 3;
+      ResultChar |= *ThisTokBuf++ - '0';
+    }
+    // Check for overflow. Reject '\777', but not L'\777'.
+    if (!HadError &&
+        (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
+      HadError = true;
+      if (Diags)
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_escape_too_large)
+            << 1;
+      ResultChar &= ~0U >> (32 - CharWidth);
+    }
+    break;
+  }
 
   // Otherwise, these are not valid escapes.
   case '(': case '{': case '[': case '%':
     // GCC accepts these as extensions.  We warn about them as such though.
@@ -224,6 +313,18 @@
     break;
   }
 
+  // Report a missing '}' or, for a well-formed escape, the extension use.
+  if (Delimited && Diags) {
+    if (!EndDelimiterFound)
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::err_expected)
+          << tok::r_brace;
+    else if (!HadError) {
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_delimited_escape_sequence);
+    }
+  }
+
   return ResultChar;
 }
 
@@ -282,28 +383,83 @@
   // Skip the '\u' char's.
   ThisTokBuf += 2;
 
-  if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
+  bool Delimited = false;
+  bool EndDelimiterFound = false;
+  bool HasError = false;
+
+  // Only '\u' inside a character or string literal may use the '{...}' form.
+  if (UcnBegin[1] == 'u' && in_char_string_literal &&
+      ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
+    Delimited = true;
+    ThisTokBuf++;
+  } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
     if (Diags)
       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
            diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
     return false;
   }
   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
-  unsigned short UcnLenSave = UcnLen;
-  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
-    int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
-    if (CharVal == -1) break;
+
+  bool Overflow = false;
+  unsigned short Count = 0;
+  for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
+       ++ThisTokBuf) {
+    if (Delimited && *ThisTokBuf == '}') {
+      ++ThisTokBuf;
+      EndDelimiterFound = true;
+      break;
+    }
+    int CharVal = llvm::hexDigitValue(*ThisTokBuf);
+    if (CharVal == -1) {
+      HasError = true;
+      if (!Delimited)
+        break;
+      if (Diags) {
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             diag::err_delimited_escape_invalid)
+            << StringRef(ThisTokBuf, 1);
+      }
+      Count++;
+      continue;
+    }
+    if (UcnVal & 0xF0000000) {
+      Overflow = true;
+      continue;
+    }
     UcnVal <<= 4;
     UcnVal |= CharVal;
+    Count++;
+  }
+
+  if (Overflow) {
+    if (Diags)
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_escape_too_large)
+          << 0;
+    return false;
+  }
+
+  if (Delimited && !EndDelimiterFound) {
+    if (Diags) {
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_expected)
+          << tok::r_brace;
+    }
+    return false;
   }
+
   // If we didn't consume the proper number of digits, there is a problem.
-  if (UcnLenSave) {
+  if (Count == 0 || (!Delimited && Count != UcnLen)) {
     if (Diags)
       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
-           diag::err_ucn_escape_incomplete);
+           Delimited ? diag::err_delimited_escape_empty
+                     : diag::err_ucn_escape_incomplete);
     return false;
   }
 
+  if (HasError)
+    return false;
+
   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
@@ -334,9 +490,15 @@
       return false;
   }
 
-  if (!Features.CPlusPlus && !Features.C99 && Diags)
+  if (!Features.CPlusPlus && !Features.C99 && Diags) {
     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
          diag::warn_ucn_not_valid_in_c89_literal);
+  }
+
+  if (Delimited && Diags) {
+    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+         diag::ext_delimited_escape_sequence);
+  }
 
   return true;
 }
diff --git a/clang/test/Lexer/char-escapes-delimited.c b/clang/test/Lexer/char-escapes-delimited.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Lexer/char-escapes-delimited.c
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -x c -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -x c -fwchar-type=short -fno-signed-wchar -fsyntax-only -pedantic -verify %s
+
+const char *errors =
+    "\u{}"  //expected-error {{empty escape sequence}}
+    "\u{"   //expected-error {{expected '}'}}
+    "\u{h}" //expected-error {{invalid digit 'h' in escape sequence}}
+    "\x{}"  //expected-error {{empty escape sequence}}
+    "\x{"   //expected-error {{expected '}'}}
+    "\x{h}" //expected-error {{invalid digit 'h' in escape sequence}}
+    "\o{}"  //expected-error {{empty escape sequence}}
+    "\o{"   //expected-error {{expected '}'}}
+    "\o"    //expected-error {{expected '{' after '\o' escape sequence}}
+    "\o{8}" //expected-error {{invalid digit '8' in escape sequence}}
+    ;
+
+void ucn() {
+  char a = '\u{1234}'; // expected-error {{character too large for enclosing character literal type}}
+                       // expected-warning@-1 {{delimited escape sequences are a clang extension}}
+
+  unsigned b = U'\u{1234}'; // expected-warning {{extension}}
+
+#ifdef __cplusplus
+  unsigned b2 = U'\u{1}'; // expected-warning {{extension}}
+#else
+  unsigned b2 = U'\u{1}'; //expected-error {{universal character name refers to a control character}}
+#endif
+
+  unsigned c = U'\u{000000000001234}'; // expected-warning {{extension}}
+  unsigned d = U'\u{111111111}';       //expected-error {{hex escape sequence out of range}}
+}
+
+void hex() {
+  char a = '\x{1}';             // expected-warning {{extension}}
+  char b = '\x{abcdegggggabc}'; // expected-error 5{{invalid digit 'g' in escape sequence}}
+  char c = '\x{ff1}';           // expected-error {{hex escape sequence out of range}}
+
+#if __WCHAR_MAX__ > 0xFFFF
+  unsigned d = L'\x{FFFFFFFF}';  // expected-warning {{extension}}
+  unsigned e = L'\x{100000000}'; // expected-error {{hex escape sequence out of range}}
+#else
+  unsigned f = L'\x{FFFF}';   // expected-warning {{extension}}
+  unsigned g = L'\x{10000}';  // expected-error {{hex escape sequence out of range}}
+#endif
+  unsigned h = U'\x{FFFFFFFF}';  // expected-warning {{extension}}
+  unsigned i = U'\x{100000000}'; // expected-error {{hex escape sequence out of range}}
+}
+
+void octal() {
+  char a = '\o{1}';              // expected-warning {{extension}}
+  char b = '\o{12345678881238}'; // expected-error 4{{invalid digit '8' in escape sequence}}
+  char c = '\o{777}';            // //expected-error {{octal escape sequence out of range}}
+#if __WCHAR_MAX__ > 0xFFFF
+  unsigned d = L'\o{37777777777}'; // expected-warning {{extension}}
+  unsigned e = L'\o{40000000000}'; // expected-error {{octal escape sequence out of range}}
+#else
+  unsigned d = L'\o{177777}'; // expected-warning {{extension}}
+  unsigned e = L'\o{200000}'; // expected-error {{octal escape sequence out of range}}
+#endif
+  unsigned f = U'\o{100000000000}'; // expected-error {{octal escape sequence out of range}}
+}
+
+void concat() {
+  (void)"\x{" "12}"; // expected-error {{expected '}'}}
+  (void)"\u{" "12}"; // expected-error {{expected '}'}}
+  (void)"\o{" "12}"; // expected-error {{expected '}'}}
+
+  (void)"\x{12" "}"; // expected-error {{expected '}'}}
+  (void)"\u{12" "}"; // expected-error {{expected '}'}}
+  (void)"\o{12" "}"; // expected-error {{expected '}'}}
+}
+
+void separators() {
+  (void)"\x{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
+  (void)"\u{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
+  (void)"\o{12'3}"; // expected-error {{invalid digit ''' in escape sequence}}
+
+  '\x{12'3'}'; // expected-error {{expected '}'}}
+               // expected-error@-1 2{{expected ';'}}
+               // expected-warning@-2 3{{expression result unused}}
+}