Changeset View
Standalone View
clang/lib/Lex/Lexer.cpp
Show First 20 Lines • Show All 1,188 Lines • ▼ Show 20 Lines | |||||||||||||
} | } | ||||||||||||
/// DecodeTrigraphChar - If the specified character is a legal trigraph when | /// DecodeTrigraphChar - If the specified character is a legal trigraph when | ||||||||||||
/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, | /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, | ||||||||||||
/// return the result character. Finally, emit a warning about trigraph use | /// return the result character. Finally, emit a warning about trigraph use | ||||||||||||
/// whether trigraphs are enabled or not. | /// whether trigraphs are enabled or not. | ||||||||||||
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { | static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { | ||||||||||||
char Res = GetTrigraphCharForLetter(*CP); | char Res = GetTrigraphCharForLetter(*CP); | ||||||||||||
if (!Res || !L) return Res; | if (!Res) | ||||||||||||
return Res; | |||||||||||||
if (!Trigraphs) { | if (!Trigraphs) { | ||||||||||||
if (!L->isLexingRawMode()) | if (L && !L->isLexingRawMode()) | ||||||||||||
L->Diag(CP-2, diag::trigraph_ignored); | L->Diag(CP-2, diag::trigraph_ignored); | ||||||||||||
return 0; | return 0; | ||||||||||||
} | } | ||||||||||||
if (!L->isLexingRawMode()) | if (L && !L->isLexingRawMode()) | ||||||||||||
L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); | L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); | ||||||||||||
return Res; | return Res; | ||||||||||||
} | } | ||||||||||||
/// getEscapedNewLineSize - Return the size of the specified escaped newline, | /// getEscapedNewLineSize - Return the size of the specified escaped newline, | ||||||||||||
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a | /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a | ||||||||||||
/// trigraph equivalent on entry to this function. | /// trigraph equivalent on entry to this function. | ||||||||||||
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { | unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { | ||||||||||||
▲ Show 20 Lines • Show All 2,022 Lines • ▼ Show 20 Lines | if (Delimited && C == '}') { | ||||||||||||
break; | break; | ||||||||||||
} | } | ||||||||||||
unsigned Value = llvm::hexDigitValue(C); | unsigned Value = llvm::hexDigitValue(C); | ||||||||||||
if (Value == -1U) { | if (Value == -1U) { | ||||||||||||
if (!Delimited) | if (!Delimited) | ||||||||||||
break; | break; | ||||||||||||
if (Diagnose) | if (Diagnose) | ||||||||||||
Diag(BufferPtr, diag::warn_delimited_ucn_incomplete) | Diag(SlashLoc, diag::warn_delimited_ucn_incomplete) | ||||||||||||
<< StringRef(KindLoc, 1); | << StringRef(KindLoc, 1); | ||||||||||||
return std::nullopt; | return std::nullopt; | ||||||||||||
} | } | ||||||||||||
if (CodePoint & 0xF000'0000) { | if (CodePoint & 0xF000'0000) { | ||||||||||||
if (Diagnose) | if (Diagnose) | ||||||||||||
Diag(KindLoc, diag::err_escape_too_large) << 0; | Diag(KindLoc, diag::err_escape_too_large) << 0; | ||||||||||||
return std::nullopt; | return std::nullopt; | ||||||||||||
} | } | ||||||||||||
CodePoint <<= 4; | CodePoint <<= 4; | ||||||||||||
CodePoint |= Value; | CodePoint |= Value; | ||||||||||||
CurPtr += CharSize; | CurPtr += CharSize; | ||||||||||||
Count++; | Count++; | ||||||||||||
} | } | ||||||||||||
if (Count == 0) { | if (Count == 0) { | ||||||||||||
if (Diagnose) | if (Diagnose) | ||||||||||||
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty | Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty | ||||||||||||
: diag::warn_ucn_escape_no_digits) | : diag::warn_ucn_escape_no_digits) | ||||||||||||
<< StringRef(KindLoc, 1); | << StringRef(KindLoc, 1); | ||||||||||||
return std::nullopt; | return std::nullopt; | ||||||||||||
} | } | ||||||||||||
if (Delimited && Kind == 'U') { | if (Delimited && Kind == 'U') { | ||||||||||||
if (Diagnose) | if (Diagnose) | ||||||||||||
Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); | Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); | ||||||||||||
return std::nullopt; | return std::nullopt; | ||||||||||||
} | } | ||||||||||||
if (!Delimited && Count != NumHexDigits) { | if (!Delimited && Count != NumHexDigits) { | ||||||||||||
if (Diagnose) { | if (Diagnose) { | ||||||||||||
Diag(BufferPtr, diag::warn_ucn_escape_incomplete); | Diag(SlashLoc, diag::warn_ucn_escape_incomplete); | ||||||||||||
// If the user wrote \U1234, suggest a fixit to \u. | // If the user wrote \U1234, suggest a fixit to \u. | ||||||||||||
if (Count == 4 && NumHexDigits == 8) { | if (Count == 4 && NumHexDigits == 8) { | ||||||||||||
CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); | CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); | ||||||||||||
Diag(KindLoc, diag::note_ucn_four_not_eight) | Diag(KindLoc, diag::note_ucn_four_not_eight) | ||||||||||||
<< FixItHint::CreateReplacement(URange, "u"); | << FixItHint::CreateReplacement(URange, "u"); | ||||||||||||
} | } | ||||||||||||
} | } | ||||||||||||
return std::nullopt; | return std::nullopt; | ||||||||||||
} | } | ||||||||||||
if (Delimited && PP) { | if (Delimited && PP) { | ||||||||||||
Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b | Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b | ||||||||||||
? diag::warn_cxx2b_delimited_escape_sequence | ? diag::warn_cxx2b_delimited_escape_sequence | ||||||||||||
: diag::ext_delimited_escape_sequence) | : diag::ext_delimited_escape_sequence) | ||||||||||||
<< /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0); | << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0); | ||||||||||||
} | } | ||||||||||||
if (Result) { | if (Result) { | ||||||||||||
Result->setFlag(Token::HasUCN); | Result->setFlag(Token::HasUCN); | ||||||||||||
if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0))) | // If the UCN contains either a trigraph or a line splicing, | ||||||||||||
// we need to call getAndAdvanceChar again to set the appropriate flags | |||||||||||||
// on Result. | |||||||||||||
if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) | |||||||||||||
StartPtr = CurPtr; | StartPtr = CurPtr; | ||||||||||||
else | else | ||||||||||||
while (StartPtr != CurPtr) | while (StartPtr != CurPtr) | ||||||||||||
(void)getAndAdvanceChar(StartPtr, *Result); | (void)getAndAdvanceChar(StartPtr, *Result); | ||||||||||||
} else { | } else { | ||||||||||||
StartPtr = CurPtr; | StartPtr = CurPtr; | ||||||||||||
} | } | ||||||||||||
return CodePoint; | return CodePoint; | ||||||||||||
} | } | ||||||||||||
llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, | llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, | ||||||||||||
const char *SlashLoc, | |||||||||||||
Token *Result) { | Token *Result) { | ||||||||||||
unsigned CharSize; | unsigned CharSize; | ||||||||||||
bool Diagnose = Result && !isLexingRawMode(); | bool Diagnose = Result && !isLexingRawMode(); | ||||||||||||
aaron.ballman: Spurious whitespace | |||||||||||||
char C = getCharAndSize(StartPtr, CharSize); | char C = getCharAndSize(StartPtr, CharSize); | ||||||||||||
assert(C == 'N' && "expected \\N{...}"); | assert(C == 'N' && "expected \\N{...}"); | ||||||||||||
const char *CurPtr = StartPtr + CharSize; | const char *CurPtr = StartPtr + CharSize; | ||||||||||||
const char *KindLoc = &CurPtr[-1]; | const char *KindLoc = &CurPtr[-1]; | ||||||||||||
C = getCharAndSize(CurPtr, CharSize); | C = getCharAndSize(CurPtr, CharSize); | ||||||||||||
if (C != '{') { | if (C != '{') { | ||||||||||||
if (Diagnose) | if (Diagnose) | ||||||||||||
Diag(StartPtr, diag::warn_ucn_escape_incomplete); | Diag(SlashLoc, diag::warn_ucn_escape_incomplete); | ||||||||||||
Not Done ReplyInline ActionsI was able to confirm that StartPtr does point to N. See the diagnostic generated at https://godbolt.org/z/qnajcMeso; the diagnostic caret points to N instead of \. <source>:1:2: warning: incomplete delimited universal character name; treating as '\' 'N' '{' identifier [-Wunicode] \N{abc ^ tahonermann: I was able to confirm that `StartPtr` does point to `N`. See the diagnostic generated at https… | |||||||||||||
Not Done ReplyInline ActionsI think this still needs to be addressed. tryReadNumericUCN() handles this case by requiring that callers pass the location of the \ as SlashLoc. I guess this is technically required since the \ character could be written as either \ or the ??/ trigraph. tahonermann: I think this still needs to be addressed. `tryReadNumericUCN()` handles this case by requiring… | |||||||||||||
Done (by adding SlashLoc)! cor3ntin: Done (by adding SlashLoc)!
There is still room for improvement here but if we wanted to clean… | |||||||||||||
return std::nullopt; | return std::nullopt; | ||||||||||||
} | } | ||||||||||||
CurPtr += CharSize; | CurPtr += CharSize; | ||||||||||||
const char *StartName = CurPtr; | const char *StartName = CurPtr; | ||||||||||||
bool FoundEndDelimiter = false; | bool FoundEndDelimiter = false; | ||||||||||||
llvm::SmallVector<char, 30> Buffer; | llvm::SmallVector<char, 30> Buffer; | ||||||||||||
while (C) { | while (C) { | ||||||||||||
C = getCharAndSize(CurPtr, CharSize); | C = getCharAndSize(CurPtr, CharSize); | ||||||||||||
CurPtr += CharSize; | CurPtr += CharSize; | ||||||||||||
if (C == '}') { | if (C == '}') { | ||||||||||||
FoundEndDelimiter = true; | FoundEndDelimiter = true; | ||||||||||||
break; | break; | ||||||||||||
} | } | ||||||||||||
if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ') | if (isVerticalWhitespace(C)) | ||||||||||||
Not Done ReplyInline ActionsisVerticalWhitespace()? aaron.ballman: `isVerticalWhitespace()`? | |||||||||||||
break; | break; | ||||||||||||
Buffer.push_back(C); | Buffer.push_back(C); | ||||||||||||
} | } | ||||||||||||
if (!FoundEndDelimiter || Buffer.empty()) { | if (!FoundEndDelimiter || Buffer.empty()) { | ||||||||||||
if (Diagnose) | if (Diagnose) | ||||||||||||
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty | Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty | ||||||||||||
: diag::warn_delimited_ucn_incomplete) | : diag::warn_delimited_ucn_incomplete) | ||||||||||||
<< StringRef(KindLoc, 1); | << StringRef(KindLoc, 1); | ||||||||||||
return std::nullopt; | return std::nullopt; | ||||||||||||
} | } | ||||||||||||
StringRef Name(Buffer.data(), Buffer.size()); | StringRef Name(Buffer.data(), Buffer.size()); | ||||||||||||
llvm::Optional<char32_t> Res = | llvm::Optional<char32_t> Match = | ||||||||||||
llvm::sys::unicode::nameToCodepointStrict(Name); | llvm::sys::unicode::nameToCodepointStrict(Name); | ||||||||||||
llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; | llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; | ||||||||||||
if (!Res) { | if (!Match) { | ||||||||||||
if (!isLexingRawMode()) { | |||||||||||||
Diag(StartPtr, diag::err_invalid_ucn_name) | |||||||||||||
<< StringRef(Buffer.data(), Buffer.size()); | |||||||||||||
LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); | LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); | ||||||||||||
if (Diagnose) { | |||||||||||||
Diag(StartName, diag::err_invalid_ucn_name) | |||||||||||||
<< StringRef(Buffer.data(), Buffer.size()) | |||||||||||||
<< makeCharRange(*this, StartName, CurPtr - CharSize); | |||||||||||||
if (LooseMatch) { | if (LooseMatch) { | ||||||||||||
Diag(StartName, diag::note_invalid_ucn_name_loose_matching) | Diag(StartName, diag::note_invalid_ucn_name_loose_matching) | ||||||||||||
<< FixItHint::CreateReplacement( | << FixItHint::CreateReplacement( | ||||||||||||
makeCharRange(*this, StartName, CurPtr - CharSize), | makeCharRange(*this, StartName, CurPtr - CharSize), | ||||||||||||
LooseMatch->Name); | LooseMatch->Name); | ||||||||||||
} | } | ||||||||||||
} | } | ||||||||||||
// When finding a match using Unicode loose matching rules | |||||||||||||
// recover after having emitted a diagnostic. | |||||||||||||
if (!LooseMatch) | |||||||||||||
return std::nullopt; | |||||||||||||
// We do not offer misspelled character names suggestions here | // We do not offer misspelled character names suggestions here | ||||||||||||
// as the set of what would be a valid suggestion depends on context, | // as the set of what would be a valid suggestion depends on context, | ||||||||||||
// and we should not make invalid suggestions. | // and we should not make invalid suggestions. | ||||||||||||
} | } | ||||||||||||
if (Diagnose && PP && !LooseMatch) | if (Diagnose && Match) | ||||||||||||
Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b | Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b | ||||||||||||
? diag::warn_cxx2b_delimited_escape_sequence | ? diag::warn_cxx2b_delimited_escape_sequence | ||||||||||||
: diag::ext_delimited_escape_sequence) | : diag::ext_delimited_escape_sequence) | ||||||||||||
<< /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); | << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); | ||||||||||||
if (LooseMatch) | // If no diagnostic has been emitted yet, likely because we are doing a | ||||||||||||
Not Done ReplyInline ActionsWhy do we only want to do this if we're diagnosing? aaron.ballman: Why do we only want to do this if we're diagnosing? | |||||||||||||
The scenario we want to avoid: There is a tentative parse of a token, no diagnostic is emitted. cor3ntin: The scenario we want to avoid:
There is a tentative parse of a token, no diagnostic is… | |||||||||||||
Not Done ReplyInline ActionsAhhh okay, that makes sense. I think it'd help to add a comment here to explain that because it's a bit subtle. aaron.ballman: Ahhh okay, that makes sense. I think it'd help to add a comment here to explain that because… | |||||||||||||
Res = LooseMatch->CodePoint; | // tentative lexing, we do not want to recover here to make sure the token | ||||||||||||
// will not be incorrectly considered valid. This function will be called | |||||||||||||
// again and a diagnostic emitted then. | |||||||||||||
if (LooseMatch && Diagnose) | |||||||||||||
Match = LooseMatch->CodePoint; | |||||||||||||
Not Done ReplyInline Actions
I'm having a hard time understanding what this comment is trying to convey. The comment response to Aaron was much more helpful to me. How about this suggested edit? tahonermann: I'm having a hard time understanding what this comment is trying to convey. The comment… | |||||||||||||
It's not that a diagnostic would be emitted twice, is that it would not be emitted at all. cor3ntin: It's not that a diagnostic would be emitted twice, is that it would not be emitted at all.
By… | |||||||||||||
Not Done ReplyInline ActionsMore evidence of the difficulty I'm having appreciating this comment :) Does this sound right? // Only map a loose match to a code point when in a diagnosing state. // If a mapped code point were to be returned in a non-diagnosing state, // token caching would prevent the opportunity to issue a diagnostic when // the token is later used. Tangent: It might be worth renaming one of Res and Result. I keep getting confused due to the similar names. Perhaps rename Result to Token or ResultToken? tahonermann: More evidence of the difficulty I'm having appreciating this comment :)
Does this sound right? | |||||||||||||
I'm not sure caching is exactly right. cor3ntin: I'm not sure caching is exactly right.
There are tentative parses, that should not give false… | |||||||||||||
if (Result) { | if (Result) { | ||||||||||||
Result->setFlag(Token::HasUCN); | Result->setFlag(Token::HasUCN); | ||||||||||||
if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4)) | // If the UCN contains either a trigraph or a line splicing, | ||||||||||||
// we need to call getAndAdvanceChar again to set the appropriate flags | |||||||||||||
// on Result. | |||||||||||||
if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) | |||||||||||||
StartPtr = CurPtr; | StartPtr = CurPtr; | ||||||||||||
else | else | ||||||||||||
while (StartPtr != CurPtr) | while (StartPtr != CurPtr) | ||||||||||||
(void)getAndAdvanceChar(StartPtr, *Result); | (void)getAndAdvanceChar(StartPtr, *Result); | ||||||||||||
} else { | } else { | ||||||||||||
StartPtr = CurPtr; | StartPtr = CurPtr; | ||||||||||||
Not Done ReplyInline ActionsThis is a bit of a tangent, but under what circumstances would CurPtr - StartPtr not be equal to Buffer.size() + 4? Actually, I'm not sure that +4 is correct. It looks like StartPtr is expected to point to N at the beginning of the function, so the initial \ is not included in the range. If N isn't seen, the function returns early. Likewise, if either of the { or } delimiters is not found, the function returns early. I think the goal of this code is to skip the getAndAdvanceChar() machinery when the buffer is being claimed (no need to parse UCNs or trigraphs in the character name), but it looks like, particularly with this DR, that we might always be able to do that. tahonermann: This is a bit of a tangent, but under what circumstances would `CurPtr - StartPtr` not be equal… | |||||||||||||
afaict, 4 is correct here because we are one past-the-end. cor3ntin: afaict, 4 is correct here because we are one past-the-end.
I do however agree that this whole… | |||||||||||||
I looked into it, I'm sure we could improve but not easily, getAndAdvanceChar does set some flags on the token in the presence of trigraphs/line splicing and we need those flags to be set, this is the reason we do need to call that method. cor3ntin: I looked into it, I'm sure we could improve but not easily, `getAndAdvanceChar` does set some… | |||||||||||||
Not Done ReplyInline ActionsMy concern is that, as is, the code always takes the else branch (except when Result is non-null). Assuming a buffer containing "X", the pointers are arranged as follows (where $ is one past the end). \ N { X } $ | | `- CurPtr | `- Buffer `- StartPtr CurPtr - StartPtr is 4, but Buffer.size() + 4 is 5 (Buffer.size() is 1 in this case). I think there might be an easy test to see if this is working as intended. If it isn't, I would expect a diagnostic to be issued if trigraphs are enabled and the buffer contains a trigraph sequence. Something like: \N{LOTUS??>} tahonermann: My concern is that, as is, the code always takes the `else` branch (except when `Result` is non… | |||||||||||||
I can try to add tests
Yes, the if branch sole purpose is to set some state in Result. At the start of the function, StartPtr points to \ There may be a potential refactor here, which is to have getAndAdvanceChar take a bool & ShouldCleanupToken parameter instead of a token so that we don't have to do that dance, but it's a bit outside of the scope of this patch... cor3ntin: I can try to add tests
> My concern is that, as is, the code always takes the else branch… | |||||||||||||
Not Done ReplyInline Actions
It doesn't look like it does. The first use of StartPtr is at line 3314 and it expects to read N: 3314: char C = getCharAndSize(StartPtr, CharSize); 3315: assert(C == 'N' && "expected \\N{...}"); tahonermann: > At the start of the function, StartPtr points to `\`
It doesn't look like it does. The first… | |||||||||||||
Yes, you are right. There was a bug in \u too, probably has been there for ages. cor3ntin: Yes, you are right. There was a bug in \u too, probably has been there for ages.
It's… | |||||||||||||
Not Done ReplyInline ActionsThank you; the added comment makes this more clear. Basically, we only want to skip the call to getAndAdvanceChar() when we're sure the token wasn't spelled with trigraphs or line spaces. tahonermann: Thank you; the added comment makes this more clear. Basically, we only want to skip the call to… | |||||||||||||
} | } | ||||||||||||
Not Done ReplyInline Actions
aaron.ballman: | |||||||||||||
return *Res; | return Match ? llvm::Optional<uint32_t>(*Match) : std::nullopt; | ||||||||||||
} | } | ||||||||||||
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, | uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, | ||||||||||||
Token *Result) { | Token *Result) { | ||||||||||||
unsigned CharSize; | unsigned CharSize; | ||||||||||||
llvm::Optional<uint32_t> CodePointOpt; | llvm::Optional<uint32_t> CodePointOpt; | ||||||||||||
char Kind = getCharAndSize(StartPtr, CharSize); | char Kind = getCharAndSize(StartPtr, CharSize); | ||||||||||||
if (Kind == 'u' || Kind == 'U') | if (Kind == 'u' || Kind == 'U') | ||||||||||||
CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); | CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); | ||||||||||||
else if (Kind == 'N') | else if (Kind == 'N') | ||||||||||||
CodePointOpt = tryReadNamedUCN(StartPtr, Result); | CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result); | ||||||||||||
if (!CodePointOpt) | if (!CodePointOpt) | ||||||||||||
return 0; | return 0; | ||||||||||||
uint32_t CodePoint = *CodePointOpt; | uint32_t CodePoint = *CodePointOpt; | ||||||||||||
// Don't apply C family restrictions to UCNs in assembly mode | // Don't apply C family restrictions to UCNs in assembly mode | ||||||||||||
if (LangOpts.AsmPreprocessor) | if (LangOpts.AsmPreprocessor) | ||||||||||||
▲ Show 20 Lines • Show All 1,026 Lines • Show Last 20 Lines |
Spurious whitespace