diff --git a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp --- a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp +++ b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp @@ -245,7 +245,7 @@ // parent_path. // FIXME: Don't rely on source text. const char *End = Source.end(); - while (isIdentifierBody(*End) || *End == ':') + while (isAsciiIdentifierContinue(*End) || *End == ':') ++End; return std::string(Source.begin(), End); diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp @@ -129,7 +129,7 @@ const StringRef Port = "unsigned short port"; const char *Data = Result.SourceManager->getCharacterData(Loc); if (!std::strncmp(Data, Port.data(), Port.size()) && - !isIdentifierBody(Data[Port.size()])) + !isAsciiIdentifierContinue(Data[Port.size()])) return; std::string Replacement = diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -464,7 +464,7 @@ Failure.FixStatus = ShouldFixStatus::ConflictsWithKeyword; else if (Ident->hasMacroDefinition()) Failure.FixStatus = ShouldFixStatus::ConflictsWithMacroDefinition; - } else if (!isValidIdentifier(Info.Fixup)) { + } else if (!isValidAsciiIdentifier(Info.Fixup)) { Failure.FixStatus = ShouldFixStatus::FixInvalidIdentifier; } diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1842,14 +1842,14 @@ CompletionPrefix Result; // Consume the unqualified name. We only handle ASCII characters. 
- // isIdentifierBody will let us match "0invalid", but we don't mind. - while (!Rest.empty() && isIdentifierBody(Rest.back())) + // isAsciiIdentifierContinue will let us match "0invalid", but we don't mind. + while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back())) Rest = Rest.drop_back(); Result.Name = Content.slice(Rest.size(), Offset); // Consume qualifiers. while (Rest.consume_back("::") && !Rest.endswith(":")) // reject :::: - while (!Rest.empty() && isIdentifierBody(Rest.back())) + while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back())) Rest = Rest.drop_back(); Result.Qualifier = Content.slice(Rest.size(), Result.Name.begin() - Content.begin()); @@ -2057,8 +2057,8 @@ return true; // Complete words. Give non-ascii characters the benefit of the doubt. - return !Content.empty() && - (isIdentifierBody(Content.back()) || !llvm::isASCII(Content.back())); + return !Content.empty() && (isAsciiIdentifierContinue(Content.back()) || + !llvm::isASCII(Content.back())); } } // namespace clangd diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -945,9 +945,9 @@ if (Invalid) return llvm::None; unsigned B = Offset, E = Offset; - while (B > 0 && isIdentifierBody(Code[B - 1])) + while (B > 0 && isAsciiIdentifierContinue(Code[B - 1])) --B; - while (E < Code.size() && isIdentifierBody(Code[E])) + while (E < Code.size() && isAsciiIdentifierContinue(Code[E])) ++E; if (B == E) return llvm::None; diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -478,10 +478,10 @@ // We don't check all the rules for non-ascii characters (most are allowed). 
bool AllowDollar = true; // lenient if (llvm::isASCII(Ident.front()) && - !isIdentifierHead(Ident.front(), AllowDollar)) + !isAsciiIdentifierStart(Ident.front(), AllowDollar)) return false; for (char C : Ident) { - if (llvm::isASCII(C) && !isIdentifierBody(C, AllowDollar)) + if (llvm::isASCII(C) && !isAsciiIdentifierContinue(C, AllowDollar)) return false; } return true; diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -50,8 +50,8 @@ /// Returns true if this is a valid first character of a C identifier, /// which is [a-zA-Z_]. -LLVM_READONLY inline bool isIdentifierHead(unsigned char c, - bool AllowDollar = false) { +LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c, + bool AllowDollar = false) { using namespace charinfo; if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER)) return true; @@ -60,8 +60,8 @@ /// Returns true if this is a body character of a C identifier, /// which is [a-zA-Z0-9_]. -LLVM_READONLY inline bool isIdentifierBody(unsigned char c, - bool AllowDollar = false) { +LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c, + bool AllowDollar = false) { using namespace charinfo; if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER)) return true; @@ -186,13 +186,13 @@ /// /// Note that this is a very simple check; it does not accept UCNs as valid /// identifier characters. 
-LLVM_READONLY inline bool isValidIdentifier(StringRef S, - bool AllowDollar = false) { - if (S.empty() || !isIdentifierHead(S[0], AllowDollar)) +LLVM_READONLY inline bool isValidAsciiIdentifier(StringRef S, + bool AllowDollar = false) { + if (S.empty() || !isAsciiIdentifierStart(S[0], AllowDollar)) return false; for (StringRef::iterator I = S.begin(), E = S.end(); I != E; ++I) - if (!isIdentifierBody(*I, AllowDollar)) + if (!isAsciiIdentifierContinue(*I, AllowDollar)) return false; return true; diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h --- a/clang/include/clang/Lex/Lexer.h +++ b/clang/include/clang/Lex/Lexer.h @@ -536,7 +536,8 @@ bool SkipTrailingWhitespaceAndNewLine); /// Returns true if the given character could appear in an identifier. - static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts); + static bool isAsciiIdentifierContinueChar(char c, + const LangOptions &LangOpts); /// Checks whether new line pointed by Str is preceded by escape /// sequence. @@ -573,10 +574,7 @@ bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr); - /// Given that a token begins with the Unicode character \p C, figure out - /// what kind of token it is and dispatch to the appropriate lexing helper - /// function. - bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr); + bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr); /// FormTokenWithChars - When we lex a token, we have identified a span /// starting at BufferPtr, going to TokEnd that forms the token. This method @@ -701,7 +699,7 @@ bool IsStringLiteral); // Helper functions to lex the remainder of a token of the specific type. 
- bool LexIdentifier (Token &Result, const char *CurPtr); + bool LexIdentifierContinue(Token &Result, const char *CurPtr); bool LexNumericConstant (Token &Result, const char *CurPtr); bool LexStringLiteral (Token &Result, const char *CurPtr, tok::TokenKind Kind); diff --git a/clang/lib/ARCMigrate/ObjCMT.cpp b/clang/lib/ARCMigrate/ObjCMT.cpp --- a/clang/lib/ARCMigrate/ObjCMT.cpp +++ b/clang/lib/ARCMigrate/ObjCMT.cpp @@ -1144,7 +1144,7 @@ static bool IsValidIdentifier(ASTContext &Ctx, const char *Name) { - if (!isIdentifierHead(Name[0])) + if (!isAsciiIdentifierStart(Name[0])) return false; std::string NameString = Name; NameString[0] = toLowercase(NameString[0]); diff --git a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp --- a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp +++ b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp @@ -253,7 +253,8 @@ SourceManager &SM = Pass.Ctx.getSourceManager(); char PrevChar = *SM.getCharacterData(InsertLoc.getLocWithOffset(-1)); - if (Lexer::isIdentifierBodyChar(PrevChar, Pass.Ctx.getLangOpts())) + if (Lexer::isAsciiIdentifierContinueChar(PrevChar, + Pass.Ctx.getLangOpts())) BridgeCall += ' '; if (Kind == OBC_BridgeTransfer) diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3883,7 +3883,7 @@ // - ?[A-Z]: The range from \xc1 to \xda. // - ?[0-9]: The set of [,/\:. \n\t'-]. // - ?$XX: A fallback which maps nibbles. - if (isIdentifierBody(Byte, /*AllowDollar=*/true)) { + if (isAsciiIdentifierContinue(Byte, /*AllowDollar=*/true)) { Mangler.getStream() << Byte; } else if (isLetter(Byte & 0x7f)) { Mangler.getStream() << '?' 
<< static_cast<char>(Byte & 0x7f); diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -203,7 +203,7 @@ OS << "."; StringRef Name = getModuleNameFromComponent(*It); - if (!AllowStringLiterals || isValidIdentifier(Name)) + if (!AllowStringLiterals || isValidAsciiIdentifier(Name)) OS << Name; else { OS << '"'; diff --git a/clang/lib/Edit/EditedSource.cpp b/clang/lib/Edit/EditedSource.cpp --- a/clang/lib/Edit/EditedSource.cpp +++ b/clang/lib/Edit/EditedSource.cpp @@ -314,8 +314,8 @@ static bool canBeJoined(char left, char right, const LangOptions &LangOpts) { // FIXME: Should use TokenConcatenation to make sure we don't allow stuff like // making two '<' adjacent. - return !(Lexer::isIdentifierBodyChar(left, LangOpts) && - Lexer::isIdentifierBodyChar(right, LangOpts)); + return !(Lexer::isAsciiIdentifierContinueChar(left, LangOpts) && + Lexer::isAsciiIdentifierContinueChar(right, LangOpts)); } /// Returns true if it is ok to eliminate the trailing whitespace between diff --git a/clang/lib/Frontend/LayoutOverrideSource.cpp b/clang/lib/Frontend/LayoutOverrideSource.cpp --- a/clang/lib/Frontend/LayoutOverrideSource.cpp +++ b/clang/lib/Frontend/LayoutOverrideSource.cpp @@ -16,11 +16,11 @@ /// Parse a simple identifier. 
static std::string parseName(StringRef S) { - if (S.empty() || !isIdentifierHead(S[0])) + if (S.empty() || !isAsciiIdentifierStart(S[0])) return ""; unsigned Offset = 1; - while (Offset < S.size() && isIdentifierBody(S[Offset])) + while (Offset < S.size() && isAsciiIdentifierContinue(S[Offset])) ++Offset; return S.substr(0, Offset).str(); diff --git a/clang/lib/Frontend/Rewrite/FrontendActions.cpp b/clang/lib/Frontend/Rewrite/FrontendActions.cpp --- a/clang/lib/Frontend/Rewrite/FrontendActions.cpp +++ b/clang/lib/Frontend/Rewrite/FrontendActions.cpp @@ -231,7 +231,7 @@ assert(OS && "loaded module file after finishing rewrite action?"); (*OS) << "#pragma clang module build "; - if (isValidIdentifier(MF->ModuleName)) + if (isValidAsciiIdentifier(MF->ModuleName)) (*OS) << MF->ModuleName; else { (*OS) << '"'; diff --git a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp --- a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp +++ b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp @@ -131,17 +131,17 @@ --Current; if (*Current != 'R') return false; - if (First == Current || !isIdentifierBody(*--Current)) + if (First == Current || !isAsciiIdentifierContinue(*--Current)) return true; // Check for a prefix of "u", "U", or "L". if (*Current == 'u' || *Current == 'U' || *Current == 'L') - return First == Current || !isIdentifierBody(*--Current); + return First == Current || !isAsciiIdentifierContinue(*--Current); // Check for a prefix of "u8". if (*Current != '8' || First == Current || *Current-- != 'u') return false; - return First == Current || !isIdentifierBody(*--Current); + return First == Current || !isAsciiIdentifierContinue(*--Current); } static void skipRawString(const char *&First, const char *const End) { @@ -319,7 +319,7 @@ if (!isPreprocessingNumberBody(Prev)) return false; // The next character should be a valid identifier body character. 
- return (Cur + 1) < End && isIdentifierBody(*(Cur + 1)); + return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1)); } static void skipLine(const char *&First, const char *const End) { @@ -484,7 +484,7 @@ const char *Last = First; do ++Last; - while (Last != End && (isIdentifierBody(*Last) || *Last == '.')); + while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.')); append(First, Last); First = Last; } @@ -507,7 +507,7 @@ } // Don't handle macro expansions inside @import for now. - if (!isIdentifierBody(*First) && *First != '.') + if (!isAsciiIdentifierContinue(*First) && *First != '.') return true; printAdjacentModuleNameParts(First, End); @@ -524,9 +524,9 @@ LLVM_NODISCARD static const char *lexRawIdentifier(const char *First, const char *const End) { - assert(isIdentifierBody(*First) && "invalid identifer"); + assert(isAsciiIdentifierContinue(*First) && "invalid identifer"); const char *Last = First + 1; - while (Last != End && isIdentifierBody(*Last)) + while (Last != End && isAsciiIdentifierContinue(*Last)) ++Last; return Last; } @@ -540,7 +540,7 @@ skipNewline(First, End); if (First == End) return nullptr; - return isIdentifierBody(First[0]) ? First : nullptr; + return isAsciiIdentifierContinue(First[0]) ? First : nullptr; } Minimizer::IdInfo Minimizer::lexIdentifier(const char *First, @@ -569,7 +569,7 @@ do ++Last; while (Last != End && - (isIdentifierBody(*Last) || *Last == '.' || *Last == ',')); + (isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ',')); append(First, Last); First = Last; } @@ -588,7 +588,7 @@ } // This is intentionally fairly liberal. - if (!(isIdentifierBody(*First) || *First == '.' || *First == ',')) + if (!(isAsciiIdentifierContinue(*First) || *First == '.' 
|| *First == ',')) return true; printAdjacentMacroArgs(First, End); @@ -602,7 +602,7 @@ bool Minimizer::isNextIdentifier(StringRef Id, const char *&First, const char *const End) { skipWhitespace(First, End); - if (First == End || !isIdentifierHead(*First)) + if (First == End || !isAsciiIdentifierStart(*First)) return false; IdInfo FoundId = lexIdentifier(First, End); @@ -639,7 +639,7 @@ if (Id.Name == "export") { Export = true; skipWhitespace(First, End); - if (!isIdentifierBody(*First)) { + if (!isAsciiIdentifierContinue(*First)) { skipLine(First, End); return false; } @@ -663,7 +663,7 @@ case '"': break; default: - if (!isIdentifierBody(*First)) { + if (!isAsciiIdentifierContinue(*First)) { skipLine(First, End); return false; } @@ -690,7 +690,7 @@ append("#define "); skipWhitespace(First, End); - if (!isIdentifierHead(*First)) + if (!isAsciiIdentifierStart(*First)) return reportError(First, diag::err_pp_macro_not_identifier); IdInfo Id = lexIdentifier(First, End); @@ -722,7 +722,7 @@ bool Minimizer::lexPragma(const char *&First, const char *const End) { // #pragma. 
skipWhitespace(First, End); - if (First == End || !isIdentifierHead(*First)) + if (First == End || !isAsciiIdentifierStart(*First)) return false; IdInfo FoundId = lexIdentifier(First, End); @@ -827,7 +827,7 @@ if (First == End) return reportError(First, diag::err_pp_expected_eol); - if (!isIdentifierHead(*First)) { + if (!isAsciiIdentifierStart(*First)) { skipLine(First, End); return false; } diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1062,8 +1062,8 @@ return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); } -bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { - return isIdentifierBody(c, LangOpts.DollarIdents); +bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { + return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); } bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { @@ -1712,103 +1712,129 @@ return true; } -bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { - // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] - unsigned Size; - unsigned char C = *CurPtr++; - while (isIdentifierBody(C)) - C = *CurPtr++; - - --CurPtr; // Back up over the skipped character. - - // Fast path, no $,\,? in identifier found. '\' might be an escaped newline - // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. - // - // TODO: Could merge these checks into an InfoTable flag to make the - // comparison cheaper - if (isASCII(C) && C != '\\' && C != '?' && - (C != '$' || !LangOpts.DollarIdents)) { -FinishIdentifier: - const char *IdStart = BufferPtr; - FormTokenWithChars(Result, CurPtr, tok::raw_identifier); - Result.setRawIdentifierData(IdStart); - - // If we are in raw mode, return this identifier raw. There is no need to - // look up identifier information or attempt to macro expand it. 
- if (LexingRawMode) - return true; - - // Fill in Result.IdentifierInfo and update the token kind, - // looking up the identifier in the identifier table. - IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); - // Note that we have to call PP->LookUpIdentifierInfo() even for code - // completion, it writes IdentifierInfo into Result, and callers rely on it. - - // If the completion point is at the end of an identifier, we want to treat - // the identifier as incomplete even if it resolves to a macro or a keyword. - // This allows e.g. 'class^' to complete to 'classifier'. - if (isCodeCompletionPoint(CurPtr)) { - // Return the code-completion token. - Result.setKind(tok::code_completion); - // Skip the code-completion char and all immediate identifier characters. - // This ensures we get consistent behavior when completing at any point in - // an identifier (i.e. at the start, in the middle, at the end). Note that - // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code - // simpler. - assert(*CurPtr == 0 && "Completion character must be 0"); - ++CurPtr; - // Note that code completion token is not added as a separate character - // when the completion point is at the end of the buffer. Therefore, we need - // to check if the buffer has ended. - if (CurPtr < BufferEnd) { - while (isIdentifierBody(*CurPtr)) - ++CurPtr; - } - BufferPtr = CurPtr; - return true; +bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, + const char *CurPtr) { + if (isAllowedInitiallyIDChar(C, LangOpts)) { + if (!isLexingRawMode() && !ParsingPreprocessorDirective && + !PP->isPreprocessedOutput()) { + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, + makeCharRange(*this, BufferPtr, CurPtr), + /*IsFirst=*/true); + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, + makeCharRange(*this, BufferPtr, CurPtr)); } - // Finally, now that we know we have an identifier, pass this off to the - // preprocessor, which may macro expand it or something. 
- if (II->isHandleIdentifierCase()) - return PP->HandleIdentifier(Result); + MIOpt.ReadToken(); + return LexIdentifierContinue(Result, CurPtr); + } - return true; + if (!isLexingRawMode() && !ParsingPreprocessorDirective && + !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && + !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { + // Non-ASCII characters tend to creep into source code unintentionally. + // Instead of letting the parser complain about the unknown token, + // just drop the character. + // Note that we can /only/ do this when the non-ASCII character is actually + // spelled as Unicode, not written as a UCN. The standard requires that + // we not throw away any possible preprocessor tokens, but there's a + // loophole in the mapping of Unicode characters to basic character set + // characters that allows us to map these particular characters to, say, + // whitespace. + diagnoseInvalidUnicodeCodepointInIdentifier( + PP->getDiagnostics(), LangOpts, C, + makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); + BufferPtr = CurPtr; + return false; } - // Otherwise, $,\,? in identifier found. Enter slower path. + // Otherwise, we have an explicit UCN or a character that's unlikely to show + // up by accident. + MIOpt.ReadToken(); + FormTokenWithChars(Result, CurPtr, tok::unknown); + return true; +} - C = getCharAndSize(CurPtr, Size); +bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { + // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] + unsigned Size; while (true) { + unsigned char C = *CurPtr; + // Fast path + if (isAsciiIdentifierContinue(C)) { + ++CurPtr; + continue; + } + // Slow path: handle trigraph, unicode codepoints, UCNs + C = getCharAndSize(CurPtr, Size); + if (isAsciiIdentifierContinue(C)) { + CurPtr = ConsumeChar(CurPtr, Size, Result); + continue; + } if (C == '$') { // If we hit a $ and they are not supported in identifiers, we are done. 
- if (!LangOpts.DollarIdents) goto FinishIdentifier; - + if (!LangOpts.DollarIdents) + break; // Otherwise, emit a diagnostic and continue. if (!isLexingRawMode()) Diag(CurPtr, diag::ext_dollar_in_identifier); CurPtr = ConsumeChar(CurPtr, Size, Result); - C = getCharAndSize(CurPtr, Size); continue; - } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { - C = getCharAndSize(CurPtr, Size); + } + if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { continue; - } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { - C = getCharAndSize(CurPtr, Size); + } + if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { continue; - } else if (!isIdentifierBody(C)) { - goto FinishIdentifier; } + // Neither an expected unicode codepoint nor a UCN + break; + } - // Otherwise, this character is good, consume it. - CurPtr = ConsumeChar(CurPtr, Size, Result); + const char *IdStart = BufferPtr; + FormTokenWithChars(Result, CurPtr, tok::raw_identifier); + Result.setRawIdentifierData(IdStart); - C = getCharAndSize(CurPtr, Size); - while (isIdentifierBody(C)) { - CurPtr = ConsumeChar(CurPtr, Size, Result); - C = getCharAndSize(CurPtr, Size); + // If we are in raw mode, return this identifier raw. There is no need to + // look up identifier information or attempt to macro expand it. + if (LexingRawMode) + return true; + + // Fill in Result.IdentifierInfo and update the token kind, + // looking up the identifier in the identifier table. + IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); + // Note that we have to call PP->LookUpIdentifierInfo() even for code + // completion, it writes IdentifierInfo into Result, and callers rely on it. + + // If the completion point is at the end of an identifier, we want to treat + // the identifier as incomplete even if it resolves to a macro or a keyword. + // This allows e.g. 'class^' to complete to 'classifier'. + if (isCodeCompletionPoint(CurPtr)) { + // Return the code-completion token. 
+ Result.setKind(tok::code_completion); + // Skip the code-completion char and all immediate identifier characters. + // This ensures we get consistent behavior when completing at any point in + // an identifier (i.e. at the start, in the middle, at the end). Note that + // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code + // simpler. + assert(*CurPtr == 0 && "Completion character must be 0"); + ++CurPtr; + // Note that code completion token is not added as a separate character + // when the completion point is at the end of the buffer. Therefore, we need + // to check if the buffer has ended. + if (CurPtr < BufferEnd) { + while (isAsciiIdentifierContinue(*CurPtr)) + ++CurPtr; } + BufferPtr = CurPtr; + return true; } + + // Finally, now that we know we have an identifier, pass this off to the + // preprocessor, which may macro expand it or something. + if (II->isHandleIdentifierCase()) + return PP->HandleIdentifier(Result); + + return true; } /// isHexaLiteral - Return true if Start points to a hex constant. @@ -1864,7 +1890,7 @@ if (C == '\'' && (getLangOpts().CPlusPlus14 || getLangOpts().C2x)) { unsigned NextSize; char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); - if (isIdentifierBody(Next)) { + if (isAsciiIdentifierContinue(Next)) { if (!isLexingRawMode()) Diag(CurPtr, getLangOpts().CPlusPlus ? diag::warn_cxx11_compat_digit_separator @@ -1899,7 +1925,7 @@ char C = getCharAndSize(CurPtr, Size); bool Consumed = false; - if (!isIdentifierHead(C)) { + if (!isAsciiIdentifierStart(C)) { if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) Consumed = true; else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) @@ -1938,7 +1964,7 @@ unsigned NextSize; char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, getLangOpts()); - if (!isIdentifierBody(Next)) { + if (!isAsciiIdentifierContinue(Next)) { // End of suffix. Check whether this is on the allowed list. 
const StringRef CompleteSuffix(Buffer, Chars); IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(), @@ -1970,10 +1996,12 @@ Result.setFlag(Token::HasUDSuffix); while (true) { C = getCharAndSize(CurPtr, Size); - if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } - else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} - else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} - else break; + if (isAsciiIdentifierContinue(C)) { + CurPtr = ConsumeChar(CurPtr, Size, Result); + } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { + } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { + } else + break; } return CurPtr; @@ -3205,47 +3233,6 @@ return false; } -bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { - if (isAllowedInitiallyIDChar(C, LangOpts)) { - if (!isLexingRawMode() && !ParsingPreprocessorDirective && - !PP->isPreprocessedOutput()) { - maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, - makeCharRange(*this, BufferPtr, CurPtr), - /*IsFirst=*/true); - maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, - makeCharRange(*this, BufferPtr, CurPtr)); - } - - MIOpt.ReadToken(); - return LexIdentifier(Result, CurPtr); - } - - if (!isLexingRawMode() && !ParsingPreprocessorDirective && - !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && - !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { - // Non-ASCII characters tend to creep into source code unintentionally. - // Instead of letting the parser complain about the unknown token, - // just drop the character. - // Note that we can /only/ do this when the non-ASCII character is actually - // spelled as Unicode, not written as a UCN. 
The standard requires that - // we not throw away any possible preprocessor tokens, but there's a - // loophole in the mapping of Unicode characters to basic character set - // characters that allows us to map these particular characters to, say, - // whitespace. - diagnoseInvalidUnicodeCodepointInIdentifier( - PP->getDiagnostics(), LangOpts, C, - makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); - BufferPtr = CurPtr; - return false; - } - - // Otherwise, we have an explicit UCN or a character that's unlikely to show - // up by accident. - MIOpt.ReadToken(); - FormTokenWithChars(Result, CurPtr, tok::unknown); - return true; -} - void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { IsAtStartOfLine = Result.isAtStartOfLine(); HasLeadingSpace = Result.hasLeadingSpace(); @@ -3489,7 +3476,7 @@ } // treat u like the start of an identifier. - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -3518,7 +3505,7 @@ } // treat U like the start of an identifier. - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case 'R': // Identifier or C++0x raw string literal // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -3534,7 +3521,7 @@ } // treat R like the start of an identifier. - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -3573,7 +3560,7 @@ case '_': // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); case '$': // $ in identifiers. 
if (LangOpts.DollarIdents) { @@ -3581,7 +3568,7 @@ Diag(CurPtr-1, diag::ext_dollar_in_identifier); // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); - return LexIdentifier(Result, CurPtr); + return LexIdentifierContinue(Result, CurPtr); } Kind = tok::unknown; @@ -3996,7 +3983,7 @@ goto LexNextToken; } - return LexUnicode(Result, CodePoint, CurPtr); + return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); } } @@ -4028,7 +4015,7 @@ // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; } - return LexUnicode(Result, CodePoint, CurPtr); + return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); } if (isLexingRawMode() || ParsingPreprocessorDirective || diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -338,7 +338,7 @@ if (Name.empty()) return Name; - if (!isValidIdentifier(Name)) { + if (!isValidAsciiIdentifier(Name)) { // If we don't already have something with the form of an identifier, // create a buffer with the sanitized name. 
Buffer.clear(); @@ -346,7 +346,7 @@ Buffer.push_back('_'); Buffer.reserve(Buffer.size() + Name.size()); for (unsigned I = 0, N = Name.size(); I != N; ++I) { - if (isIdentifierBody(Name[I])) + if (isAsciiIdentifierContinue(Name[I])) Buffer.push_back(Name[I]); else Buffer.push_back('_'); diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp --- a/clang/lib/Sema/SemaAvailability.cpp +++ b/clang/lib/Sema/SemaAvailability.cpp @@ -268,7 +268,7 @@ for (StringRef S : SlotNames) { if (S.empty()) continue; - if (!isValidIdentifier(S, AllowDollar)) + if (!isValidAsciiIdentifier(S, AllowDollar)) return None; } return NumParams; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6074,7 +6074,7 @@ if (BaseName.empty()) { BaseName = ContextName; ContextName = StringRef(); - } else if (ContextName.empty() || !isValidIdentifier(ContextName)) { + } else if (ContextName.empty() || !isValidAsciiIdentifier(ContextName)) { S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*context*/ 1; return false; @@ -6082,7 +6082,7 @@ IsMember = true; } - if (!isValidIdentifier(BaseName) || BaseName == "_") { + if (!isValidAsciiIdentifier(BaseName) || BaseName == "_") { S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*basename*/ 0; return false; @@ -6132,7 +6132,7 @@ do { std::tie(CurrentParam, Parameters) = Parameters.split(':'); - if (!isValidIdentifier(CurrentParam)) { + if (!isValidAsciiIdentifier(CurrentParam)) { S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*parameter*/2; return false; @@ -6301,13 +6301,13 @@ if (BaseName.empty()) { BaseName = ContextName; ContextName = StringRef(); - } else if (!isValidIdentifier(ContextName)) { + } else if (!isValidAsciiIdentifier(ContextName)) { Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*context*/1; return false; } - if 
(!isValidIdentifier(BaseName)) { + if (!isValidAsciiIdentifier(BaseName)) { Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL << /*basename*/0; return false; diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -3772,7 +3772,7 @@ SourceManager &SM = S.getSourceManager(); char PrevChar = *SM.getCharacterData(range.getBegin().getLocWithOffset(-1)); - if (Lexer::isIdentifierBodyChar(PrevChar, S.getLangOpts())) + if (Lexer::isAsciiIdentifierContinueChar(PrevChar, S.getLangOpts())) BridgeCall += ' '; BridgeCall += CFBridgeName; @@ -3790,7 +3790,7 @@ SourceManager &SM = S.getSourceManager(); char PrevChar = *SM.getCharacterData(range.getBegin().getLocWithOffset(-1)); - if (Lexer::isIdentifierBodyChar(PrevChar, S.getLangOpts())) + if (Lexer::isAsciiIdentifierContinueChar(PrevChar, S.getLangOpts())) BridgeCall += ' '; BridgeCall += CFBridgeName; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -4254,8 +4254,8 @@ InsertionText = InsertionText.drop_back().drop_front(); else InsertionText = InsertionText.drop_front(); - } else if (!isIdentifierBody(NextChar[0], /*allow dollar*/true) && - !isIdentifierBody(NextChar[-1], /*allow dollar*/true)) { + } else if (!isAsciiIdentifierContinue(NextChar[0], /*allow dollar*/ true) && + !isAsciiIdentifierContinue(NextChar[-1], /*allow dollar*/ true)) { InsertionText = InsertionText.drop_back().drop_front(); } diff --git a/clang/lib/Tooling/Transformer/Parsing.cpp b/clang/lib/Tooling/Transformer/Parsing.cpp --- a/clang/lib/Tooling/Transformer/Parsing.cpp +++ b/clang/lib/Tooling/Transformer/Parsing.cpp @@ -165,7 +165,7 @@ static ExpectedProgress parseId(ParseState State) { State.Input = consumeWhitespace(State.Input); auto Id = State.Input.take_while( - [](char c) { return isASCII(c) && isIdentifierBody(c); }); + [](char c) { return 
isASCII(c) && isAsciiIdentifierContinue(c); }); if (Id.empty()) return makeParseError(State, "failed to parse name"); return makeParseProgress(advance(State, Id.size()), Id.str()); diff --git a/clang/unittests/Basic/CharInfoTest.cpp b/clang/unittests/Basic/CharInfoTest.cpp --- a/clang/unittests/Basic/CharInfoTest.cpp +++ b/clang/unittests/Basic/CharInfoTest.cpp @@ -50,44 +50,44 @@ EXPECT_FALSE(isASCII('\xff')); } -TEST(CharInfoTest, isIdentifierHead) { - EXPECT_TRUE(isIdentifierHead('a')); - EXPECT_TRUE(isIdentifierHead('A')); - EXPECT_TRUE(isIdentifierHead('z')); - EXPECT_TRUE(isIdentifierHead('Z')); - EXPECT_TRUE(isIdentifierHead('_')); - - EXPECT_FALSE(isIdentifierHead('0')); - EXPECT_FALSE(isIdentifierHead('.')); - EXPECT_FALSE(isIdentifierHead('`')); - EXPECT_FALSE(isIdentifierHead('\0')); - - EXPECT_FALSE(isIdentifierHead('$')); - EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true)); - - EXPECT_FALSE(isIdentifierHead('\x80')); - EXPECT_FALSE(isIdentifierHead('\xc2')); - EXPECT_FALSE(isIdentifierHead('\xff')); +TEST(CharInfoTest, isAsciiIdentifierStart) { + EXPECT_TRUE(isAsciiIdentifierStart('a')); + EXPECT_TRUE(isAsciiIdentifierStart('A')); + EXPECT_TRUE(isAsciiIdentifierStart('z')); + EXPECT_TRUE(isAsciiIdentifierStart('Z')); + EXPECT_TRUE(isAsciiIdentifierStart('_')); + + EXPECT_FALSE(isAsciiIdentifierStart('0')); + EXPECT_FALSE(isAsciiIdentifierStart('.')); + EXPECT_FALSE(isAsciiIdentifierStart('`')); + EXPECT_FALSE(isAsciiIdentifierStart('\0')); + + EXPECT_FALSE(isAsciiIdentifierStart('$')); + EXPECT_TRUE(isAsciiIdentifierStart('$', /*AllowDollar=*/true)); + + EXPECT_FALSE(isAsciiIdentifierStart('\x80')); + EXPECT_FALSE(isAsciiIdentifierStart('\xc2')); + EXPECT_FALSE(isAsciiIdentifierStart('\xff')); } -TEST(CharInfoTest, isIdentifierBody) { - EXPECT_TRUE(isIdentifierBody('a')); - EXPECT_TRUE(isIdentifierBody('A')); - EXPECT_TRUE(isIdentifierBody('z')); - EXPECT_TRUE(isIdentifierBody('Z')); - EXPECT_TRUE(isIdentifierBody('_')); +TEST(CharInfoTest, 
isAsciiIdentifierContinue) { + EXPECT_TRUE(isAsciiIdentifierContinue('a')); + EXPECT_TRUE(isAsciiIdentifierContinue('A')); + EXPECT_TRUE(isAsciiIdentifierContinue('z')); + EXPECT_TRUE(isAsciiIdentifierContinue('Z')); + EXPECT_TRUE(isAsciiIdentifierContinue('_')); - EXPECT_TRUE(isIdentifierBody('0')); - EXPECT_FALSE(isIdentifierBody('.')); - EXPECT_FALSE(isIdentifierBody('`')); - EXPECT_FALSE(isIdentifierBody('\0')); + EXPECT_TRUE(isAsciiIdentifierContinue('0')); + EXPECT_FALSE(isAsciiIdentifierContinue('.')); + EXPECT_FALSE(isAsciiIdentifierContinue('`')); + EXPECT_FALSE(isAsciiIdentifierContinue('\0')); - EXPECT_FALSE(isIdentifierBody('$')); - EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true)); + EXPECT_FALSE(isAsciiIdentifierContinue('$')); + EXPECT_TRUE(isAsciiIdentifierContinue('$', /*AllowDollar=*/true)); - EXPECT_FALSE(isIdentifierBody('\x80')); - EXPECT_FALSE(isIdentifierBody('\xc2')); - EXPECT_FALSE(isIdentifierBody('\xff')); + EXPECT_FALSE(isAsciiIdentifierContinue('\x80')); + EXPECT_FALSE(isAsciiIdentifierContinue('\xc2')); + EXPECT_FALSE(isAsciiIdentifierContinue('\xff')); } TEST(CharInfoTest, isHorizontalWhitespace) { @@ -413,91 +413,91 @@ EXPECT_EQ('\0', toUppercase('\0')); } -TEST(CharInfoTest, isValidIdentifier) { - EXPECT_FALSE(isValidIdentifier("")); +TEST(CharInfoTest, isValidAsciiIdentifier) { + EXPECT_FALSE(isValidAsciiIdentifier("")); // 1 character - EXPECT_FALSE(isValidIdentifier(".")); - EXPECT_FALSE(isValidIdentifier("\n")); - EXPECT_FALSE(isValidIdentifier(" ")); - EXPECT_FALSE(isValidIdentifier("\x80")); - EXPECT_FALSE(isValidIdentifier("\xc2")); - EXPECT_FALSE(isValidIdentifier("\xff")); - EXPECT_FALSE(isValidIdentifier("$")); - EXPECT_FALSE(isValidIdentifier("1")); - - EXPECT_TRUE(isValidIdentifier("_")); - EXPECT_TRUE(isValidIdentifier("a")); - EXPECT_TRUE(isValidIdentifier("z")); - EXPECT_TRUE(isValidIdentifier("A")); - EXPECT_TRUE(isValidIdentifier("Z")); - EXPECT_TRUE(isValidIdentifier("$", /*AllowDollar=*/true)); + 
EXPECT_FALSE(isValidAsciiIdentifier(".")); + EXPECT_FALSE(isValidAsciiIdentifier("\n")); + EXPECT_FALSE(isValidAsciiIdentifier(" ")); + EXPECT_FALSE(isValidAsciiIdentifier("\x80")); + EXPECT_FALSE(isValidAsciiIdentifier("\xc2")); + EXPECT_FALSE(isValidAsciiIdentifier("\xff")); + EXPECT_FALSE(isValidAsciiIdentifier("$")); + EXPECT_FALSE(isValidAsciiIdentifier("1")); + + EXPECT_TRUE(isValidAsciiIdentifier("_")); + EXPECT_TRUE(isValidAsciiIdentifier("a")); + EXPECT_TRUE(isValidAsciiIdentifier("z")); + EXPECT_TRUE(isValidAsciiIdentifier("A")); + EXPECT_TRUE(isValidAsciiIdentifier("Z")); + EXPECT_TRUE(isValidAsciiIdentifier("$", /*AllowDollar=*/true)); // 2 characters, '_' suffix - EXPECT_FALSE(isValidIdentifier("._")); - EXPECT_FALSE(isValidIdentifier("\n_")); - EXPECT_FALSE(isValidIdentifier(" _")); - EXPECT_FALSE(isValidIdentifier("\x80_")); - EXPECT_FALSE(isValidIdentifier("\xc2_")); - EXPECT_FALSE(isValidIdentifier("\xff_")); - EXPECT_FALSE(isValidIdentifier("$_")); - EXPECT_FALSE(isValidIdentifier("1_")); - - EXPECT_TRUE(isValidIdentifier("__")); - EXPECT_TRUE(isValidIdentifier("a_")); - EXPECT_TRUE(isValidIdentifier("z_")); - EXPECT_TRUE(isValidIdentifier("A_")); - EXPECT_TRUE(isValidIdentifier("Z_")); - EXPECT_TRUE(isValidIdentifier("$_", /*AllowDollar=*/true)); + EXPECT_FALSE(isValidAsciiIdentifier("._")); + EXPECT_FALSE(isValidAsciiIdentifier("\n_")); + EXPECT_FALSE(isValidAsciiIdentifier(" _")); + EXPECT_FALSE(isValidAsciiIdentifier("\x80_")); + EXPECT_FALSE(isValidAsciiIdentifier("\xc2_")); + EXPECT_FALSE(isValidAsciiIdentifier("\xff_")); + EXPECT_FALSE(isValidAsciiIdentifier("$_")); + EXPECT_FALSE(isValidAsciiIdentifier("1_")); + + EXPECT_TRUE(isValidAsciiIdentifier("__")); + EXPECT_TRUE(isValidAsciiIdentifier("a_")); + EXPECT_TRUE(isValidAsciiIdentifier("z_")); + EXPECT_TRUE(isValidAsciiIdentifier("A_")); + EXPECT_TRUE(isValidAsciiIdentifier("Z_")); + EXPECT_TRUE(isValidAsciiIdentifier("$_", /*AllowDollar=*/true)); // 2 characters, '_' prefix - 
EXPECT_FALSE(isValidIdentifier("_.")); - EXPECT_FALSE(isValidIdentifier("_\n")); - EXPECT_FALSE(isValidIdentifier("_ ")); - EXPECT_FALSE(isValidIdentifier("_\x80")); - EXPECT_FALSE(isValidIdentifier("_\xc2")); - EXPECT_FALSE(isValidIdentifier("_\xff")); - EXPECT_FALSE(isValidIdentifier("_$")); - EXPECT_TRUE(isValidIdentifier("_1")); - - EXPECT_TRUE(isValidIdentifier("__")); - EXPECT_TRUE(isValidIdentifier("_a")); - EXPECT_TRUE(isValidIdentifier("_z")); - EXPECT_TRUE(isValidIdentifier("_A")); - EXPECT_TRUE(isValidIdentifier("_Z")); - EXPECT_TRUE(isValidIdentifier("_$", /*AllowDollar=*/true)); + EXPECT_FALSE(isValidAsciiIdentifier("_.")); + EXPECT_FALSE(isValidAsciiIdentifier("_\n")); + EXPECT_FALSE(isValidAsciiIdentifier("_ ")); + EXPECT_FALSE(isValidAsciiIdentifier("_\x80")); + EXPECT_FALSE(isValidAsciiIdentifier("_\xc2")); + EXPECT_FALSE(isValidAsciiIdentifier("_\xff")); + EXPECT_FALSE(isValidAsciiIdentifier("_$")); + EXPECT_TRUE(isValidAsciiIdentifier("_1")); + + EXPECT_TRUE(isValidAsciiIdentifier("__")); + EXPECT_TRUE(isValidAsciiIdentifier("_a")); + EXPECT_TRUE(isValidAsciiIdentifier("_z")); + EXPECT_TRUE(isValidAsciiIdentifier("_A")); + EXPECT_TRUE(isValidAsciiIdentifier("_Z")); + EXPECT_TRUE(isValidAsciiIdentifier("_$", /*AllowDollar=*/true)); // 3 characters, '__' prefix - EXPECT_FALSE(isValidIdentifier("__.")); - EXPECT_FALSE(isValidIdentifier("__\n")); - EXPECT_FALSE(isValidIdentifier("__ ")); - EXPECT_FALSE(isValidIdentifier("__\x80")); - EXPECT_FALSE(isValidIdentifier("__\xc2")); - EXPECT_FALSE(isValidIdentifier("__\xff")); - EXPECT_FALSE(isValidIdentifier("__$")); - EXPECT_TRUE(isValidIdentifier("__1")); - - EXPECT_TRUE(isValidIdentifier("___")); - EXPECT_TRUE(isValidIdentifier("__a")); - EXPECT_TRUE(isValidIdentifier("__z")); - EXPECT_TRUE(isValidIdentifier("__A")); - EXPECT_TRUE(isValidIdentifier("__Z")); - EXPECT_TRUE(isValidIdentifier("__$", /*AllowDollar=*/true)); + EXPECT_FALSE(isValidAsciiIdentifier("__.")); + 
EXPECT_FALSE(isValidAsciiIdentifier("__\n")); + EXPECT_FALSE(isValidAsciiIdentifier("__ ")); + EXPECT_FALSE(isValidAsciiIdentifier("__\x80")); + EXPECT_FALSE(isValidAsciiIdentifier("__\xc2")); + EXPECT_FALSE(isValidAsciiIdentifier("__\xff")); + EXPECT_FALSE(isValidAsciiIdentifier("__$")); + EXPECT_TRUE(isValidAsciiIdentifier("__1")); + + EXPECT_TRUE(isValidAsciiIdentifier("___")); + EXPECT_TRUE(isValidAsciiIdentifier("__a")); + EXPECT_TRUE(isValidAsciiIdentifier("__z")); + EXPECT_TRUE(isValidAsciiIdentifier("__A")); + EXPECT_TRUE(isValidAsciiIdentifier("__Z")); + EXPECT_TRUE(isValidAsciiIdentifier("__$", /*AllowDollar=*/true)); // 3 characters, '_' prefix and suffix - EXPECT_FALSE(isValidIdentifier("_._")); - EXPECT_FALSE(isValidIdentifier("_\n_")); - EXPECT_FALSE(isValidIdentifier("_ _")); - EXPECT_FALSE(isValidIdentifier("_\x80_")); - EXPECT_FALSE(isValidIdentifier("_\xc2_")); - EXPECT_FALSE(isValidIdentifier("_\xff_")); - EXPECT_FALSE(isValidIdentifier("_$_")); - EXPECT_TRUE(isValidIdentifier("_1_")); - - EXPECT_TRUE(isValidIdentifier("___")); - EXPECT_TRUE(isValidIdentifier("_a_")); - EXPECT_TRUE(isValidIdentifier("_z_")); - EXPECT_TRUE(isValidIdentifier("_A_")); - EXPECT_TRUE(isValidIdentifier("_Z_")); - EXPECT_TRUE(isValidIdentifier("_$_", /*AllowDollar=*/true)); + EXPECT_FALSE(isValidAsciiIdentifier("_._")); + EXPECT_FALSE(isValidAsciiIdentifier("_\n_")); + EXPECT_FALSE(isValidAsciiIdentifier("_ _")); + EXPECT_FALSE(isValidAsciiIdentifier("_\x80_")); + EXPECT_FALSE(isValidAsciiIdentifier("_\xc2_")); + EXPECT_FALSE(isValidAsciiIdentifier("_\xff_")); + EXPECT_FALSE(isValidAsciiIdentifier("_$_")); + EXPECT_TRUE(isValidAsciiIdentifier("_1_")); + + EXPECT_TRUE(isValidAsciiIdentifier("___")); + EXPECT_TRUE(isValidAsciiIdentifier("_a_")); + EXPECT_TRUE(isValidAsciiIdentifier("_z_")); + EXPECT_TRUE(isValidAsciiIdentifier("_A_")); + EXPECT_TRUE(isValidAsciiIdentifier("_Z_")); + EXPECT_TRUE(isValidAsciiIdentifier("_$_", /*AllowDollar=*/true)); } diff --git 
a/llvm/cmake/modules/CheckCompilerVersion.cmake b/llvm/cmake/modules/CheckCompilerVersion.cmake --- a/llvm/cmake/modules/CheckCompilerVersion.cmake +++ b/llvm/cmake/modules/CheckCompilerVersion.cmake @@ -94,7 +94,7 @@ " LLVM_LIBSTDCXX_MIN) if(NOT LLVM_LIBSTDCXX_MIN) - message(FATAL_ERROR "libstdc++ version must be at least ${GCC_MIN}.") + # message(FATAL_ERROR "libstdc++ version must be at least ${GCC_MIN}.") endif() # Test for libstdc++ version of at least 5.1 by checking for std::iostream_category(). # Note: We should check _GLIBCXX_RELEASE when possible (i.e., for GCC 7.1 and up).