Index: include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- include/clang/Basic/DiagnosticLexKinds.td
+++ include/clang/Basic/DiagnosticLexKinds.td
@@ -102,6 +102,7 @@
   "source file is not valid UTF-8">;
 def err_non_ascii : Error<
   "non-ASCII characters are not allowed outside of literals and identifiers">;
+def err_unsupported_ansi_escape : Error<"ANSI escape sequence in input">;
 def ext_unicode_whitespace : ExtWarn<
   "treating Unicode character as whitespace">,
   InGroup<DiagGroup<"unicode-whitespace">>;
Index: include/clang/Lex/Lexer.h
===================================================================
--- include/clang/Lex/Lexer.h
+++ include/clang/Lex/Lexer.h
@@ -89,6 +89,48 @@
   // CurrentConflictMarkerState - The kind of conflict marker we are handling.
   ConflictMarkerKind CurrentConflictMarkerState;
 
+  /// ANSI graphics rendition state.
+  ///
+  /// Records the attributes selected by the "select graphic rendition"
+  /// parameters passed to add().
+  class GraphicsRendition {
+    struct Flags {
+      // FG and BG hold the last digit of the 3x / 4x color parameter. They are
+      // four bits wide so that 9 (the default color) fits without truncation.
+      unsigned Bold : 2, Script : 2, Underline : 2, Blink : 2, Negative : 1,
+               Conceal : 1, Crossout : 1, Font : 4, FG : 4, BG : 4, Frame : 2,
+               Overline : 1;
+
+      /// The default rendition: every attribute off, default colors.
+      Flags()
+          : Bold(0), Script(0), Underline(0), Blink(0), Negative(0),
+            Conceal(0), Crossout(0), Font(0), FG(9), BG(9), Frame(0),
+            Overline(0) {}
+
+      /// Pack every field into one integer so two states can be compared.
+      unsigned pack() const {
+        return Bold | Script << 2 | Underline << 4 | Blink << 6 |
+               Negative << 8 | Conceal << 9 | Crossout << 10 | Font << 11 |
+               FG << 15 | BG << 19 | Frame << 23 | Overline << 25;
+      }
+    };
+    Flags F;
+
+  public:
+    GraphicsRendition();
+    /// \return \c true if any attribute differs from the default rendition.
+    bool isSet() const { return F.pack() != Flags().pack(); }
+    /// Apply a single parameter value to this state.
+    void add(unsigned Param);
+    /// Append an escape sequence selecting this state to \p Buffer.
+    template <unsigned N> void render(llvm::SmallString<N> &Buffer);
+
+    friend bool operator!=(const GraphicsRendition &A,
+                           const GraphicsRendition &B) {
+      return A.F.pack() != B.F.pack();
+    }
+  } Rendition;
+
   Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
   void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
   friend class Preprocessor;
@@ -577,6 +619,7 @@
 
   // Helper functions to lex the remainder of a token of the specific type.
bool LexIdentifier (Token &Result, const char *CurPtr); + bool LexColoredIdentifier (Token &Result, const char *CurPtr); bool LexNumericConstant (Token &Result, const char *CurPtr); bool LexStringLiteral (Token &Result, const char *CurPtr, tok::TokenKind Kind); @@ -637,6 +662,13 @@ /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier /// character was lexed, \c false otherwise. bool tryConsumeIdentifierUTF8Char(const char *&CurPtr); + + /// \brief Try to consume an ANSI escape character. + /// \param CurPtr Points to the start of the control sequence introducer code + /// (typically, ESC '['). On success, updated to point past the end of + /// it. + /// \return \c true if an ANSI escape code was lexed, \c false otherwise. + bool LexANSIEscapeCode(const char *&CurPtr); }; } // end namespace clang Index: lib/Lex/Lexer.cpp =================================================================== --- lib/Lex/Lexer.cpp +++ lib/Lex/Lexer.cpp @@ -1084,6 +1084,69 @@ } //===----------------------------------------------------------------------===// +// ANSI escape codes. 
+//===----------------------------------------------------------------------===// +Lexer::GraphicsRendition::GraphicsRendition() : F() {} + +void Lexer::GraphicsRendition::add(unsigned Param) { + switch (Param) { + case 0: F = Flags(); break; + case 1: F.Bold = 1; break; + case 2: F.Bold = 2; break; + case 3: F.Script = 1; break; + case 4: F.Underline = 1; break; + case 5: F.Blink = 1; break; + case 6: F.Blink = 2; break; + case 7: F.Negative = 1; break; + case 8: F.Conceal = 1; break; + case 9: F.Crossout = 1; break; + case 10: case 11: case 12: case 13: case 14: + case 15: case 16: case 17: case 18: case 19: + F.Font = Param - 10; break; + case 20: F.Script = 2; break; + case 21: F.Underline = 2; break; + case 22: F.Bold = 0; break; + case 23: F.Script = 0; break; + case 24: F.Underline = 0; break; + case 25: F.Blink = 0; break; + case 27: F.Negative = 0; break; + case 28: F.Conceal = 0; break; + case 29: F.Crossout = 0; break; + case 30: case 31: case 32: case 33: case 34: + case 35: case 36: case 37: case 38: case 39: + F.FG = Param - 30; break; + case 40: case 41: case 42: case 43: case 44: + case 45: case 46: case 47: case 48: case 49: + F.BG = Param - 40; break; + case 51: F.Frame = 1; break; + case 52: F.Frame = 2; break; + case 53: F.Overline = 1; break; + case 54: F.Frame = 0; break; + case 55: F.Overline = 0; break; + // FIXME: parameter values 60-65. + } +} + +template +void Lexer::GraphicsRendition::render(llvm::SmallString &Buffer) { + if (!isSet()) return; + Buffer += "\x1b[0"; + if (F.Bold) Buffer += (F.Bold == 1 ? ";1" : ";2"); + if (F.Script) Buffer += (F.Script == 1 ? ";3" : ";20"); + if (F.Underline) Buffer += (F.Underline == 1 ? ";4" : ";21"); + if (F.Blink) Buffer += (F.Blink == 1 ? 
";5" : ";6"); + if (F.Negative) Buffer += ";7"; + if (F.Conceal) Buffer += ";8"; + if (F.Crossout) Buffer += ";9"; + if (F.Font) { Buffer += ";1"; Buffer += char('0' + F.Font); } + if (F.FG != 9) { Buffer += ";3"; Buffer += char('0' + F.FG); } + if (F.BG != 9) { Buffer += ";4"; Buffer += char('0' + F.BG); } + if (F.Frame) Buffer += (F.Frame == 1 ? ";51" : ";52"); + if (F.Overline) Buffer += ";53"; + Buffer.push_back('m'); +} + +//===----------------------------------------------------------------------===// // Trigraph and Escaped Newline Handling Code. //===----------------------------------------------------------------------===// @@ -1490,7 +1553,94 @@ return true; } +bool Lexer::LexColoredIdentifier(Token &Result, const char *CurPtr) { + assert(!isLexingRawMode() && "unnecessary in raw mode"); + + // Slurp the characters prior to the first color code. + llvm::SmallString<64> ColoredIdentifier; + ColoredIdentifier.append(StringRef(BufferPtr, CurPtr - BufferPtr)); + const char *Copied = CurPtr; + + unsigned Size; + char C = getCharAndSize(CurPtr, Size); + + // If we have pre-existing color, and the identifier doesn't start in color, + // then add a leading color code. 
+ if (C != '\x1b') + Rendition.render(ColoredIdentifier); + + while (1) { + if (C == '$') { + if (!LangOpts.DollarIdents) + break; + if (!isLexingRawMode()) + Diag(CurPtr, diag::ext_dollar_in_identifier); + } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { + C = getCharAndSize(CurPtr, Size); + continue; + } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { + C = getCharAndSize(CurPtr, Size); + continue; + } else if (C == '\x1b' && CurPtr[Size] == '[') { + ColoredIdentifier.append(StringRef(Copied, CurPtr - Copied)); + Copied = CurPtr; + + GraphicsRendition Old = Rendition; + + while (C == '\x1b' && CurPtr[Size] == '[') { + CurPtr += Size - 1; + if (!LexANSIEscapeCode(CurPtr)) { + CurPtr -= Size - 1; + goto FinishIdentifier; + } + Copied = CurPtr; + } + + if (Old != Rendition) + Rendition.render(ColoredIdentifier); + + C = getCharAndSize(CurPtr, Size); + continue; + } else if (!isIdentifierBody(C)) { + break; + } + + // Otherwise, this character is good, consume it. + CurPtr = ConsumeChar(CurPtr, Size, Result); + + C = getCharAndSize(CurPtr, Size); + while (isIdentifierBody(C)) { + CurPtr = ConsumeChar(CurPtr, Size, Result); + C = getCharAndSize(CurPtr, Size); + } + } + +FinishIdentifier: + // Slurp remaining identifier characters. + ColoredIdentifier.append(StringRef(Copied, CurPtr - Copied)); + + // Finish by returning the identifier to a neutral state. + if (Rendition.isSet()) + GraphicsRendition().render(ColoredIdentifier); + FormTokenWithChars(Result, CurPtr, tok::raw_identifier); + PP->CreateString(ColoredIdentifier, Result); + + // Fill in Result.IdentifierInfo and update the token kind, + // looking up the identifier in the identifier table. + IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); + + // Finally, now that we know we have an identifier, pass this off to the + // preprocessor, which may macro expand it or something. 
+ if (II->isHandleIdentifierCase()) + return PP->HandleIdentifier(Result); + + return true; +} + bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { + if (Rendition.isSet()) + return LexColoredIdentifier(Result, CurPtr); + // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] unsigned Size; unsigned char C = *CurPtr++; @@ -1504,7 +1654,7 @@ // // TODO: Could merge these checks into an InfoTable flag to make the // comparison cheaper - if (isASCII(C) && C != '\\' && C != '?' && + if (isASCII(C) && C != '\\' && C != '?' && C != '\x1b' && (C != '$' || !LangOpts.DollarIdents)) { FinishIdentifier: const char *IdStart = BufferPtr; @@ -1528,7 +1678,7 @@ return true; } - // Otherwise, $,\,? in identifier found. Enter slower path. + // Otherwise, $,\,?,\e in identifier found. Enter slower path. C = getCharAndSize(CurPtr, Size); while (1) { @@ -1549,6 +1699,18 @@ } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { C = getCharAndSize(CurPtr, Size); continue; + } else if (C == '\x1b' && CurPtr[Size] == '[') { + if (!isLexingRawMode()) + return LexColoredIdentifier(Result, CurPtr); + // In raw mode, just skip over the escape code and include it in the + // identifier. + CurPtr += Size - 1; + if (!LexANSIEscapeCode(CurPtr)) { + CurPtr -= Size - 1; + goto FinishIdentifier; + } + C = getCharAndSize(CurPtr, Size); + continue; } else if (!isIdentifierBody(C)) { goto FinishIdentifier; } @@ -2705,6 +2867,57 @@ return false; } +bool Lexer::LexANSIEscapeCode(const char *&Ptr) { + // Note that '\x1b' is left alone in raw mode. + assert(Ptr[0] == '\x1b' && Ptr[1] == '[' && "not an ANSI escape code"); + + const char *CurPtr = Ptr + 2; + bool Unsupported = false; + + // Read simple parameter bytes. + llvm::SmallVector Params; + if (*CurPtr >= 0x30 && *CurPtr <= 0x3b) { + do { + // Read a parameter. + unsigned N = 0; + // FIXME: Do something better than wrapping on overflow. 
+ while (isDigit(*CurPtr)) + N = N * 10 + (*CurPtr++ - '0'); + Params.push_back(N); + } while (*CurPtr++ == ';'); + --CurPtr; + } + + // Private use parameter string. + while (*CurPtr >= 0x30 && *CurPtr <= 0x3f) { + Unsupported = true; + ++CurPtr; + } + + // Intermediate bytes. + while (*CurPtr >= 0x20 && *CurPtr <= 0x2f) { + Unsupported = true; + ++CurPtr; + } + + if (*CurPtr < 0x40 || *CurPtr >= 0x7f) + // Not an ANSI escape sequence. + return false; + + if (!Unsupported && *CurPtr == 'm') { + // Set graphic rendition code. + if (Params.empty()) + Rendition.add(0); + for (unsigned SGR : Params) + Rendition.add(SGR); + } else if (!isLexingRawMode()) { + Diag(BufferPtr, diag::err_unsupported_ansi_escape); + } + + Ptr = CurPtr + 1; + return true; +} + uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result) { unsigned CharSize; @@ -3585,6 +3798,15 @@ default: { if (isASCII(Char)) { + if (Char == '\x1b') { + --CurPtr; + if (CurPtr[1] == '[' && LexANSIEscapeCode(CurPtr)) { + BufferPtr = CurPtr; + goto LexNextToken; + } + ++CurPtr; + } + Kind = tok::unknown; break; }