diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h --- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h +++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h @@ -199,12 +199,15 @@ clang::Language = clang::Language::CXX, clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); -/// Derives a token stream by decoding escapes, interpreting raw_identifiers and -/// splitting the greatergreater token. +/// Decodes raw tokens written in the source code, returning a derived stream. /// -/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and -/// their backing data is owned by the returned stream. -/// raw_identifier tokens are assigned specific types (identifier, keyword etc). +/// - escaped newlines within tokens are removed +/// - trigraphs are replaced with the characters they encode +/// - UCNs within raw_identifiers are replaced by the characters they encode +/// (UCNs within strings, comments etc are not translated) +/// - raw_identifier tokens are assigned their specific type (identifier, keyword etc) +/// - the >> token is split into separate > > tokens +/// (we use a modified grammar where >> is a nonterminal, not a token) /// /// The StartsPPLine flag is preserved. /// diff --git a/clang-tools-extra/pseudo/lib/Lex.cpp b/clang-tools-extra/pseudo/lib/Lex.cpp --- a/clang-tools-extra/pseudo/lib/Lex.cpp +++ b/clang-tools-extra/pseudo/lib/Lex.cpp @@ -90,12 +90,23 @@ assert(CharSize != 0 && "no progress!"); Pos += CharSize; } - // Remove universal character names (UCN). + llvm::StringRef Text = CleanBuffer; llvm::SmallString<64> UCNBuffer; - clang::expandUCNs(UCNBuffer, CleanBuffer); + // A surface reading of the standard suggests UCNs might appear anywhere. + // But we need only decode them in raw_identifiers. 
+ // - they cannot appear in punctuation/keyword tokens, because UCNs + // cannot encode basic characters outside of literals [lex.charset] + // - they can appear in literals, but we need not unescape them now. + // We treat them as escape sequences when evaluating the literal. + // - comments are handled similarly to literals + // This is good fortune, because expandUCNs requires its input to be a + // reasonably valid identifier (e.g. without stray backslashes). + if (Tok.Kind == tok::raw_identifier) { + clang::expandUCNs(UCNBuffer, CleanBuffer); + Text = UCNBuffer; + } - llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage); - Tok.Data = Text.data(); + Tok.Data = Text.copy(*CleanedStorage).data(); Tok.Length = Text.size(); Tok.Flags &= ~static_cast(LexFlags::NeedsCleaning); } diff --git a/clang-tools-extra/pseudo/test/crash/backslashes.c b/clang-tools-extra/pseudo/test/crash/backslashes.c new file mode 100644 --- /dev/null +++ b/clang-tools-extra/pseudo/test/crash/backslashes.c @@ -0,0 +1,4 @@ +// We used to try to interpret these backslashes as UCNs. +// RUN: clang-pseudo -source=%s -print-tokens +\ +\ x diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp --- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -17,6 +17,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Signals.h" using clang::pseudo::Grammar; using llvm::cl::desc; @@ -52,6 +53,7 @@ int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); + llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); std::string SourceText;