[pseudo] Split the greatergreater token.

clang's lexer always lexes `>>` as a single token, which hides the
nested-template reading from the GLR parser. Cooking a token stream now emits
two `>` tokens instead, and cxx.bnf spells the right-shift operator as the
nonterminal `greatergreater := > >`.

NOTE(review): this patch was recovered from a whitespace-collapsed copy, so the
indentation of context lines is reconstructed; if a hunk is rejected, apply
with `git apply --ignore-whitespace` (or `patch -l`). The template argument of
the static_cast context line in Lex.cpp was missing from that copy; it is
presumably `decltype(Tok.Flags)` and should be verified against the tree.

diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -180,7 +180,8 @@
   NeedsCleaning = 1 << 1,
 };
 
-/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
+/// Derives a token stream by decoding escapes, interpreting raw_identifiers and
+/// splitting the greatergreater token.
 ///
 /// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
 /// their backing data is owned by the returned stream.
diff --git a/clang-tools-extra/pseudo/lib/Lex.cpp b/clang-tools-extra/pseudo/lib/Lex.cpp
--- a/clang-tools-extra/pseudo/lib/Lex.cpp
+++ b/clang-tools-extra/pseudo/lib/Lex.cpp
@@ -98,9 +98,21 @@
       Tok.Length = Text.size();
       Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
     }
-    // Cook raw_identifiers into identifier, keyword, etc.
-    if (Tok.Kind == tok::raw_identifier)
+
+    if (Tok.Kind == tok::raw_identifier) {
+      // Cook raw_identifiers into identifier, keyword, etc.
       Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
+    } else if (Tok.Kind == tok::greatergreater) {
+      // Split the greatergreater token.
+      // FIXME: split lessless token to support CUDA triple angle brackets <<<.
+      assert(Tok.text() == ">>");
+      Tok.Kind = tok::greater;
+      Tok.Length = 1;
+      Result.push(Tok);
+      // Line is wrong if the first greater is followed by an escaped newline!
+      Tok.Data = Tok.text().data() + 1;
+    }
+
     Result.push(std::move(Tok));
   }
 
diff --git a/clang-tools-extra/pseudo/lib/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx.bnf
--- a/clang-tools-extra/pseudo/lib/cxx.bnf
+++ b/clang-tools-extra/pseudo/lib/cxx.bnf
@@ -13,6 +13,9 @@
 #  - the file merely describes the core C++ grammar. Preprocessor directives and
 #    lexical conversions are omitted as we reuse clang's lexer and run a fake
 #    preprocessor;
+#  - grammar rules with the >> token are adjusted, the greatergreater token is
+#    split into two > tokens, to make the GLR parser aware of nested templates
+#    and right shift operator;
 #
 # Guidelines:
 #  - non-terminals are lower_case; terminals (aka tokens) correspond to
@@ -96,7 +99,7 @@
 fold-operator := ^
 fold-operator := |
 fold-operator := <<
-fold-operator := >>
+fold-operator := greatergreater
 fold-operator := +=
 fold-operator := -=
 fold-operator := *=
@@ -202,7 +205,7 @@
 # expr.shift
 shift-expression := additive-expression
 shift-expression := shift-expression << additive-expression
-shift-expression := shift-expression >> additive-expression
+shift-expression := shift-expression greatergreater additive-expression
 # expr.spaceship
 compare-expression := shift-expression
 compare-expression := compare-expression <=> shift-expression
@@ -615,7 +618,7 @@
 operator-name := ^^
 operator-name := ||
 operator-name := <<
-operator-name := >>
+operator-name := greatergreater
 operator-name := <<=
 operator-name := >>=
 operator-name := ++
@@ -737,3 +740,8 @@
 module-keyword := IDENTIFIER
 import-keyword := IDENTIFIER
 export-keyword := IDENTIFIER
+
+#! greatergreater token -- clang lexer always lexes it as a single token, we
+#! split it into two tokens to make the GLR parser aware of the nested-template
+#! case.
+greatergreater := > >
diff --git a/clang-tools-extra/pseudo/unittests/TokenTest.cpp b/clang-tools-extra/pseudo/unittests/TokenTest.cpp
--- a/clang-tools-extra/pseudo/unittests/TokenTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/TokenTest.cpp
@@ -171,6 +171,25 @@
                                }));
 }
 
+TEST(TokenTest, SplitGreaterGreater) {
+  LangOptions Opts;
+  std::string Code = R"cpp(
+>> // split
+// >> with an escaped newline in the middle, split
+>\
+>
+>>= // not split
+)cpp";
+  TokenStream Split = stripComments(cook(lex(Code, Opts), Opts));
+  EXPECT_THAT(Split.tokens(), ElementsAreArray({
+                                  token(">", tok::greater),
+                                  token(">", tok::greater),
+                                  token(">", tok::greater),
+                                  token(">", tok::greater),
+                                  token(">>=", tok::greatergreaterequal),
+                              }));
+}
+
 TEST(TokenTest, DropComments) {
   LangOptions Opts;
   std::string Code = R"cpp(