Index: include/llvm/MC/MCParser/AsmLexer.h =================================================================== --- include/llvm/MC/MCParser/AsmLexer.h +++ include/llvm/MC/MCParser/AsmLexer.h @@ -45,17 +45,15 @@ void setBuffer(StringRef Buf, const char *ptr = nullptr); StringRef LexUntilEndOfStatement() override; - StringRef LexUntilEndOfLine(); size_t peekTokens(MutableArrayRef Buf, bool ShouldSkipSpace = true) override; - bool isAtStartOfComment(const char *Ptr); - bool isAtStatementSeparator(const char *Ptr); - const MCAsmInfo &getMAI() const { return MAI; } private: + bool isAtStartOfComment(const char *Ptr); + bool isAtStatementSeparator(const char *Ptr); int getNextChar(); AsmToken ReturnError(const char *Loc, const std::string &Msg); @@ -67,6 +65,8 @@ AsmToken LexQuote(); AsmToken LexFloatLiteral(); AsmToken LexHexFloatLiteral(bool NoIntDigits); + + StringRef LexUntilEndOfLine(); }; } // end namespace llvm Index: include/llvm/MC/MCParser/MCAsmLexer.h =================================================================== --- include/llvm/MC/MCParser/MCAsmLexer.h +++ include/llvm/MC/MCParser/MCAsmLexer.h @@ -25,7 +25,8 @@ public: enum TokenKind { // Markers - Eof, Error, + Eof, + Error, // String values. Identifier, @@ -38,20 +39,48 @@ // Real values. Real, + // Comments + Comment, + HashDirective, // No-value. EndOfStatement, Colon, Space, - Plus, Minus, Tilde, - Slash, // '/' + Plus, + Minus, + Tilde, + Slash, // '/' BackSlash, // '\' - LParen, RParen, LBrac, RBrac, LCurly, RCurly, - Star, Dot, Comma, Dollar, Equal, EqualEqual, - - Pipe, PipePipe, Caret, - Amp, AmpAmp, Exclaim, ExclaimEqual, Percent, Hash, - Less, LessEqual, LessLess, LessGreater, - Greater, GreaterEqual, GreaterGreater, At + LParen, + RParen, + LBrac, + RBrac, + LCurly, + RCurly, + Star, + Dot, + Comma, + Dollar, + Equal, + EqualEqual, + + Pipe, + PipePipe, + Caret, + Amp, + AmpAmp, + Exclaim, + ExclaimEqual, + Percent, + Hash, + Less, + LessEqual, + LessLess, + LessGreater, + Greater, + GreaterEqual, + GreaterGreater, + At }; private: @@ -152,8 +181,9 @@ const AsmToken &Lex() { assert(!CurTok.empty()); CurTok.erase(CurTok.begin()); + // Always place in front as LexToken may generate multiple tokens via UnLex. if (CurTok.empty()) - CurTok.emplace_back(LexToken()); + CurTok.insert(CurTok.begin(), LexToken()); return CurTok.front(); } Index: lib/MC/MCParser/AsmLexer.cpp =================================================================== --- lib/MC/MCParser/AsmLexer.cpp +++ lib/MC/MCParser/AsmLexer.cpp @@ -50,20 +50,9 @@ } int AsmLexer::getNextChar() { - char CurChar = *CurPtr++; - switch (CurChar) { - default: - return (unsigned char)CurChar; - case 0: - // A nul character in the stream is either the end of the current buffer or - // a random nul in the file. Disambiguate that here. - if (CurPtr - 1 != CurBuf.end()) - return 0; // Just whitespace. - - // Otherwise, return end of file. - --CurPtr; // Another call to lex will return EOF again. + if (CurPtr == CurBuf.end()) return EOF; - } + return (unsigned char)*CurPtr++; } /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? @@ -174,38 +163,45 @@ ++CurPtr; return LexLineComment(); default: + isAtStartOfLine = false; return AsmToken(AsmToken::Slash, StringRef(CurPtr - 1, 1)); } // C Style comment. ++CurPtr; // skip the star. - while (1) { - int CurChar = getNextChar(); - switch (CurChar) { - case EOF: - return ReturnError(TokStart, "unterminated comment"); + while (CurPtr != CurBuf.end()) { + switch (*CurPtr++) { case '*': // End of the comment? - if (CurPtr[0] != '/') break; - + if (*CurPtr != '/') + break; ++CurPtr; // End the */. - return LexToken(); + return AsmToken(AsmToken::Comment, + StringRef(TokStart, CurPtr - TokStart)); } } + return ReturnError(TokStart, "unterminated comment"); } /// LexLineComment: Comment: #[^\n]* /// : //[^\n]* AsmToken AsmLexer::LexLineComment() { - // FIXME: This is broken if we happen to a comment at the end of a file, which - // was .included, and which doesn't end with a newline. + // Mark This as an end of statement with a body of the + // comment. While it would be nicer to leave this two tokens, + // backwards compatability with TargetParsers makes keeping this in this form + // better. int CurChar = getNextChar(); while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) CurChar = getNextChar(); - if (CurChar == EOF) - return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); - return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); + // Whis is a whole line comment. leave newline + if (isAtStartOfLine) + return AsmToken(AsmToken::EndOfStatement, + StringRef(TokStart, CurPtr - TokStart)); + isAtStartOfLine = true; + + return AsmToken(AsmToken::EndOfStatement, + StringRef(TokStart, CurPtr - 1 - TokStart)); } static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { @@ -423,8 +419,7 @@ while (!isAtStartOfComment(CurPtr) && // Start of line comment. !isAtStatementSeparator(CurPtr) && // End of statement marker. - *CurPtr != '\n' && *CurPtr != '\r' && - (*CurPtr != 0 || CurPtr != CurBuf.end())) { + *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); @@ -433,8 +428,7 @@ StringRef AsmLexer::LexUntilEndOfLine() { TokStart = CurPtr; - while (*CurPtr != '\n' && *CurPtr != '\r' && - (*CurPtr != 0 || CurPtr != CurBuf.end())) { + while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); @@ -495,13 +489,24 @@ // This always consumes at least one character. int CurChar = getNextChar(); + if (CurChar == '#' && isAtStartOfLine) { + // If this comment starts with a '#', this may be a cpp + // hash directive and otherwise a line comment. + AsmToken TokenBuf[2]; + MutableArrayRef Buf(TokenBuf, 2); + size_t num = peekTokens(Buf, true); + if (num == 2 && TokenBuf[0].is(AsmToken::Integer) && + TokenBuf[1].is(AsmToken::String)) { + CurPtr = TokStart; // reset curPtr; + StringRef s = LexUntilEndOfLine(); + UnLex(TokenBuf[1]); + UnLex(TokenBuf[0]); + return AsmToken(AsmToken::HashDirective, s); + } + return LexLineComment(); + } + if (isAtStartOfComment(TokStart)) { - // If this comment starts with a '#', then return the Hash token and let - // the assembler parser see if it can be parsed as a cpp line filename - // comment. We do this only if we are at the start of a line. - if (CurChar == '#' && isAtStartOfLine) - return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); - isAtStartOfLine = true; return LexLineComment(); } if (isAtStatementSeparator(TokStart)) { @@ -516,7 +521,7 @@ isAtStartOfLine = true; return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); } - + bool OldIsAtStartOfLine = isAtStartOfLine; isAtStartOfLine = false; switch (CurChar) { default: @@ -526,10 +531,13 @@ // Unknown character, emit an error. return ReturnError(TokStart, "invalid character in input"); - case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); + case EOF: + isAtStartOfLine = true; + return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); case 0: case ' ': case '\t': + isAtStartOfLine = OldIsAtStartOfLine; if (SkipSpace) { // Ignore whitespace. return LexToken(); @@ -586,7 +594,9 @@ } return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); - case '/': return LexSlash(); + case '/': // preserve line starts in case of comment. + isAtStartOfLine = OldIsAtStartOfLine; + return LexSlash(); case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); case '\'': return LexSingleQuote(); case '"': return LexQuote(); Index: lib/MC/MCParser/AsmParser.cpp =================================================================== --- lib/MC/MCParser/AsmParser.cpp +++ lib/MC/MCParser/AsmParser.cpp @@ -257,7 +257,6 @@ bool parseStatement(ParseStatementInfo &Info, MCAsmParserSemaCallback *SI); bool parseCurlyBlockScope(SmallVectorImpl& AsmStrRewrites); - void eatToEndOfLine(); bool parseCppHashLineFilenameComment(SMLoc L); void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body, @@ -625,6 +624,10 @@ const AsmToken &AsmParser::Lex() { const AsmToken *tok = &Lexer.Lex(); + // Parse comments here. + while (tok->is(AsmToken::Comment)) { + tok = &Lexer.Lex(); + } if (tok->is(AsmToken::Eof)) { // If this is the end of an included file, pop the parent file off the @@ -632,7 +635,7 @@ SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); if (ParentIncludeLoc != SMLoc()) { jumpToLoc(ParentIncludeLoc); - tok = &Lexer.Lex(); + return Lex(); } } @@ -713,8 +716,8 @@ // first referenced for a source location. We need to add something // to track that. Currently, we just point to the end of the file. HadError |= - Error(getLexer().getLoc(), "assembler local symbol '" + - Sym->getName() + "' not defined"); + Error(getTok().getLoc(), "assembler local symbol '" + + Sym->getName() + "' not defined"); } } @@ -748,18 +751,18 @@ /// \brief Throw away the rest of the line for testing purposes. void AsmParser::eatToEndOfStatement() { while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof)) - Lex(); + Lexer.Lex(); // Eat EOL. if (Lexer.is(AsmToken::EndOfStatement)) - Lex(); + Lexer.Lex(); } StringRef AsmParser::parseStringToEndOfStatement() { const char *Start = getTok().getLoc().getPointer(); while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof)) - Lex(); + Lexer.Lex(); const char *End = getTok().getLoc().getPointer(); return StringRef(Start, End - Start); @@ -770,7 +773,7 @@ while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Comma) && Lexer.isNot(AsmToken::Eof)) - Lex(); + Lexer.Lex(); const char *End = getTok().getLoc().getPointer(); return StringRef(Start, End - Start); @@ -852,7 +855,7 @@ if (!MAI.useParensForSymbolVariant()) { if (FirstTokenKind == AsmToken::String) { if (Lexer.is(AsmToken::At)) { - Lexer.Lex(); // eat @ + Lex(); // eat @ SMLoc AtLoc = getLexer().getLoc(); StringRef VName; if (parseIdentifier(VName)) @@ -864,14 +867,14 @@ Split = Identifier.split('@'); } } else if (Lexer.is(AsmToken::LParen)) { - Lexer.Lex(); // eat ( + Lex(); // eat '('. StringRef VName; parseIdentifier(VName); if (Lexer.isNot(AsmToken::RParen)) { return Error(Lexer.getTok().getLoc(), "unexpected token in variant, expected ')'"); } - Lexer.Lex(); // eat ) + Lex(); // eat ')'. Split = std::make_pair(Identifier, VName); } @@ -1336,19 +1339,23 @@ /// ::= Label* Identifier OperandList* EndOfStatement bool AsmParser::parseStatement(ParseStatementInfo &Info, MCAsmParserSemaCallback *SI) { + // Eat initial spaces and comments + while (Lexer.is(AsmToken::Space)) + Lex(); if (Lexer.is(AsmToken::EndOfStatement)) { - Out.AddBlankLine(); + // if this is a line comment we can drop it safely + if (getTok().getString().front() == '\r' || + getTok().getString().front() == '\n') + Out.AddBlankLine(); Lex(); return false; } - - // Statements always start with an identifier or are a full line comment. + // Statements always start with an identifier. AsmToken ID = getTok(); SMLoc IDLoc = ID.getLoc(); StringRef IDVal; int64_t LocalLabelVal = -1; - // A full line comment is a '#' as the first token. - if (Lexer.is(AsmToken::Hash)) + if (Lexer.is(AsmToken::HashDirective)) return parseCppHashLineFilenameComment(IDLoc); // Allow an integer followed by a ':' as a directional local label. @@ -1641,7 +1648,8 @@ return parseDirectiveIncbin(); case DK_CODE16: case DK_CODE16GCC: - return TokError(Twine(IDVal) + " not supported yet"); + return TokError(Twine(IDVal) + + " not currently supported for this target"); case DK_REPT: return parseDirectiveRept(IDLoc, IDVal); case DK_IRP: @@ -1861,37 +1869,20 @@ return true; } -/// eatToEndOfLine uses the Lexer to eat the characters to the end of the line -/// since they may not be able to be tokenized to get to the end of line token. -void AsmParser::eatToEndOfLine() { - if (!Lexer.is(AsmToken::EndOfStatement)) - Lexer.LexUntilEndOfLine(); - // Eat EOL. - Lex(); -} - /// parseCppHashLineFilenameComment as this: /// ::= # number "filename" -/// or just as a full line comment if it doesn't have a number and a string. bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) { Lex(); // Eat the hash token. - - if (getLexer().isNot(AsmToken::Integer)) { - // Consume the line since in cases it is not a well-formed line directive, - // as if were simply a full line comment. - eatToEndOfLine(); - return false; - } - + // Lexer only ever emits HashDirective if it fully formed if it's + // done the checking already so this is an internal error. + assert(getTok().is(AsmToken::Integer) && + "Lexing Cpp line comment: Expected Integer"); int64_t LineNumber = getTok().getIntVal(); Lex(); - - if (getLexer().isNot(AsmToken::String)) { - eatToEndOfLine(); - return false; - } - + assert(getTok().is(AsmToken::String) && + "Lexing Cpp line comment: Expected String"); StringRef Filename = getTok().getString(); + Lex(); // Get rid of the enclosing quotes. Filename = Filename.substr(1, Filename.size() - 2); @@ -1900,9 +1891,6 @@ CppHashInfo.Filename = Filename; CppHashInfo.LineNumber = LineNumber; CppHashInfo.Buf = CurBuffer; - - // Ignore any trailing characters, they're just comment. - eatToEndOfLine(); return false; } @@ -2261,7 +2249,7 @@ break; if (FAI >= NParameters) { - assert(M && "expected macro to be defined"); + assert(M && "expected macro to be defined"); Error(IDLoc, "parameter named '" + FA.Name + "' does not exist for macro '" + M->Name + "'"); @@ -2561,16 +2549,16 @@ if (Lexer.isNot(AsmToken::Comma)) return TokError("expected comma"); - Lexer.Lex(); + Lex(); if (Lexer.isNot(AsmToken::Identifier)) return TokError("expected relocation name"); SMLoc NameLoc = Lexer.getTok().getLoc(); StringRef Name = Lexer.getTok().getIdentifier(); - Lexer.Lex(); + Lex(); if (Lexer.is(AsmToken::Comma)) { - Lexer.Lex(); + Lex(); SMLoc ExprLoc = Lexer.getLoc(); if (parseExpression(Expr)) return true; @@ -3776,8 +3764,8 @@ Lex(); } - // Eat the end of statement. - Lex(); + // Eat just the end of statement. + Lexer.Lex(); AsmToken EndToken, StartToken = getTok(); unsigned MacroDepth = 0; @@ -3794,7 +3782,7 @@ getTok().getIdentifier() == ".endmacro") { if (MacroDepth == 0) { // Outermost macro. EndToken = getTok(); - Lex(); + Lexer.Lex(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '" + EndToken.getIdentifier() + "' directive"); @@ -5254,10 +5242,9 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef, MCAsmParser &Parser, MCSymbol *&Sym, const MCExpr *&Value) { - MCAsmLexer &Lexer = Parser.getLexer(); // FIXME: Use better location, we should use proper tokens. - SMLoc EqualLoc = Lexer.getLoc(); + SMLoc EqualLoc = Parser.getTok().getLoc(); if (Parser.parseExpression(Value)) { Parser.TokError("missing expression"); @@ -5269,7 +5256,7 @@ // a = b // b = c - if (Lexer.isNot(AsmToken::EndOfStatement)) + if (Parser.getTok().isNot(AsmToken::EndOfStatement)) return Parser.TokError("unexpected token in assignment"); // Eat the end of statement marker. Index: lib/Target/ARM/AsmParser/ARMAsmParser.cpp =================================================================== --- lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -9932,7 +9932,7 @@ StringRef Arch = Parser.getTok().getString(); SMLoc ArchLoc = Parser.getTok().getLoc(); - getLexer().Lex(); + Lex(); unsigned ID = ARM::parseArch(Arch); @@ -10054,7 +10054,7 @@ StringRef Name = Parser.getTok().getString(); SMLoc ExtLoc = Parser.getTok().getLoc(); - getLexer().Lex(); + Lex(); bool EnableFeature = true; if (Name.startswith_lower("no")) { Index: lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp =================================================================== --- lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -729,11 +729,10 @@ bool HexagonAsmParser::matchBundleOptions() { MCAsmParser &Parser = getParser(); - MCAsmLexer &Lexer = getLexer(); while (true) { if (!Parser.getTok().is(AsmToken::Colon)) return false; - Lexer.Lex(); + Lex(); StringRef Option = Parser.getTok().getString(); if (Option.compare_lower("endloop0") == 0) HexagonMCInstrInfo::setInnerLoop(MCB); @@ -745,7 +744,7 @@ HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB); else return true; - Lexer.Lex(); + Lex(); } } @@ -1105,7 +1104,7 @@ AsmToken const &Token = getParser().getTok(); StringRef String = Token.getString(); SMLoc Loc = Token.getLoc(); - getLexer().Lex(); + Lex(); do { std::pair HeadTail = String.split('.'); if (!HeadTail.first.empty()) @@ -1220,7 +1219,7 @@ Token.getString().data() - RawString.data () + Token.getString().size()); Lookahead.push_back(Token); - Lexer.Lex(); + Lex(); bool Contigious = Lexer.getTok().getString().data() == Lookahead.back().getString().data() + Lookahead.back().getString().size(); @@ -1297,7 +1296,7 @@ static char const * Comma = ","; do { Tokens.emplace_back (Lexer.getTok()); - Lexer.Lex(); + Lex(); switch (Tokens.back().getKind()) { case AsmToken::TokenKind::Hash: @@ -1346,7 +1345,7 @@ AsmToken const &Token = Parser.getTok(); switch (Token.getKind()) { case AsmToken::EndOfStatement: { - Lexer.Lex(); + Lex(); return false; } case AsmToken::LCurly: { @@ -1354,19 +1353,19 @@ return true; Operands.push_back( HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); - Lexer.Lex(); + Lex(); return false; } case AsmToken::RCurly: { if (Operands.empty()) { Operands.push_back( HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); - Lexer.Lex(); + Lex(); } return false; } case AsmToken::Comma: { - Lexer.Lex(); + Lex(); continue; } case AsmToken::EqualEqual: @@ -1379,7 +1378,7 @@ Token.getString().substr(0, 1), Token.getLoc())); Operands.push_back(HexagonOperand::CreateToken( Token.getString().substr(1, 1), Token.getLoc())); - Lexer.Lex(); + Lex(); continue; } case AsmToken::Hash: { @@ -1389,12 +1388,12 @@ if (!ImplicitExpression) Operands.push_back( HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); - Lexer.Lex(); + Lex(); bool MustExtend = false; bool HiOnly = false; bool LoOnly = false; if (Lexer.is(AsmToken::Hash)) { - Lexer.Lex(); + Lex(); MustExtend = true; } else if (ImplicitExpression) MustNotExtend = true; @@ -1413,7 +1412,7 @@ HiOnly = false; LoOnly = false; } else { - Lexer.Lex(); + Lex(); } } } Index: lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp =================================================================== --- lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -701,7 +701,7 @@ Error(Lexer.getLoc(), "Expected '('"); return 0; } - Lexer.Lex(); // lex '(' + Parser.Lex(); // lex '('. // Parse identifier if (Parser.parseIdentifier(Identifier)) @@ -718,7 +718,7 @@ Error(Lexer.getLoc(), "Expected ')'"); return 0; } - Lexer.Lex(); // lex ')' + Lex(); // lex ')'. } End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); @@ -1122,7 +1122,7 @@ // Parse until end of statement, consuming commas between operands while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.is(AsmToken::Comma)) { // Consume comma token - Lexer.Lex(); + Lex(); // Parse next operand if (parseOperand(&Operands, Mnemonic) != MatchOperand_Success) Index: lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp =================================================================== --- lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1663,13 +1663,13 @@ // instruction name, to match what TableGen is doing. std::string NewOpcode; if (getLexer().is(AsmToken::Plus)) { - getLexer().Lex(); + Lex(); NewOpcode = Name; NewOpcode += '+'; Name = NewOpcode; } if (getLexer().is(AsmToken::Minus)) { - getLexer().Lex(); + Lex(); NewOpcode = Name; NewOpcode += '-'; Name = NewOpcode; @@ -1704,7 +1704,7 @@ while (getLexer().isNot(AsmToken::EndOfStatement) && getLexer().is(AsmToken::Comma)) { // Consume the comma token - getLexer().Lex(); + Lex(); // Parse the next operand if (ParseOperand(Operands)) Index: lib/Target/X86/AsmParser/X86AsmParser.cpp =================================================================== --- lib/Target/X86/AsmParser/X86AsmParser.cpp +++ lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1479,7 +1479,7 @@ const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); do { End = Tok.getEndLoc(); - getLexer().Lex(); + getParser().Lex(); } while (End.getPointer() < EndPtr); Identifier = LineBuf;