Index: llvm/include/llvm/MC/MCParser/AsmLexer.h =================================================================== --- llvm/include/llvm/MC/MCParser/AsmLexer.h +++ llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -49,10 +49,13 @@ bool ShouldSkipSpace = true) override; const MCAsmInfo &getMAI() const { return MAI; } + bool isEOF() const; + bool tokOutOfBounds(const AsmToken &tok) const; private: bool isAtStartOfComment(const char *Ptr); bool isAtStatementSeparator(const char *Ptr); + int peekPtr(int offset = 0); int getNextChar(); AsmToken ReturnError(const char *Loc, const std::string &Msg); Index: llvm/lib/MC/MCParser/AsmLexer.cpp =================================================================== --- llvm/lib/MC/MCParser/AsmLexer.cpp +++ llvm/lib/MC/MCParser/AsmLexer.cpp @@ -61,25 +61,43 @@ return (unsigned char)*CurPtr++; } -/// The leading integral digit sequence and dot/e should have already been +/// peekPtr: Return the character at CurPtr + offset without consuming it, +/// as an unsigned char value like getNextChar(), or EOF when that position +/// lies outside the buffer, so callers never read out of bounds. +int AsmLexer::peekPtr(int offset) { + if (CurPtr + offset >= CurBuf.end() || CurPtr + offset < CurBuf.begin()) + return EOF; + return (unsigned char)*(CurPtr + offset); +} + +bool AsmLexer::isEOF() const { return CurPtr == CurBuf.end(); } + +bool AsmLexer::tokOutOfBounds(const AsmToken &tok) const { + const char *Loc = tok.getLoc().getPointer(); + return Loc >= CurBuf.end(); +} + +/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? +/// +/// The leading integral digit sequence and dot should have already been /// consumed, some or all of the fractional digit sequence *can* have been /// consumed. AsmToken AsmLexer::LexFloatLiteral() { // Skip the fractional digit sequence. 
- while (isDigit(*CurPtr)) + while (isDigit(peekPtr())) ++CurPtr; - if (*CurPtr == '-' || *CurPtr == '+') + if (peekPtr() == '-' || peekPtr() == '+') return ReturnError(CurPtr, "Invalid sign in float literal"); // Check for exponent - if ((*CurPtr == 'e' || *CurPtr == 'E')) { + if ((peekPtr() == 'e' || peekPtr() == 'E')) { ++CurPtr; - if (*CurPtr == '-' || *CurPtr == '+') + if (peekPtr() == '-' || peekPtr() == '+') ++CurPtr; - while (isDigit(*CurPtr)) + while (isDigit(peekPtr())) ++CurPtr; } @@ -94,16 +112,16 @@ /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed /// before we get here. AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { - assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && + assert((peekPtr() == 'p' || peekPtr() == 'P' || peekPtr() == '.') && "unexpected parse state in floating hex"); bool NoFracDigits = true; // Skip the fractional part if there is one - if (*CurPtr == '.') { + if (peekPtr() == '.') { ++CurPtr; const char *FracStart = CurPtr; - while (isHexDigit(*CurPtr)) + while (isHexDigit(peekPtr())) ++CurPtr; NoFracDigits = CurPtr == FracStart; @@ -114,17 +132,17 @@ "expected at least one significand digit"); // Make sure we do have some kind of proper exponent part - if (*CurPtr != 'p' && *CurPtr != 'P') + if (peekPtr() != 'p' && peekPtr() != 'P') return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " "expected exponent part 'p'"); ++CurPtr; - if (*CurPtr == '+' || *CurPtr == '-') + if (peekPtr() == '+' || peekPtr() == '-') ++CurPtr; // N.b. exponent digits are *not* hex const char *ExpStart = CurPtr; - while (isDigit(*CurPtr)) + while (isDigit(peekPtr())) ++CurPtr; if (CurPtr == ExpStart) @@ -142,17 +160,17 @@ AsmToken AsmLexer::LexIdentifier() { // Check for floating point literals. - if (CurPtr[-1] == '.' && isDigit(*CurPtr)) { + if (peekPtr(-1) == '.' && isDigit(peekPtr())) { // Disambiguate a .1243foo identifier from a floating literal. 
- while (isDigit(*CurPtr)) + while (isDigit(peekPtr())) ++CurPtr; - if (!IsIdentifierChar(*CurPtr, AllowAtInIdentifier) || - *CurPtr == 'e' || *CurPtr == 'E') + if (!IsIdentifierChar(peekPtr(), AllowAtInIdentifier) || + peekPtr() == 'e' || peekPtr() == 'E') return LexFloatLiteral(); } - while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier)) + while (IsIdentifierChar(peekPtr(), AllowAtInIdentifier)) ++CurPtr; // Handle . as a special case. @@ -165,7 +183,7 @@ /// LexSlash: Slash: / /// C-Style Comment: /* ... */ AsmToken AsmLexer::LexSlash() { - switch (*CurPtr) { + switch (peekPtr()) { case '*': IsAtStartOfStatement = false; break; // C style comment. @@ -184,7 +202,7 @@ switch (*CurPtr++) { case '*': // End of the comment? - if (*CurPtr != '/') + if (peekPtr() != '/') break; // If we have a CommentConsumer, notify it about the comment. if (CommentConsumer) { @@ -232,24 +250,24 @@ StringRef(TokStart, CurPtr - 1 - TokStart)); } -static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { +static void SkipIgnoredIntegerSuffix(const char *&CurPtr, const char * endBuf) { // Skip ULL, UL, U, L and LL suffices. - if (CurPtr[0] == 'U') + if ((CurPtr < endBuf) && CurPtr[0] == 'U') ++CurPtr; - if (CurPtr[0] == 'L') + if ((CurPtr < endBuf) && CurPtr[0] == 'L') ++CurPtr; - if (CurPtr[0] == 'L') + if ((CurPtr < endBuf) && CurPtr[0] == 'L') ++CurPtr; } // Look ahead to search for first non-hex digit, if it's [hH], then we treat the // integer as a hexadecimal, possibly with leading zeroes. 
static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix, - bool LexHex) { + bool LexHex, const char * endBuf) { const char *FirstNonDec = nullptr; const char *LookAhead = CurPtr; while (true) { - if (isDigit(*LookAhead)) { + if ((LookAhead < endBuf) && isDigit(*LookAhead)) { ++LookAhead; } else { if (!FirstNonDec) @@ -286,18 +304,18 @@ AsmToken AsmLexer::LexDigit() { // MASM-flavor binary integer: [01]+[bB] // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH] - if (LexMasmIntegers && isdigit(CurPtr[-1])) { - const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? - CurPtr - 1 : nullptr; + if (LexMasmIntegers && isdigit(peekPtr(-1))) { + const char *FirstNonBinary = + (peekPtr(-1) != '0' && peekPtr(-1) != '1') ? CurPtr - 1 : nullptr; const char *OldCurPtr = CurPtr; - while (isHexDigit(*CurPtr)) { + while (isHexDigit(peekPtr())) { if (*CurPtr != '0' && *CurPtr != '1' && !FirstNonBinary) FirstNonBinary = CurPtr; ++CurPtr; } unsigned Radix = 0; - if (*CurPtr == 'h' || *CurPtr == 'H') { + if (peekPtr() == 'h' || peekPtr() == 'H') { // hexadecimal number ++CurPtr; Radix = 16; @@ -314,7 +332,7 @@ "invalid hexdecimal number"); // MSVC accepts and ignores type suffices on integer literals. - SkipIgnoredIntegerSuffix(CurPtr); + SkipIgnoredIntegerSuffix(CurPtr, CurBuf.end()); return intToken(Result, Value); } @@ -324,12 +342,12 @@ } // Decimal integer: [1-9][0-9]* - if (CurPtr[-1] != '0' || CurPtr[0] == '.') { - unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers); + if (peekPtr(-1) != '0' || peekPtr() == '.') { + unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers, CurBuf.end()); bool isHex = Radix == 16; // Check for floating point literals. - if (!isHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) { - if (*CurPtr == '.') + if (!isHex && (peekPtr() == '.' 
|| peekPtr() == 'e' || peekPtr() == 'E')) { + if (peekPtr() == '.') ++CurPtr; return LexFloatLiteral(); } @@ -347,21 +365,21 @@ // The darwin/x86 (and x86-64) assembler accepts and ignores type // suffices on integer literals. - SkipIgnoredIntegerSuffix(CurPtr); + SkipIgnoredIntegerSuffix(CurPtr, CurBuf.end()); return intToken(Result, Value); } - if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) { + if (!LexMasmIntegers && ((peekPtr() == 'b') || (peekPtr() == 'B'))) { ++CurPtr; // See if we actually have "0b" as part of something like "jmp 0b\n" - if (!isDigit(CurPtr[0])) { + if (!isDigit(peekPtr())) { --CurPtr; StringRef Result(TokStart, CurPtr - TokStart); return AsmToken(AsmToken::Integer, Result, 0); } const char *NumStart = CurPtr; - while (CurPtr[0] == '0' || CurPtr[0] == '1') + while (peekPtr() == '0' || peekPtr() == '1') ++CurPtr; // Requires at least one binary digit. @@ -376,20 +394,20 @@ // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. - SkipIgnoredIntegerSuffix(CurPtr); + SkipIgnoredIntegerSuffix(CurPtr, CurBuf.end()); return intToken(Result, Value); } - if ((*CurPtr == 'x') || (*CurPtr == 'X')) { + if ((peekPtr() == 'x') || (peekPtr() == 'X')) { ++CurPtr; const char *NumStart = CurPtr; - while (isHexDigit(CurPtr[0])) + while (isHexDigit(peekPtr())) ++CurPtr; // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be // diagnosed by LexHexFloatLiteral). - if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') + if (peekPtr() == '.' || peekPtr() == 'p' || peekPtr() == 'P') return LexHexFloatLiteral(NumStart == CurPtr); // Otherwise requires at least one hex digit. @@ -401,19 +419,19 @@ return ReturnError(TokStart, "invalid hexadecimal number"); // Consume the optional [hH]. 
- if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H')) + if (LexMasmIntegers && (peekPtr() == 'h' || peekPtr() == 'H')) ++CurPtr; // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. - SkipIgnoredIntegerSuffix(CurPtr); + SkipIgnoredIntegerSuffix(CurPtr, CurBuf.end()); return intToken(StringRef(TokStart, CurPtr - TokStart), Result); } // Either octal or hexadecimal. APInt Value(128, 0, true); - unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers); + unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers, CurBuf.end()); bool isHex = Radix == 16; StringRef Result(TokStart, CurPtr - TokStart); if (Result.getAsInteger(Radix, Value)) @@ -426,7 +444,7 @@ // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. - SkipIgnoredIntegerSuffix(CurPtr); + SkipIgnoredIntegerSuffix(CurPtr, CurBuf.end()); return intToken(Result, Value); } @@ -490,7 +508,7 @@ while (!isAtStartOfComment(CurPtr) && // Start of line comment. !isAtStatementSeparator(CurPtr) && // End of statement marker. 
- *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { + peekPtr() != '\n' && peekPtr() != '\r' && !isEOF()) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); @@ -499,7 +517,7 @@ StringRef AsmLexer::LexUntilEndOfLine() { TokStart = CurPtr; - while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) { + while (peekPtr() != '\n' && peekPtr() != '\r' && !isEOF()) { ++CurPtr; } return StringRef(TokStart, CurPtr-TokStart); @@ -532,7 +550,8 @@ bool AsmLexer::isAtStartOfComment(const char *Ptr) { StringRef CommentString = MAI.getCommentString(); - + if (Ptr >= CurBuf.end()) + return false; if (CommentString.size() == 1) return CommentString[0] == Ptr[0]; @@ -540,10 +559,14 @@ if (CommentString[1] == '#') return CommentString[0] == Ptr[0]; + if (CommentString.size() > size_t(CurBuf.end() - Ptr)) + return false; return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0; } bool AsmLexer::isAtStatementSeparator(const char *Ptr) { + if (Ptr >= CurBuf.end()) + return false; return strncmp(Ptr, MAI.getSeparatorString(), strlen(MAI.getSeparatorString())) == 0; } @@ -587,7 +610,7 @@ if (CurChar == EOF && !IsAtStartOfStatement) { IsAtStartOfLine = true; IsAtStartOfStatement = true; - return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); + return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0)); } IsAtStartOfLine = false; bool OldIsAtStartOfStatement = IsAtStartOfStatement; @@ -608,7 +631,7 @@ case ' ': case '\t': IsAtStartOfStatement = OldIsAtStartOfStatement; - while (*CurPtr == ' ' || *CurPtr == '\t') + while (peekPtr() == ' ' || peekPtr() == '\t' || peekPtr() == 0) CurPtr++; if (SkipSpace) return LexToken(); // Ignore whitespace. @@ -618,7 +641,7 @@ IsAtStartOfLine = true; IsAtStartOfStatement = true; // If this is a CR followed by LF, treat that as one token. 
- if (CurPtr != CurBuf.end() && *CurPtr == '\n') + if (!isEOF() && peekPtr() == '\n') ++CurPtr; return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, CurPtr - TokStart)); @@ -642,32 +665,32 @@ case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); case '=': - if (*CurPtr == '=') { + if (peekPtr() == '=') { ++CurPtr; return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); } return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); case '-': - if (*CurPtr == '>') { + if (peekPtr() == '>') { ++CurPtr; return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2)); } return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); case '|': - if (*CurPtr == '|') { + if (peekPtr() == '|') { ++CurPtr; return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); } return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); case '&': - if (*CurPtr == '&') { + if (peekPtr() == '&') { ++CurPtr; return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); } return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); case '!': - if (*CurPtr == '=') { + if (peekPtr() == '=') { ++CurPtr; return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); } @@ -722,7 +745,7 @@ case '5': case '6': case '7': case '8': case '9': return LexDigit(); case '<': - switch (*CurPtr) { + switch (peekPtr()) { case '<': ++CurPtr; return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2)); @@ -736,7 +759,7 @@ return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); } case '>': - switch (*CurPtr) { + switch (peekPtr()) { case '>': ++CurPtr; return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2)); Index: llvm/lib/MC/MCParser/AsmParser.cpp =================================================================== --- llvm/lib/MC/MCParser/AsmParser.cpp +++ llvm/lib/MC/MCParser/AsmParser.cpp @@ -808,7 +808,8 @@ Error(Lexer.getErrLoc(), 
Lexer.getErr()); // if it's a end of statement with a comment in it - if (getTok().is(AsmToken::EndOfStatement)) { + if (getTok().is(AsmToken::EndOfStatement) && + !(Lexer.tokOutOfBounds(getTok()))) { // if this is a line comment output it. if (!getTok().getString().empty() && getTok().getString().front() != '\n' && getTok().getString().front() != '\r' && MAI.preserveAsmComments())