Replace the lexer's AllowAtInIdentifier flag with configurable identifier
character sets.

Summary of the changes in this patch:

  * MCAsmLexer no longer stores AllowAtInIdentifier. It gains a virtual
    setIdentifierCharSet() (a no-op in the base class) and a pure virtual
    isIdentifierCharSetContains(). getAllowAtInIdentifier() and
    setAllowAtInIdentifier() are kept as thin wrappers for the '@' character.

  * AsmLexer keeps two 256-bit SmallBitVectors, one for characters that may
    start an identifier and one for characters that may follow. The defaults
    set in the constructor are [a-zA-Z._] for the prefix set and, in addition,
    [0-9$?] for the body set; '@' is added to the body set unless the target's
    comment string starts with '@'.

  * AsmLexer::LexToken() now performs the ".1243foo" identifier vs. floating
    literal disambiguation and produces the Dot token through its switch;
    AsmLexer::LexIdentifier() only consumes identifier body characters.

  * The <cctype> classification calls in AsmLexer.cpp are replaced by the
    file-local helpers isDigit() and isHexDigit().

Index: include/llvm/MC/MCParser/AsmLexer.h
===================================================================
--- include/llvm/MC/MCParser/AsmLexer.h
+++ include/llvm/MC/MCParser/AsmLexer.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_MC_MCPARSER_ASMLEXER_H
 #define LLVM_MC_MCPARSER_ASMLEXER_H
 
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/Support/DataTypes.h"
@@ -31,6 +32,11 @@
   StringRef CurBuf;
   bool isAtStartOfLine;
 
+  /// Characters that may start an identifier, indexed by unsigned char value.
+  SmallBitVector IdPrefixCharSet;
+  /// Characters that may follow the first character of an identifier.
+  SmallBitVector IdBodyCharSet;
+
   void operator=(const AsmLexer&) = delete;
   AsmLexer(const AsmLexer&) = delete;
 
@@ -59,6 +65,16 @@
   int getNextChar();
   AsmToken ReturnError(const char *Loc, const std::string &Msg);
 
+  /// Return true if \p c may start an identifier.
+  bool IsIdentifierPrefixChar(char c) const;
+  /// Return true if \p c may follow the first character of an identifier.
+  bool IsIdentifierBodyChar(char c) const;
+
+  void setIdentifierCharSet(bool Value,
+                            StringRef PfxCharSet,
+                            StringRef BodyCharSet) override;
+  bool isIdentifierCharSetContains(char) const override;
+
   AsmToken LexIdentifier();
   AsmToken LexSlash();
   AsmToken LexLineComment();
Index: include/llvm/MC/MCParser/MCAsmLexer.h
===================================================================
--- include/llvm/MC/MCParser/MCAsmLexer.h
+++ include/llvm/MC/MCParser/MCAsmLexer.h
@@ -129,7 +129,6 @@
 protected: // Can only create subclasses.
   const char *TokStart;
   bool SkipSpace;
-  bool AllowAtInIdentifier;
 
   MCAsmLexer();
 
@@ -208,8 +207,22 @@
   /// Set whether spaces should be ignored by the lexer
   void setSkipSpace(bool val) { SkipSpace = val; }
 
-  bool getAllowAtInIdentifier() { return AllowAtInIdentifier; }
-  void setAllowAtInIdentifier(bool v) { AllowAtInIdentifier = v; }
+  /// Allow (\p Value is true) or disallow (\p Value is false) each character
+  /// of \p PfxCharSet as the first character of an identifier, and each
+  /// character of \p BodyCharSet as a subsequent character. The default
+  /// implementation does nothing.
+  virtual void setIdentifierCharSet(bool Value,
+                                    StringRef PfxCharSet,
+                                    StringRef BodyCharSet);
+
+  /// Return true if the given character can be found in an identifier.
+  virtual bool isIdentifierCharSetContains(char) const = 0;
+
+  /// Convenience accessors for whether '@' is allowed in identifiers.
+  bool getAllowAtInIdentifier() const {
+    return isIdentifierCharSetContains('@');
+  }
+  void setAllowAtInIdentifier(bool v) { setIdentifierCharSet(v, "", "@"); }
 };
 
 } // End llvm namespace
Index: lib/MC/MCParser/AsmLexer.cpp
===================================================================
--- lib/MC/MCParser/AsmLexer.cpp
+++ lib/MC/MCParser/AsmLexer.cpp
@@ -15,21 +15,73 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SMLoc.h"
-#include <cctype>
 #include <cerrno>
 #include <cstdio>
 #include <cstdlib>
 using namespace llvm;
 
+// The standard isdigit()/isxdigit() are generally much slower than the simple
+// range checks below.
+static inline bool isDigit(char C) {
+  return C >= '0' && C <= '9';
+}
+
+static inline bool isHexDigit(char C) {
+  return isDigit(C) ||
+         (C >= 'A' && C <= 'F') ||
+         (C >= 'a' && C <= 'f');
+}
+
 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
   CurPtr = nullptr;
   isAtStartOfLine = true;
-  AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
+
+  IdPrefixCharSet.resize(256);
+  IdPrefixCharSet.set('a', 'z' + 1);
+  IdPrefixCharSet.set('A', 'Z' + 1);
+  IdPrefixCharSet.set('.');
+  IdPrefixCharSet.set('_');
+
+  IdBodyCharSet = IdPrefixCharSet;
+  IdBodyCharSet.set('0', '9' + 1);
+  IdBodyCharSet.set('$');
+  IdBodyCharSet.set('?');
+
+  if (!StringRef(MAI.getCommentString()).startswith("@"))
+    IdBodyCharSet.set('@');
 }
 
 AsmLexer::~AsmLexer() {
 }
 
+void AsmLexer::setIdentifierCharSet(bool Value,
+                                    StringRef PfxCharSet,
+                                    StringRef BodyCharSet) {
+  if (Value) {
+    for (char C : PfxCharSet)
+      IdPrefixCharSet.set(static_cast<unsigned char>(C));
+    for (char C : BodyCharSet)
+      IdBodyCharSet.set(static_cast<unsigned char>(C));
+  } else {
+    for (char C : PfxCharSet)
+      IdPrefixCharSet.reset(static_cast<unsigned char>(C));
+    for (char C : BodyCharSet)
+      IdBodyCharSet.reset(static_cast<unsigned char>(C));
+  }
+}
+
+inline bool AsmLexer::IsIdentifierPrefixChar(char C) const {
+  return IdPrefixCharSet.test(static_cast<unsigned char>(C));
+}
+
+inline bool AsmLexer::IsIdentifierBodyChar(char C) const {
+  return IdBodyCharSet.test(static_cast<unsigned char>(C));
+}
+
+bool AsmLexer::isIdentifierCharSetContains(char C) const {
+  return IsIdentifierBodyChar(C) || IsIdentifierPrefixChar(C);
+}
+
 void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
   CurBuf = Buf;
 
@@ -73,7 +125,7 @@
 /// consumed.
 AsmToken AsmLexer::LexFloatLiteral() {
   // Skip the fractional digit sequence.
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   // Check for exponent; we intentionally accept a slighlty wider set of
@@ -83,7 +135,7 @@
     ++CurPtr;
     if (*CurPtr == '-' || *CurPtr == '+')
       ++CurPtr;
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
       ++CurPtr;
   }
 
@@ -107,7 +159,7 @@
     ++CurPtr;
 
     const char *FracStart = CurPtr;
-    while (isxdigit(*CurPtr))
+    while (isHexDigit(*CurPtr))
       ++CurPtr;
 
     NoFracDigits = CurPtr == FracStart;
@@ -128,7 +180,7 @@
 
   // N.b. exponent digits are *not* hex
   const char *ExpStart = CurPtr;
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   if (CurPtr == ExpStart)
@@ -138,29 +190,10 @@
   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
 }
 
-/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
-static bool IsIdentifierChar(char c, bool AllowAt) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' ||
-         (c == '@' && AllowAt) || c == '?';
-}
 AsmToken AsmLexer::LexIdentifier() {
-  // Check for floating point literals.
-  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
-    // Disambiguate a .1243foo identifier from a floating literal.
-    while (isdigit(*CurPtr))
-      ++CurPtr;
-    if (*CurPtr == 'e' || *CurPtr == 'E' ||
-        !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
-      return LexFloatLiteral();
-  }
-
-  while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
+  assert(IsIdentifierPrefixChar(*TokStart));
+  while (IsIdentifierBodyChar(*CurPtr))
     ++CurPtr;
-
-  // Handle . as a special case.
-  if (CurPtr == TokStart+1 && TokStart[0] == '.')
-    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
-
   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
 }
 
@@ -220,9 +253,9 @@
   const char *FirstHex = nullptr;
   const char *LookAhead = CurPtr;
   while (1) {
-    if (isdigit(*LookAhead)) {
+    if (isDigit(*LookAhead)) {
       ++LookAhead;
-    } else if (isxdigit(*LookAhead)) {
+    } else if (isHexDigit(*LookAhead)) {
       if (!FirstHex)
         FirstHex = LookAhead;
       ++LookAhead;
@@ -283,7 +316,7 @@
   if (*CurPtr == 'b') {
     ++CurPtr;
     // See if we actually have "0b" as part of something like "jmp 0b\n"
-    if (!isdigit(CurPtr[0])) {
+    if (!isDigit(CurPtr[0])) {
       --CurPtr;
       StringRef Result(TokStart, CurPtr - TokStart);
       return AsmToken(AsmToken::Integer, Result, 0);
@@ -312,7 +345,7 @@
   if (*CurPtr == 'x') {
     ++CurPtr;
     const char *NumStart = CurPtr;
-    while (isxdigit(CurPtr[0]))
+    while (isHexDigit(CurPtr[0]))
       ++CurPtr;
 
     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
@@ -489,7 +522,7 @@
 AsmToken AsmLexer::LexToken() {
   TokStart = CurPtr;
   // This always consumes at least one character.
-  int CurChar = getNextChar();
+  const int CurChar = getNextChar();
 
   if (isAtStartOfComment(TokStart)) {
     // If this comment starts with a '#', then return the Hash token and let
@@ -514,11 +547,32 @@
   }
 
   isAtStartOfLine = false;
+
+  if (CurChar == '.' && isDigit(*CurPtr)) {
+    if (!IsIdentifierPrefixChar('.'))
+      return LexFloatLiteral();
+
+    const char *const SavePos = CurPtr;
+    // Disambiguate a .1243foo identifier from a floating literal.
+    while (isDigit(*CurPtr))
+      ++CurPtr;
+    if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierBodyChar(*CurPtr))
+      return LexFloatLiteral();
+    CurPtr = SavePos;
+  }
+
+  const AsmToken Id = (CurChar != EOF && IsIdentifierPrefixChar(CurChar))
+                          ? LexIdentifier()
+                          : AsmToken(AsmToken::Error, StringRef());
+
+  // A one-character identifier may be punctuation; let the switch decide.
+  if (Id.is(AsmToken::Identifier) && Id.getString().size() > 1)
+    return Id;
+
   switch (CurChar) {
   default:
-    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
-    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
-      return LexIdentifier();
+    if (Id.is(AsmToken::Identifier)) // single-character identifier
+      return Id;
 
     // Unknown character, emit an error.
     return ReturnError(TokStart, "invalid character in input");
@@ -541,6 +595,7 @@
   case '\r':
     isAtStartOfLine = true;
     return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
+  case '.': return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
   case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
   case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
   case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
Index: lib/MC/MCParser/MCAsmLexer.cpp
===================================================================
--- lib/MC/MCParser/MCAsmLexer.cpp
+++ lib/MC/MCParser/MCAsmLexer.cpp
@@ -19,6 +19,13 @@
 MCAsmLexer::~MCAsmLexer() {
 }
 
+// Default implementation: ignore the request. Lexers that keep a configurable
+// identifier character set override this.
+void MCAsmLexer::setIdentifierCharSet(bool Value,
+                                      StringRef PfxCharSet,
+                                      StringRef BodyCharSet) {
+}
+
 SMLoc MCAsmLexer::getLoc() const {
   return SMLoc::getFromPointer(TokStart);
 }