diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h
--- a/llvm/include/llvm/MC/MCParser/AsmLexer.h
+++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h
@@ -56,6 +56,7 @@
bool isAtStartOfComment(const char *Ptr);
bool isAtStatementSeparator(const char *Ptr);
int getNextChar();
+ int peekNextChar();
AsmToken ReturnError(const char *Loc, const std::string &Msg);
AsmToken LexIdentifier();
diff --git a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
--- a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -51,6 +51,7 @@
bool IsAtStartOfStatement = true;
bool LexMasmHexFloats = false;
bool LexMasmIntegers = false;
+ bool LexMasmStrings = false;
bool UseMasmDefaultRadix = false;
unsigned DefaultRadix = 10;
AsmCommentConsumer *CommentConsumer = nullptr;
@@ -163,6 +164,10 @@
/// Set whether to lex masm-style hex float literals, such as 3f800000r.
void setLexMasmHexFloats(bool V) { LexMasmHexFloats = V; }
+
+ /// Set whether to lex masm-style string literals, such as 'Can''t find file'
+ /// and "This ""value"" not found". A doubled delimiter inside such a literal
+ /// is lexed as part of the string rather than as its terminator.
+ void setLexMasmStrings(bool V) { LexMasmStrings = V; }
};
} // end namespace llvm
diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -64,6 +64,12 @@
return (unsigned char)*CurPtr++;
}
+/// Return the character at the current lexing position without consuming it,
+/// or EOF when the end of the buffer has been reached.
+int AsmLexer::peekNextChar() {
+ const bool AtEnd = CurPtr == CurBuf.end();
+ return AtEnd ? EOF : (unsigned char)*CurPtr;
+}
+
/// The leading integral digit sequence and dot should have already been
/// consumed, some or all of the fractional digit sequence *can* have been
/// consumed.
@@ -521,6 +527,24 @@
AsmToken AsmLexer::LexSingleQuote() {
int CurChar = getNextChar();
+ if (LexMasmStrings) {
+ // MASM mode: scan forward to the closing single quote. Backslash escapes
+ // are not interpreted on this path; the only escape is a doubled quote.
+ // NOTE(review): a newline does not end the scan, so an unterminated literal
+ // is only diagnosed once EOF is reached - confirm this matches MASM.
+ while (CurChar != EOF) {
+ if (CurChar != '\'') {
+ CurChar = getNextChar();
+ } else if (peekNextChar() == '\'') {
+ // In MASM single-quoted strings, a doubled single quote denotes one
+ // escaped single quote: consume the second quote and keep scanning.
+ getNextChar();
+ CurChar = getNextChar();
+ } else {
+ break;
+ }
+ }
+ if (CurChar == EOF)
+ return ReturnError(TokStart, "unterminated string constant");
+ // The token text runs from TokStart up to CurPtr; doubled quotes are left
+ // in place here and are not collapsed by the lexer.
+ return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
+ }
+
if (CurChar == '\\')
CurChar = getNextChar();
@@ -555,6 +579,24 @@
/// LexQuote: String: "..."
AsmToken AsmLexer::LexQuote() {
int CurChar = getNextChar();
+ if (LexMasmStrings) {
+ // MASM mode: scan forward to the closing double quote. Backslash escapes
+ // are not interpreted on this path; the only escape is a doubled quote.
+ // NOTE(review): a newline does not end the scan, so an unterminated literal
+ // is only diagnosed once EOF is reached - confirm this matches MASM.
+ while (CurChar != EOF) {
+ if (CurChar != '"') {
+ CurChar = getNextChar();
+ } else if (peekNextChar() == '"') {
+ // In MASM double-quoted strings, a doubled double quote denotes one
+ // escaped double quote: consume the second quote and keep scanning.
+ getNextChar();
+ CurChar = getNextChar();
+ } else {
+ break;
+ }
+ }
+ if (CurChar == EOF)
+ return ReturnError(TokStart, "unterminated string constant");
+ // The token text runs from TokStart up to CurPtr; doubled quotes are left
+ // in place here and are not collapsed by the lexer.
+ return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
+ }
+
// TODO: does gas allow multiline string constants?
while (CurChar != '"') {
if (CurChar == '\\') {
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -3086,70 +3086,19 @@
return true;
Data = "";
+ char Quote = getTok().getString().front();
StringRef Str = getTok().getStringContents();
- for (unsigned i = 0, e = Str.size(); i != e; ++i) {
- if (Str[i] != '\\') {
- Data += Str[i];
- continue;
- }
-
- // Recognize escaped characters. Note that this escape semantics currently
- // loosely follows Darwin 'as'.
- ++i;
- if (i == e)
- return TokError("unexpected backslash at end of string");
-
- // Recognize hex sequences similarly to GNU 'as'.
- if (Str[i] == 'x' || Str[i] == 'X') {
- size_t length = Str.size();
- if (i + 1 >= length || !isHexDigit(Str[i + 1]))
- return TokError("invalid hexadecimal escape sequence");
-
- // Consume hex characters. GNU 'as' reads all hexadecimal characters and
- // then truncates to the lower 16 bits. Seems reasonable.
- unsigned Value = 0;
- while (i + 1 < length && isHexDigit(Str[i + 1]))
- Value = Value * 16 + hexDigitValue(Str[++i]);
-
- Data += (unsigned char)(Value & 0xFF);
- continue;
- }
-
- // Recognize octal sequences.
- if ((unsigned)(Str[i] - '0') <= 7) {
- // Consume up to three octal characters.
- unsigned Value = Str[i] - '0';
-
- if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
+ Data.reserve(Str.size());
+ // Copy the literal's contents into Data, collapsing each doubled delimiter
+ // into a single one. The index is a size_t so that it has the same type as
+ // the length it is compared against (no signed/unsigned comparison and no
+ // narrowing of the string length).
+ for (size_t i = 0, e = Str.size(); i != e; ++i) {
+ Data.push_back(Str[i]);
+ if (Str[i] == Quote) {
+ // MASM treats doubled delimiting quotes as an escaped delimiting quote.
+ // If we're escaping the string's trailing delimiter, we're definitely
+ // missing a quotation mark.
+ if (i + 1 == e)
+ return Error(getTok().getLoc(), "missing quotation mark in string");
+ if (Str[i + 1] == Quote)
++i;
- Value = Value * 8 + (Str[i] - '0');
-
- if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
- ++i;
- Value = Value * 8 + (Str[i] - '0');
- }
- }
-
- if (Value > 255)
- return TokError("invalid octal escape sequence (out of range)");
-
- Data += (unsigned char)Value;
- continue;
- }
-
- // Otherwise recognize individual escapes.
- switch (Str[i]) {
- default:
- // Just reject invalid escape sequences for now.
- return TokError("invalid escape sequence (unrecognized character)");
-
- case 'b': Data += '\b'; break;
- case 'f': Data += '\f'; break;
- case 'n': Data += '\n'; break;
- case 'r': Data += '\r'; break;
- case 't': Data += '\t'; break;
- case '"': Data += '"'; break;
- case '\\': Data += '\\'; break;
}
}
@@ -3220,7 +3169,9 @@
SmallVectorImpl<const MCExpr *> &Values,
unsigned StringPadLength) {
if (getTok().is(AsmToken::String)) {
- StringRef Value = getTok().getStringContents();
+ std::string Value;
+ if (parseEscapedString(Value))
+ return true;
if (Size == 1) {
// Treat each character as an initializer.
for (const char CharVal : Value)
@@ -3235,11 +3186,10 @@
return Error(getTok().getLoc(), "out of range literal value");
uint64_t IntValue = 0;
- for (const unsigned char CharVal : Value.bytes())
+ for (const unsigned char CharVal : Value)
IntValue = (IntValue << 8) | CharVal;
Values.push_back(MCConstantExpr::create(IntValue, getContext()));
}
- Lex();
} else {
const MCExpr *Value;
if (parseExpression(Value))
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1696,6 +1696,17 @@
case AsmToken::At:
case AsmToken::String:
case AsmToken::Identifier: {
+ if (Parser.isParsingMasm() && Tok.is(AsmToken::String)) {
+ // Single-character strings should be treated as integer constants. This
+ // includes MASM escapes for quotes: a literal whose contents are exactly
+ // two delimiter characters stands for the delimiter character itself.
+ const char Quote = Tok.getString().front();
+ const StringRef Contents = Tok.getStringContents();
+ const bool IsEscapedQuote =
+ Contents.size() == 2 && Contents[0] == Quote && Contents[1] == Quote;
+ if (Contents.size() == 1 || IsEscapedQuote) {
+ // Convert through unsigned char so that bytes >= 0x80 produce a value in
+ // [0, 255] rather than being sign-extended where plain char is signed.
+ const unsigned char CharVal = Contents.front();
+ if (SM.onInteger(CharVal, ErrMsg))
+ return Error(Tok.getLoc(), ErrMsg);
+ break;
+ }
+ }
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
UpdateLocLex = false;
diff --git a/llvm/test/tools/llvm-ml/strings.test b/llvm/test/tools/llvm-ml/strings.test
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-ml/strings.test
@@ -0,0 +1,90 @@
+# RUN: llvm-ml -filetype=asm %s | FileCheck %s
+
+.data
+
+dq_single_character BYTE "a"
+; CHECK-LABEL: dq_single_character:
+; CHECK-NEXT: .byte 97
+; CHECK-NOT: .byte
+
+dq_join BYTE "ab", "cd"
+; CHECK-LABEL: dq_join:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+dq_quote_escape BYTE "ab""""cd"
+; Intended result: ab""cd
+; CHECK-LABEL: dq_quote_escape:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+dq_single_quote BYTE "ab''''cd"
+; Intended result: ab''''cd
+; CHECK-LABEL: dq_single_quote:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+sq_single_character BYTE 'a'
+; CHECK-LABEL: sq_single_character:
+; CHECK-NEXT: .byte 97
+; CHECK-NOT: .byte
+
+sq_join BYTE 'ab', 'cd'
+; CHECK-LABEL: sq_join:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+sq_quote_escape BYTE 'ab''''cd'
+; Intended result: ab''cd
+; CHECK-LABEL: sq_quote_escape:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+sq_double_quote BYTE 'ab""""cd'
+; Intended result: ab""""cd
+; CHECK-LABEL: sq_double_quote:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+mixed_quotes_join BYTE "a'b", 'c"d'
+; Intended result: a'bc"d
+; CHECK-LABEL: mixed_quotes_join:
+; CHECK-NEXT: .byte 97
+; CHECK-NEXT: .byte 39
+; CHECK-NEXT: .byte 98
+; CHECK-NEXT: .byte 99
+; CHECK-NEXT: .byte 34
+; CHECK-NEXT: .byte 100
+; CHECK-NOT: .byte
+
+end
diff --git a/llvm/test/tools/llvm-ml/struct.test b/llvm/test/tools/llvm-ml/struct.test
--- a/llvm/test/tools/llvm-ml/struct.test
+++ b/llvm/test/tools/llvm-ml/struct.test
@@ -46,7 +46,7 @@
; CHECK-NEXT: .byte 101
; CHECK-NEXT: .zero 1
-t2 FOOBAR <"gh",,<10,11>,<12>,"ijk">
+t2 FOOBAR <"gh",,<10,11>,<12>,'ijk'>
; CHECK: t2:
;
diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp
--- a/llvm/tools/llvm-ml/llvm-ml.cpp
+++ b/llvm/tools/llvm-ml/llvm-ml.cpp
@@ -184,6 +184,7 @@
Lexer.setLexMasmIntegers(true);
Lexer.useMasmDefaultRadix(true);
Lexer.setLexMasmHexFloats(true);
+ Lexer.setLexMasmStrings(true);
bool Error = false;
while (Lexer.Lex().isNot(AsmToken::Eof)) {
@@ -216,6 +217,7 @@
Parser->getLexer().setLexMasmIntegers(true);
Parser->getLexer().useMasmDefaultRadix(true);
Parser->getLexer().setLexMasmHexFloats(true);
+ Parser->getLexer().setLexMasmStrings(true);
int Res = Parser->Run(/*NoInitialTextSection=*/true);