diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -56,6 +56,7 @@ bool isAtStartOfComment(const char *Ptr); bool isAtStatementSeparator(const char *Ptr); int getNextChar(); + int peekNextChar(); AsmToken ReturnError(const char *Loc, const std::string &Msg); AsmToken LexIdentifier(); diff --git a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h --- a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h @@ -51,6 +51,7 @@ bool IsAtStartOfStatement = true; bool LexMasmHexFloats = false; bool LexMasmIntegers = false; + bool LexMasmStrings = false; bool UseMasmDefaultRadix = false; unsigned DefaultRadix = 10; AsmCommentConsumer *CommentConsumer = nullptr; @@ -163,6 +164,10 @@ /// Set whether to lex masm-style hex float literals, such as 3f800000r. void setLexMasmHexFloats(bool V) { LexMasmHexFloats = V; } + + /// Set whether to lex masm-style string literals, such as 'Can''t find file' + /// and "This ""value"" not found", where a doubled delimiter is an escape. + void setLexMasmStrings(bool V) { LexMasmStrings = V; } }; } // end namespace llvm diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -64,6 +64,12 @@ return (unsigned char)*CurPtr++; } +int AsmLexer::peekNextChar() { + if (CurPtr == CurBuf.end()) + return EOF; + return (unsigned char)*CurPtr; +} + /// The leading integral digit sequence and dot should have already been /// consumed, some or all of the fractional digit sequence *can* have been /// consumed. 
@@ -521,6 +527,24 @@ AsmToken AsmLexer::LexSingleQuote() { int CurChar = getNextChar(); + if (LexMasmStrings) { + while (CurChar != EOF) { + if (CurChar != '\'') { + CurChar = getNextChar(); + } else if (peekNextChar() == '\'') { + // In MASM single-quoted strings, a doubled single quote is an escaped + // single quote: skip the second quote and keep lexing the string. + getNextChar(); + CurChar = getNextChar(); + } else { + break; + } + } + if (CurChar == EOF) + return ReturnError(TokStart, "unterminated string constant"); + return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); + } + if (CurChar == '\\') CurChar = getNextChar(); @@ -555,6 +579,24 @@ /// LexQuote: String: "..." AsmToken AsmLexer::LexQuote() { int CurChar = getNextChar(); + if (LexMasmStrings) { + while (CurChar != EOF) { + if (CurChar != '"') { + CurChar = getNextChar(); + } else if (peekNextChar() == '"') { + // In MASM double-quoted strings, a doubled double quote is an escaped + // double quote: skip the second quote and keep lexing the string. + getNextChar(); + CurChar = getNextChar(); + } else { + break; + } + } + if (CurChar == EOF) + return ReturnError(TokStart, "unterminated string constant"); + return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); + } + // TODO: does gas allow multiline string constants? while (CurChar != '"') { if (CurChar == '\\') { diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -3086,70 +3086,19 @@ return true; Data = ""; + char Quote = getTok().getString().front(); StringRef Str = getTok().getStringContents(); - for (unsigned i = 0, e = Str.size(); i != e; ++i) { - if (Str[i] != '\\') { - Data += Str[i]; - continue; - } - - // Recognize escaped characters. Note that this escape semantics currently - // loosely follows Darwin 'as'. 
- ++i; - if (i == e) - return TokError("unexpected backslash at end of string"); - - // Recognize hex sequences similarly to GNU 'as'. - if (Str[i] == 'x' || Str[i] == 'X') { - size_t length = Str.size(); - if (i + 1 >= length || !isHexDigit(Str[i + 1])) - return TokError("invalid hexadecimal escape sequence"); - - // Consume hex characters. GNU 'as' reads all hexadecimal characters and - // then truncates to the lower 16 bits. Seems reasonable. - unsigned Value = 0; - while (i + 1 < length && isHexDigit(Str[i + 1])) - Value = Value * 16 + hexDigitValue(Str[++i]); - - Data += (unsigned char)(Value & 0xFF); - continue; - } - - // Recognize octal sequences. - if ((unsigned)(Str[i] - '0') <= 7) { - // Consume up to three octal characters. - unsigned Value = Str[i] - '0'; - - if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) { + Data.reserve(Str.size()); + for (size_t i = 0, e = Str.size(); i != e; ++i) { + Data.push_back(Str[i]); + if (Str[i] == Quote) { + // MASM treats doubled delimiting quotes as an escaped delimiting quote. + // If we're escaping the string's trailing delimiter, we're definitely + // missing a quotation mark. + if (i + 1 == e) + return Error(getTok().getLoc(), "missing quotation mark in string"); + if (Str[i + 1] == Quote) ++i; - Value = Value * 8 + (Str[i] - '0'); - - if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) { - ++i; - Value = Value * 8 + (Str[i] - '0'); - } - } - - if (Value > 255) - return TokError("invalid octal escape sequence (out of range)"); - - Data += (unsigned char)Value; - continue; - } - - // Otherwise recognize individual escapes. - switch (Str[i]) { - default: - // Just reject invalid escape sequences for now. 
- return TokError("invalid escape sequence (unrecognized character)"); - - case 'b': Data += '\b'; break; - case 'f': Data += '\f'; break; - case 'n': Data += '\n'; break; - case 'r': Data += '\r'; break; - case 't': Data += '\t'; break; - case '"': Data += '"'; break; - case '\\': Data += '\\'; break; } } @@ -3220,7 +3169,9 @@ SmallVectorImpl &Values, unsigned StringPadLength) { if (getTok().is(AsmToken::String)) { - StringRef Value = getTok().getStringContents(); + std::string Value; + if (parseEscapedString(Value)) + return true; if (Size == 1) { // Treat each character as an initializer. for (const char CharVal : Value) @@ -3235,11 +3186,10 @@ return Error(getTok().getLoc(), "out of range literal value"); uint64_t IntValue = 0; - for (const unsigned char CharVal : Value.bytes()) + for (const unsigned char CharVal : Value) IntValue = (IntValue << 8) | CharVal; Values.push_back(MCConstantExpr::create(IntValue, getContext())); } - Lex(); } else { const MCExpr *Value; if (parseExpression(Value)) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1696,6 +1696,17 @@ case AsmToken::At: case AsmToken::String: case AsmToken::Identifier: { + if (Parser.isParsingMasm() && Tok.is(AsmToken::String)) { + // Single-character strings (including an escaped quote) are integer + // constants; take the byte as unsigned so chars >= 0x80 stay positive. 
+ char Quote = Tok.getString().front(); + StringRef Contents = Tok.getStringContents(); + if (Contents.size() == 1 || Contents == std::string(2, Quote)) { + if (SM.onInteger((unsigned char)Contents.front(), ErrMsg)) + return Error(Tok.getLoc(), ErrMsg); + break; + } + } SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); UpdateLocLex = false; diff --git a/llvm/test/tools/llvm-ml/strings.test b/llvm/test/tools/llvm-ml/strings.test new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-ml/strings.test @@ -0,0 +1,110 @@ +# RUN: llvm-ml -filetype=asm %s | FileCheck %s + +.data + +dq_single_character BYTE "a" +; CHECK-LABEL: dq_single_character: +; CHECK-NEXT: .byte 97 +; CHECK-NOT: .byte + +dq_join BYTE "ab", "cd" +; CHECK-LABEL: dq_join: +; CHECK-NEXT: .byte 97 +; CHECK-NEXT: .byte 98 +; CHECK-NEXT: .byte 99 +; CHECK-NEXT: .byte 100 +; CHECK-NOT: .byte + +dq_quote_escape BYTE "ab""""cd" +; Intended result: ab""cd +; CHECK-LABEL: dq_quote_escape: +; CHECK-NEXT: .byte 97 +; CHECK-NEXT: .byte 98 +; CHECK-NEXT: .byte 34 +; CHECK-NEXT: .byte 34 +; CHECK-NEXT: .byte 99 +; CHECK-NEXT: .byte 100 +; CHECK-NOT: .byte + +dq_single_quote BYTE "ab''''cd" +; Intended result: ab''''cd +; CHECK-LABEL: dq_single_quote: +; CHECK-NEXT: .byte 97 +; CHECK-NEXT: .byte 98 +; CHECK-NEXT: .byte 39 +; CHECK-NEXT: .byte 39 +; CHECK-NEXT: .byte 39 +; CHECK-NEXT: .byte 39 +; CHECK-NEXT: .byte 99 +; CHECK-NEXT: .byte 100 +; CHECK-NOT: .byte + +sq_single_character BYTE 'a' +; CHECK-LABEL: sq_single_character: +; CHECK-NEXT: .byte 97 +; CHECK-NOT: .byte + +sq_join BYTE 'ab', 'cd' +; CHECK-LABEL: sq_join: +; CHECK-NEXT: .byte 97 +; CHECK-NEXT: .byte 98 +; CHECK-NEXT: .byte 99 +; CHECK-NEXT: .byte 100 +; CHECK-NOT: .byte + +sq_quote_escape BYTE 'ab''''cd' +; Intended result: ab''cd +; CHECK-LABEL: sq_quote_escape: +; CHECK-NEXT: .byte 97 +; CHECK-NEXT: .byte 98 +; CHECK-NEXT: .byte 39 +; CHECK-NEXT: .byte 39 +; CHECK-NEXT: .byte 99 +; CHECK-NEXT: .byte 100 +; CHECK-NOT: .byte + 
+sq_double_quote BYTE 'ab""""cd' +; Intended result: ab""""cd +; CHECK-LABEL: sq_double_quote: +; CHECK-NEXT: .byte 97 +; CHECK-NEXT: .byte 98 +; CHECK-NEXT: .byte 34 +; CHECK-NEXT: .byte 34 +; CHECK-NEXT: .byte 34 +; CHECK-NEXT: .byte 34 +; CHECK-NEXT: .byte 99 +; CHECK-NEXT: .byte 100 +; CHECK-NOT: .byte + +mixed_quotes_join BYTE "a'b", 'c"d' +; Intended result: a'bc"d +; CHECK-LABEL: mixed_quotes_join: +; CHECK-NEXT: .byte 97 +; CHECK-NEXT: .byte 39 +; CHECK-NEXT: .byte 98 +; CHECK-NEXT: .byte 99 +; CHECK-NEXT: .byte 34 +; CHECK-NEXT: .byte 100 +; CHECK-NOT: .byte + +.code + +sq_char_test PROC +; CHECK-LABEL: sq_char_test: + + mov eax, 'a' +; CHECK: mov eax, 97 + + ret +sq_char_test ENDP + +dq_char_test PROC +; CHECK-LABEL: dq_char_test: + + mov eax, "b" +; CHECK: mov eax, 98 + + ret +dq_char_test ENDP + +end diff --git a/llvm/test/tools/llvm-ml/struct.test b/llvm/test/tools/llvm-ml/struct.test --- a/llvm/test/tools/llvm-ml/struct.test +++ b/llvm/test/tools/llvm-ml/struct.test @@ -46,7 +46,7 @@ ; CHECK-NEXT: .byte 101 ; CHECK-NEXT: .zero 1 -t2 FOOBAR <"gh",,<10,11>,<12>,"ijk"> +t2 FOOBAR <"gh",,<10,11>,<12>,'ijk'> ; CHECK: t2: ; diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp --- a/llvm/tools/llvm-ml/llvm-ml.cpp +++ b/llvm/tools/llvm-ml/llvm-ml.cpp @@ -184,6 +184,7 @@ Lexer.setLexMasmIntegers(true); Lexer.useMasmDefaultRadix(true); Lexer.setLexMasmHexFloats(true); + Lexer.setLexMasmStrings(true); bool Error = false; while (Lexer.Lex().isNot(AsmToken::Eof)) { @@ -216,6 +217,7 @@ Parser->getLexer().setLexMasmIntegers(true); Parser->getLexer().useMasmDefaultRadix(true); Parser->getLexer().setLexMasmHexFloats(true); + Parser->getLexer().setLexMasmStrings(true); int Res = Parser->Run(/*NoInitialTextSection=*/true);