Index: llvm/trunk/include/llvm/Support/YAMLParser.h =================================================================== --- llvm/trunk/include/llvm/Support/YAMLParser.h +++ llvm/trunk/include/llvm/Support/YAMLParser.h @@ -73,8 +73,11 @@ /// \returns true if there was an error, false otherwise. bool scanTokens(StringRef Input); -/// \brief Escape \a Input for a double quoted scalar. -std::string escape(StringRef Input); +/// \brief Escape \a Input for a double quoted scalar. If \p EscapePrintable +/// is true, all UTF-8 sequences are escaped; if it is false, UTF-8 sequences +/// that encode printable Unicode scalars are emitted verbatim instead of +/// being escaped. The default (true) keeps the former one-argument behavior. +std::string escape(StringRef Input, bool EscapePrintable = true); /// \brief This class represents a YAML stream potentially containing multiple /// documents. Index: llvm/trunk/lib/Support/YAMLParser.cpp =================================================================== --- llvm/trunk/lib/Support/YAMLParser.cpp +++ llvm/trunk/lib/Support/YAMLParser.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/Unicode.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -687,7 +688,7 @@ return true; } -std::string yaml::escape(StringRef Input) { +std::string yaml::escape(StringRef Input, bool EscapePrintable) { std::string EscapedInput; for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { if (*i == '\\') @@ -734,6 +735,9 @@ EscapedInput += "\\L"; else if (UnicodeScalarValue.first == 0x2029) EscapedInput += "\\P"; + else if (!EscapePrintable && + sys::unicode::isPrintable(UnicodeScalarValue.first)) + EscapedInput += StringRef(i, UnicodeScalarValue.second); else { std::string HexStr = utohexstr(UnicodeScalarValue.first); if (HexStr.size() <= 2) Index: llvm/trunk/lib/Support/YAMLTraits.cpp =================================================================== --- 
llvm/trunk/lib/Support/YAMLTraits.cpp +++ llvm/trunk/lib/Support/YAMLTraits.cpp @@ -638,39 +638,22 @@ const char *Base = S.data(); const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\""; - const char QuoteChar = MustQuote == QuotingType::Single ? '\'' : '"'; - output(Quote); // Starting quote. - // When using single-quoted strings, any single quote ' must be doubled to be - // escaped. - // When using double-quoted strings, print \x + hex for non-printable ASCII - // characters, and escape double quotes. - while (j < End) { - if (S[j] == QuoteChar) { // Escape quotes. - output(StringRef(&Base[i], j - i)); // "flush". - if (MustQuote == QuotingType::Double) { // Print it as \" - output(StringLiteral("\\")); - output(StringRef(Quote, 1)); - } else { // Single - output(StringLiteral("''")); // Print it as '' - } - i = j + 1; - } else if (MustQuote == QuotingType::Double && - !sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) { - // If we're double quoting non-printable characters, we prefer printing - // them as "\x" + their hex representation. Note that special casing is - // needed for UTF-8, where a byte may be part of a UTF-8 sequence and - // appear as non-printable, in which case we want to print the correct - // unicode character and not its hex representation. - output(StringRef(&Base[i], j - i)); // "flush" - output(StringLiteral("\\x")); - - // Output the byte 0x0F as \x0f. - auto FormattedHex = format_hex_no_prefix(S[j], 2); - Out << FormattedHex; - Column += 4; // one for the '\', one for the 'x', and two for the hex + // When double-quoting (and only then), non-printable characters may be + // present; yaml::escape emits them as unicode-scalar or short-form escapes. + // Pass S itself: its length is known and S.data() may lack a trailing NUL. 
+ if (MustQuote == QuotingType::Double) { + output(yaml::escape(S, /* EscapePrintable= */ false)); + this->outputUpToEndOfLine(Quote); + return; + } + // When using single-quoted strings, any single quote ' must be doubled to be escaped. + while (j < End) { + if (S[j] == '\'') { // Escape quotes. + output(StringRef(&Base[i], j - i)); // "flush". + output(StringLiteral("''")); // Print it as '' i = j + 1; } ++j; Index: llvm/trunk/unittests/Support/YAMLIOTest.cpp =================================================================== --- llvm/trunk/unittests/Support/YAMLIOTest.cpp +++ llvm/trunk/unittests/Support/YAMLIOTest.cpp @@ -2464,7 +2464,10 @@ yamlize(xout, Input, true, Ctx); ostr.flush(); - EXPECT_EQ(Expected, out); + + // Make a separate StringRef so we get nice byte-by-byte output. + llvm::StringRef Got(out); + EXPECT_EQ(Expected, Got); } TEST(YAMLIO, TestEscaped) { @@ -2485,4 +2488,17 @@ // UTF8 with single quote inside double quote TestEscaped("parameter 'параметр' is unused", "\"parameter 'параметр' is unused\""); + + // String with embedded non-printable multibyte UTF-8 sequence (U+200B + // zero-width space). The thing to test here is that we emit a + // unicode-scalar level escape like \uNNNN (at the YAML level), and don't + // just pass the UTF-8 byte sequence through as with quoted printables. + TestEscaped("foo\u200Bbar", "\"foo\\u200Bbar\""); + { + const unsigned char foobar[10] = {'f', 'o', 'o', + 0xE2, 0x80, 0x8B, // UTF-8 of U+200B + 'b', 'a', 'r', + 0x0}; + TestEscaped((char const *)foobar, "\"foo\\u200Bbar\""); + } }