Index: include/llvm/Support/JSON.h =================================================================== --- include/llvm/Support/JSON.h +++ include/llvm/Support/JSON.h @@ -53,12 +53,37 @@ namespace llvm { namespace json { + +// === String encodings === +// +// JSON strings are character sequences (not byte sequences like std::string). +// We need to know the encoding, and for simplicity only support UTF-8. +// +// - When parsing, invalid UTF-8 is a syntax error like any other. +// +// - When creating Values from strings, callers must ensure they are UTF-8. +// With asserts on, invalid UTF-8 will crash the program. +// With asserts off, we'll substitute the replacement character (U+FFFD). +// Callers can use json::isUTF8() and json::fixUTF8() for validation. +// +// - When retrieving strings from Values (e.g. asString()), the result will +// always be valid UTF-8. + +/// Returns true if \p S is valid UTF-8, which is required for use as JSON. +/// Doesn't check for invalid codepoints, which don't cause problems for us. +/// On failure, \p ErrOffset is set to a byte offset near the first error. +bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr); +/// Replaces invalid UTF-8 sequences in \p S with the replacement character +/// (U+FFFD). The returned string is valid UTF-8. +std::string fixUTF8(llvm::StringRef S); + class Array; class ObjectKey; class Value; /// An Object is a JSON object, which maps strings to heterogenous JSON values. /// The string keys may be owned or references. +/// They must be valid UTF-8, validated by caller! See comment above. class Object : public std::map<ObjectKey, Value> { public: explicit Object() {} @@ -179,6 +204,7 @@ /// And parsed: /// Expected<Value> E = json::parse("[1, 2, null]"); /// assert(E && E->kind() == Value::Array); +/// class Value { public: enum Kind { @@ -200,16 +226,26 @@ Value(json::Object &&Properties) : Type(T_Object) { create(std::move(Properties)); } - // Strings: types with value semantics.
- Value(std::string &&V) : Type(T_String) { create(std::move(V)); } - Value(const std::string &V) : Type(T_String) { create(V); } - Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) { - create(V.begin(), V.end()); + // Strings: types with value semantics. Must be valid UTF-8. + Value(std::string V) : Type(T_String) { + if (!LLVM_LIKELY(isUTF8(V))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + V = fixUTF8(std::move(V)); + } + create(std::move(V)); } + Value(const llvm::SmallVectorImpl<char> &V) + : Value(std::string(V.begin(), V.end())){}; Value(const llvm::formatv_object_base &V) : Value(V.str()){}; - // Strings: types with reference semantics. - Value(llvm::StringRef V) : Type(T_StringRef) { create(V); } - Value(const char *V) : Type(T_StringRef) { create(V); } + // Strings: types with reference semantics. Must be valid UTF-8. + Value(StringRef V) : Type(T_StringRef) { + create(V); + if (!LLVM_LIKELY(isUTF8(V))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + *this = Value(fixUTF8(V)); + } + } + Value(const char *V) : Value(StringRef(V)) {} Value(std::nullptr_t) : Type(T_Null) {} // Prevent implicit conversions to boolean. template &V) : ObjectKey(std::string(V.begin(), V.end())) {} ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {} @@ -538,6 +584,7 @@ return llvm::inconvertibleErrorCode(); } }; + } // namespace json /// Allow printing json::Value with formatv(). Index: lib/Support/JSON.cpp =================================================================== --- lib/Support/JSON.cpp +++ lib/Support/JSON.cpp @@ -205,6 +205,14 @@ Parser(StringRef JSON) : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {} + bool checkUTF8() { + size_t ErrOffset; + if (isUTF8(StringRef(Start, End - Start), &ErrOffset)) + return true; + P = Start + ErrOffset; // For line/column calculation.
+ return parseError("Invalid UTF-8 sequence"); + } + bool parseValue(Value &Out); bool assertEnd() { @@ -499,13 +507,90 @@ Expected<Value> parse(StringRef JSON) { Parser P(JSON); Value E = nullptr; - if (P.parseValue(E)) - if (P.assertEnd()) - return std::move(E); + if (P.checkUTF8()) + if (P.parseValue(E)) + if (P.assertEnd()) + return std::move(E); return P.takeError(); } char ParseError::ID = 0; +// Advances I past the character starting at S[I]. +// Returns its length, or 0 if invalid (I still advances past the bad bytes). +LLVM_ATTRIBUTE_ALWAYS_INLINE +static int measureChar(llvm::StringRef S, size_t &I) { + unsigned char C = S[I++]; + if (LLVM_LIKELY(!(C & 0x80))) // ASCII + return 1; + + auto EatTrailing = [&]() -> unsigned char { + if (LLVM_LIKELY(I < S.size() && ((unsigned char)S[I] & 0xC0) == 0x80)) + return S[I++]; + return 0; + }; + // Multi-byte character length is encoded in the leading ones of C. + // We'll return if valid and break if invalid. + switch (countLeadingOnes(C)) { + case 1: // 10xxxxxx is a continuation byte. + break; + case 2: // 110xxxxx 10xxxxxx. + // U+80 = C2 80 is the first two-byte character. + if (C < 0xC2 || !EatTrailing()) + break; + return 2; + case 3: // 1110xxxx 10xxxxxx 10xxxxxx. + // U+800 = E0 A0 80 is the first three-byte character. + if (C == 0xE0) { + if (EatTrailing() < 0xA0 || !EatTrailing()) + break; + } else if (!EatTrailing() || !EatTrailing()) { + break; + } + return 3; + case 4: // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. + // U+10000 = F0 90 80 80 is the first four-byte character. + if (C == 0xF0) { + if (EatTrailing() < 0x90 || !EatTrailing() || !EatTrailing()) + break; + } else if (!EatTrailing() || !EatTrailing() || !EatTrailing()) { + break; + } + return 4; + default: // 11111xxx is invalid. + break; + } + // If we fell off the end of the switch, the character is invalid. + while (EatTrailing()) // Advance to the next plausible character.
+ ; + return 0; +} + +bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { + // We could optimize for the all-ASCII case, but this already compiles down + // to a tight loop in that case. + for (size_t I = 0; I < S.size();) + if (!measureChar(S, I)) { + if (ErrOffset) + *ErrOffset = I - 1; // I points *after* the error, rewind into range. + return false; + } + return true; +} + +std::string fixUTF8(llvm::StringRef S) { + std::string Result; + for (size_t I = 0; I < S.size();) + if (int Len = measureChar(S, I)) + Result.append(S.data() + I - Len, Len); + else { + // UTF-8 encoded replacement character (U+FFFD). + Result.push_back(0xef); + Result.push_back(0xbf); + Result.push_back(0xbd); + } + return Result; +} + } // namespace json } // namespace llvm Index: unittests/Support/JSONTest.cpp =================================================================== --- unittests/Support/JSONTest.cpp +++ unittests/Support/JSONTest.cpp @@ -27,6 +27,14 @@ EXPECT_EQ(R"("foo")", s("foo")); EXPECT_EQ("[1,2,3]", s({1, 2, 3})); EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}})); + +#ifdef NDEBUG + EXPECT_EQ(R"("�")", s("\xC0\x80")); + EXPECT_EQ(R"({"�":0})", s(Object{{"\xC0\x80", 0}})); +#else + EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8"); + EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8"); +#endif } TEST(JSONTest, Constructors) { @@ -179,6 +187,31 @@ "valid": 1, invalid: 2 })"); + ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // Modified UTF-8 null +} + +// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
+TEST(JSONTest, UTF8) { + for (const char *Valid : { + "this is ASCII text", + "thïs tëxt häs BMP chäräctërs", + "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃", + "invalid \xED\xA0\x80 codepoints \xF4\x90\x80\x80 not checked", + }) { + EXPECT_TRUE(isUTF8(Valid)) << Valid; + EXPECT_EQ(fixUTF8(Valid), Valid); // Valid input is left untouched. + } + for (auto Invalid : std::vector<std::pair<const char *, const char *>>{ + {"lone trailing \x81\x82 bytes", "lone trailing � bytes"}, + {"missing trailing \xD0 bytes", "missing trailing � bytes"}, + {"truncated character \xD0", "truncated character �"}, + {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding", + "not � the � shortest � encoding"}, + {"too \xF9\x80\x80\x80\x80 long", "too � long"}, + }) { + EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first; + EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second); // U+FFFD per bad run. + } } TEST(JSONTest, Inspection) {