Index: include/llvm/Support/JSON.h =================================================================== --- include/llvm/Support/JSON.h +++ include/llvm/Support/JSON.h @@ -53,12 +53,37 @@ namespace llvm { namespace json { + +// === String encodings === +// +// JSON strings are character sequences (not byte sequences like std::string). +// We need to know the encoding, and for simplicity only support UTF-8. +// +// - When parsing, invalid UTF-8 is a syntax error like any other +// +// - When creating Values from strings, callers must ensure they are UTF-8. +// with asserts on, invalid UTF-8 will crash the program +// with asserts off, we'll substitute the replacement character (U+FFFD) +// Callers can use json::isUTF8() and json::fixUTF8() for validation. +// +// - When retrieving strings from Values (e.g. asString()), the result will +// always be valid UTF-8. + +/// Returns true if \p S is valid UTF-8, which is required for use as JSON. +/// If it returns false, \p ErrOffset (if non-null) is set to a byte offset near the first error. +bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr); +/// Replaces invalid UTF-8 sequences in \p S with the replacement character +/// (U+FFFD). The returned string is valid UTF-8. +/// This is much slower than isUTF8, so test that first. +std::string fixUTF8(llvm::StringRef S); + class Array; class ObjectKey; class Value; /// An Object is a JSON object, which maps strings to heterogenous JSON values. /// The string keys may be owned or references. +/// They must be valid UTF-8, validated by caller! See comment above. class Object : public std::map { public: explicit Object() {} @@ -200,16 +225,26 @@ Value(json::Object &&Properties) : Type(T_Object) { create(std::move(Properties)); } - // Strings: types with value semantics. 
- Value(std::string &&V) : Type(T_String) { create(std::move(V)); } - Value(const std::string &V) : Type(T_String) { create(V); } - Value(const llvm::SmallVectorImpl &V) : Type(T_String) { - create(V.begin(), V.end()); + // Strings: types with value semantics. Must be valid UTF-8. + Value(std::string V) : Type(T_String) { + if (!LLVM_LIKELY(isUTF8(V))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + V = fixUTF8(std::move(V)); + } + create(std::move(V)); } + Value(const llvm::SmallVectorImpl &V) + : Value(std::string(V.begin(), V.end())){}; Value(const llvm::formatv_object_base &V) : Value(V.str()){}; - // Strings: types with reference semantics. - Value(llvm::StringRef V) : Type(T_StringRef) { create(V); } - Value(const char *V) : Type(T_StringRef) { create(V); } + // Strings: types with reference semantics. Must be valid UTF-8. + Value(StringRef V) : Type(T_StringRef) { + create(V); + if (!LLVM_LIKELY(isUTF8(V))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + *this = Value(fixUTF8(V)); + } + } + Value(const char *V) : Value(StringRef(V)) {} Value(std::nullptr_t) : Type(T_Null) {} // Prevent implicit conversions to boolean. template &V) : ObjectKey(std::string(V.begin(), V.end())) {} ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {} Index: lib/Support/JSON.cpp =================================================================== --- lib/Support/JSON.cpp +++ lib/Support/JSON.cpp @@ -8,6 +8,7 @@ //===---------------------------------------------------------------------===// #include "llvm/Support/JSON.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Format.h" #include @@ -205,6 +206,14 @@ Parser(StringRef JSON) : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {} + bool checkUTF8() { + size_t ErrOffset; + if (isUTF8(StringRef(Start, End - Start), &ErrOffset)) + return true; + P = Start + ErrOffset; // For line/column calculation. 
+ return parseError("Invalid UTF-8 sequence"); + } + bool parseValue(Value &Out); bool assertEnd() { @@ -499,13 +508,46 @@ Expected parse(StringRef JSON) { Parser P(JSON); Value E = nullptr; - if (P.parseValue(E)) - if (P.assertEnd()) - return std::move(E); + if (P.checkUTF8()) + if (P.parseValue(E)) + if (P.assertEnd()) + return std::move(E); return P.takeError(); } char ParseError::ID = 0; +bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { + // Fast-path for ASCII, which is valid UTF-8. + for (unsigned char C : S) + if (LLVM_UNLIKELY(C & 0x80)) + goto not_ascii; + return true; + +not_ascii: + const UTF8 *Data = reinterpret_cast(S.data()), *Rest = Data; + bool OK = LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())); + if (!OK && ErrOffset) + *ErrOffset = Rest - Data; + return OK; +} + +std::string fixUTF8(llvm::StringRef S) { + // This isn't particularly efficient, but is only for error-recovery. + std::vector Codepoints(S.size()); // 1 codepoint per byte suffices. + const UTF8 *In8 = reinterpret_cast(S.data()); + UTF32 *Out32 = Codepoints.data(); + ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(), + lenientConversion); + Codepoints.resize(Out32 - Codepoints.data()); + std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice + const UTF32 *In32 = Codepoints.data(); + UTF8 *Out8 = reinterpret_cast(&Res[0]); + ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(), + strictConversion); + Res.resize(reinterpret_cast(Out8) - Res.data()); + return Res; +} + } // namespace json } // namespace llvm Index: unittests/Support/JSONTest.cpp =================================================================== --- unittests/Support/JSONTest.cpp +++ unittests/Support/JSONTest.cpp @@ -27,6 +27,14 @@ EXPECT_EQ(R"("foo")", s("foo")); EXPECT_EQ("[1,2,3]", s({1, 2, 3})); EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}})); + +#ifdef NDEBUG + EXPECT_EQ(R"("��")", s("\xC0\x80")); + 
EXPECT_EQ(R"({"��":0})", s(Object{{"\xC0\x80", 0}})); +#else + EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8"); + EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8"); +#endif } TEST(JSONTest, Constructors) { @@ -179,6 +187,31 @@ "valid": 1, invalid: 2 })"); + ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // Overlong 2-byte NUL (Modified UTF-8) +} + +// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere. +TEST(JSONTest, UTF8) { + for (const char *Valid : { + "this is ASCII text", + "thïs tëxt häs BMP chäräctërs", + "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃", + }) { + EXPECT_TRUE(isUTF8(Valid)) << Valid; + EXPECT_EQ(fixUTF8(Valid), Valid); + } + for (auto Invalid : std::vector>{ + {"lone trailing \x81\x82 bytes", "lone trailing �� bytes"}, + {"missing trailing \xD0 bytes", "missing trailing � bytes"}, + {"truncated character \xD0", "truncated character �"}, + {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding", + "not �� the ��� shortest ���� encoding"}, + {"too \xF9\x80\x80\x80\x80 long", "too ����� long"}, + {"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80", + "surrogate ��� invalid ����"}}) { + EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first; + EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second); + } } TEST(JSONTest, Inspection) {