Index: include/llvm/ADT/StringExtras.h =================================================================== --- include/llvm/ADT/StringExtras.h +++ include/llvm/ADT/StringExtras.h @@ -88,6 +88,17 @@ /// lowercase letter as classified by "C" locale. inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); } +/// Checks whether character \p C is valid ASCII (high bit is zero). +inline bool isASCII(char C) { return static_cast<unsigned char>(C) <= 127; } + +/// Checks whether all characters in \p S are ASCII. +inline bool isASCII(llvm::StringRef S) { + for (char C : S) + if (LLVM_UNLIKELY(!isASCII(C))) + return false; + return true; +} + /// Returns the corresponding lowercase character if \p x is uppercase. inline char toLower(char x) { if (x >= 'A' && x <= 'Z') Index: include/llvm/Support/JSON.h =================================================================== --- include/llvm/Support/JSON.h +++ include/llvm/Support/JSON.h @@ -54,6 +54,30 @@ namespace llvm { namespace json { + +// === String encodings === +// +// JSON strings are character sequences (not byte sequences like std::string). +// We need to know the encoding, and for simplicity only support UTF-8. +// +// - When parsing, invalid UTF-8 is a syntax error like any other +// +// - When creating Values from strings, callers must ensure they are UTF-8. +// with asserts on, invalid UTF-8 will crash the program +// with asserts off, we'll substitute the replacement character (U+FFFD) +// Callers can use json::isUTF8() and json::fixUTF8() for validation. +// +// - When retrieving strings from Values (e.g. asString()), the result will +// always be valid UTF-8. + +/// Returns true if \p S is valid UTF-8, which is required for use as JSON. +/// On failure, a non-null \p ErrOffset gets a byte offset near the first error. +bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr); +/// Replaces invalid UTF-8 sequences in \p S with the replacement character +/// (U+FFFD). The returned string is valid UTF-8. 
+/// This is much slower than isUTF8, so test that first. +std::string fixUTF8(llvm::StringRef S); + class Array; class ObjectKey; class Value; @@ -273,16 +297,26 @@ Value(json::Object &&Properties) : Type(T_Object) { create(std::move(Properties)); } - // Strings: types with value semantics. - Value(std::string &&V) : Type(T_String) { create(std::move(V)); } - Value(const std::string &V) : Type(T_String) { create(V); } - Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) { - create(V.begin(), V.end()); + // Strings: types with value semantics. Must be valid UTF-8. + Value(std::string V) : Type(T_String) { + if (LLVM_UNLIKELY(!isUTF8(V))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + V = fixUTF8(V); + } + create(std::move(V)); } + Value(const llvm::SmallVectorImpl<char> &V) + : Value(std::string(V.begin(), V.end())){}; Value(const llvm::formatv_object_base &V) : Value(V.str()){}; - // Strings: types with reference semantics. - Value(llvm::StringRef V) : Type(T_StringRef) { create(V); } - Value(const char *V) : Type(T_StringRef) { create(V); } + // Strings: types with reference semantics. Must be valid UTF-8. + Value(StringRef V) : Type(T_StringRef) { + create(V); + if (LLVM_UNLIKELY(!isUTF8(V))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + *this = Value(fixUTF8(V)); + } + } + Value(const char *V) : Value(StringRef(V)) {} Value(std::nullptr_t) : Type(T_Null) {} // Boolean (disallow implicit conversions). // (The last template parameter is a dummy to keep templates distinct.) @@ -449,13 +483,23 @@ /// ObjectKey is a used to capture keys in Object. Like Value but: /// - only strings are allowed /// - it's optimized for the string literal case (Owned == nullptr) +/// Like Value, strings must be UTF-8. See isUTF8 documentation for details. 
class ObjectKey { public: - ObjectKey(const char *S) : Data(S) {} - ObjectKey(llvm::StringRef S) : Data(S) {} - ObjectKey(std::string &&V) - : Owned(new std::string(std::move(V))), Data(*Owned) {} - ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {} + ObjectKey(const char *S) : ObjectKey(StringRef(S)) {} + ObjectKey(std::string S) : Owned(new std::string(std::move(S))) { + if (LLVM_UNLIKELY(!isUTF8(*Owned))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + *Owned = fixUTF8(*Owned); + } + Data = *Owned; + } + ObjectKey(llvm::StringRef S) : Data(S) { + if (LLVM_UNLIKELY(!isUTF8(Data))) { + assert(false && "Invalid UTF-8 in value used as JSON"); + *this = ObjectKey(fixUTF8(S)); + } + } ObjectKey(const llvm::SmallVectorImpl<char> &V) : ObjectKey(std::string(V.begin(), V.end())) {} ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {} Index: lib/Support/JSON.cpp =================================================================== --- lib/Support/JSON.cpp +++ lib/Support/JSON.cpp @@ -8,6 +8,7 @@ //===---------------------------------------------------------------------===// #include "llvm/Support/JSON.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Format.h" #include <cctype> @@ -199,6 +200,14 @@ Parser(StringRef JSON) : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {} + bool checkUTF8() { + size_t ErrOffset; + if (isUTF8(StringRef(Start, End - Start), &ErrOffset)) + return true; + P = Start + ErrOffset; // For line/column calculation. + return parseError("Invalid UTF-8 sequence"); + } + bool parseValue(Value &Out); bool assertEnd() { @@ -458,7 +467,7 @@ // Case 3: it's a leading surrogate. We expect a trailing one next. // Case 3a: there's no trailing \u escape. Don't advance in the stream. - if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) { + if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) { Invalid(); // Leading surrogate was unpaired. 
return true; } @@ -496,9 +505,10 @@ Expected<Value> parse(StringRef JSON) { Parser P(JSON); Value E = nullptr; - if (P.parseValue(E)) - if (P.assertEnd()) - return std::move(E); + if (P.checkUTF8()) + if (P.parseValue(E)) + if (P.assertEnd()) + return std::move(E); return P.takeError(); } char ParseError::ID = 0; @@ -514,6 +524,37 @@ return Elements; } +bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { + // Fast-path for ASCII, which is valid UTF-8. + if (LLVM_LIKELY(isASCII(S))) + return true; + + const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data; + if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size()))) + return true; + + if (ErrOffset) + *ErrOffset = Rest - Data; + return false; +} + +std::string fixUTF8(llvm::StringRef S) { + // This isn't particularly efficient, but is only for error-recovery. + std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices. + const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data()); + UTF32 *Out32 = Codepoints.data(); + ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(), + lenientConversion); + Codepoints.resize(Out32 - Codepoints.data()); + std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice + const UTF32 *In32 = Codepoints.data(); + UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]); + ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(), + strictConversion); + Res.resize(reinterpret_cast<char *>(Out8) - Res.data()); + return Res; +} + } // namespace json } // namespace llvm Index: unittests/Support/JSONTest.cpp =================================================================== --- unittests/Support/JSONTest.cpp +++ unittests/Support/JSONTest.cpp @@ -27,6 +27,14 @@ EXPECT_EQ(R"("foo")", s("foo")); EXPECT_EQ("[1,2,3]", s({1, 2, 3})); EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}})); + +#ifdef NDEBUG + EXPECT_EQ(R"("��")", s("\xC0\x80")); + EXPECT_EQ(R"({"��":0})", s(Object{{"\xC0\x80", 0}})); +#else + EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8"); + 
EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8"); +#endif } TEST(JSONTest, Constructors) { @@ -181,6 +189,31 @@ "valid": 1, invalid: 2 })"); + ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // overlong NUL (MUTF-8) +} + +// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere. +TEST(JSONTest, UTF8) { + for (const char *Valid : { + "this is ASCII text", + "thïs tëxt häs BMP chäräctërs", + "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃", + }) { + EXPECT_TRUE(isUTF8(Valid)) << Valid; + EXPECT_EQ(fixUTF8(Valid), Valid); + } + for (auto Invalid : std::vector<std::pair<const char *, const char *>>{ + {"lone trailing \x81\x82 bytes", "lone trailing �� bytes"}, + {"missing trailing \xD0 bytes", "missing trailing � bytes"}, + {"truncated character \xD0", "truncated character �"}, + {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding", + "not �� the ��� shortest ���� encoding"}, + {"too \xF9\x80\x80\x80\x80 long", "too ����� long"}, + {"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80", + "surrogate ��� invalid ����"}}) { + EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first; + EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second); + } } TEST(JSONTest, Inspection) {