Index: clangd/JSONExpr.h
===================================================================
--- clangd/JSONExpr.h
+++ clangd/JSONExpr.h
@@ -1,4 +1,4 @@
-//===--- JSONExpr.h - composable JSON expressions ---------------*- C++ -*-===//
+//===--- JSONExpr.h - JSON expressions, parsing, serialization --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,6 +7,8 @@
 //
 //===---------------------------------------------------------------------===//

+// FIXME: rename to JSON.h now that the scope is wider?
+
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_JSON_H
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_JSON_H

@@ -14,6 +16,7 @@

 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"

@@ -21,10 +24,12 @@
 namespace clangd {
 namespace json {

-// An Expr is an opaque temporary JSON structure used to compose documents.
+// An Expr is a JSON value of unknown type.
 // They can be copied, but should generally be moved.
 //
-// You can implicitly construct literals from:
+// === Composing expressions ===
+//
+// You can implicitly construct Exprs from:
 //   - strings: std::string, SmallString, formatv, StringRef, char*
 //              (char*, and StringRef are references, not copies!)
 //   - numbers
@@ -39,25 +44,62 @@
 // These can be list-initialized, or used to build up collections in a loop.
 // json::ary(Collection) converts all items in a collection to Exprs.
// +// === Inspecting expressions === +// +// Each Expr is one of the JSON kinds: +// null (nullptr_t) +// boolean (bool) +// number (double) +// string (StringRef) +// array (json::ary) +// object (json::obj) +// +// The kind can be queried directly, or implicitly via the typed accessors: +// if (Optional S = E.string()) +// assert(E.kind() == Expr::String); +// +// Array and Object also have typed indexing accessors for easy traversal: +// Expected E = parse(R"( {"options": {"font": "sans-serif"}} )"); +// if (json::obj* O = E->object()) +// if (json::obj* Opts = O->object("options")) +// if (Optional Font = Opts->string("font")) +// assert(Opts->at("font").kind() == Expr::String); +// +// === Serialization === +// // Exprs can be serialized to JSON: // 1) raw_ostream << Expr // Basic formatting. // 2) raw_ostream << formatv("{0}", Expr) // Basic formatting. // 3) raw_ostream << formatv("{0:2}", Expr) // Pretty-print with indent 2. +// +// And parsed: +// Expected E = json::parse("[1, 2, null]"); +// assert(E && E->kind() == Expr::Array); class Expr { public: - class Object; + enum Kind { + Null, + Boolean, + Number, + String, + Array, + Object, + }; + class ObjectExpr; class ObjectKey; - class Array; + class ArrayExpr; // It would be nice to have Expr() be null. But that would make {} null too... Expr(const Expr &M) { copyFrom(M); } Expr(Expr &&M) { moveFrom(std::move(M)); } // "cheating" move-constructor for moving from initializer_list. 
Expr(const Expr &&M) { moveFrom(std::move(M)); } - Expr(std::initializer_list Elements) : Expr(Array(Elements)) {} - Expr(Array &&Elements) : Type(T_Array) { create(std::move(Elements)); } - Expr(Object &&Properties) : Type(T_Object) { - create(std::move(Properties)); + Expr(std::initializer_list Elements) : Expr(ArrayExpr(Elements)) {} + Expr(ArrayExpr &&Elements) : Type(T_Array) { + create(std::move(Elements)); + } + Expr(ObjectExpr &&Properties) : Type(T_Object) { + create(std::move(Properties)); } // Strings: types with value semantics. Expr(std::string &&V) : Type(T_String) { create(std::move(V)); } @@ -104,6 +146,60 @@ } ~Expr() { destroy(); } + Kind kind() const { + switch (Type) { + case T_Null: + return Null; + case T_Boolean: + return Boolean; + case T_Number: + return Number; + case T_String: + case T_StringRef: + return String; + case T_Object: + return Object; + case T_Array: + return Array; + } + } + + // Typed accessors return None/nullptr if the Expr is not of this type. + llvm::Optional null() const { + if (LLVM_LIKELY(Type == T_Null)) + return nullptr; + return llvm::None; + } + llvm::Optional boolean() const { + if (LLVM_LIKELY(Type == T_Null)) + return as(); + return llvm::None; + } + llvm::Optional number() const { + if (LLVM_LIKELY(Type == T_Number)) + return as(); + return llvm::None; + } + llvm::Optional string() const { + if (Type == T_String) + return llvm::StringRef(as()); + if (LLVM_LIKELY(Type == T_StringRef)) + return as(); + return llvm::None; + } + const ObjectExpr *object() const { + return LLVM_LIKELY(Type == T_Object) ? &as() : nullptr; + } + ObjectExpr *object() { + return LLVM_LIKELY(Type == T_Object) ? &as() : nullptr; + } + const ArrayExpr *array() const { + return LLVM_LIKELY(Type == T_Array) ? &as() : nullptr; + } + ArrayExpr *array() { + return LLVM_LIKELY(Type == T_Array) ? 
&as() : nullptr; + } + friend llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Expr &); private: @@ -137,10 +233,8 @@ mutable ExprType Type; public: - // ObjectKey is a used to capture keys in Expr::Objects. It's like Expr but: + // ObjectKey is a used to capture keys in Expr::ObjectExpr. Like Expr but: // - only strings are allowed - // - it's copyable (for std::map) - // - we're slightly more eager to copy, to allow efficient key compares // - it's optimized for the string literal case (Owned == nullptr) class ObjectKey { public: @@ -183,12 +277,12 @@ llvm::StringRef Data; }; - class Object : public std::map { + class ObjectExpr : public std::map { public: - explicit Object() {} + explicit ObjectExpr() {} // Use a custom struct for list-init, because pair forces extra copies. struct KV; - explicit Object(std::initializer_list Properties); + explicit ObjectExpr(std::initializer_list Properties); // Allow [] as if Expr was default-constructible as null. Expr &operator[](const ObjectKey &K) { @@ -199,15 +293,15 @@ } }; - class Array : public std::vector { + class ArrayExpr : public std::vector { public: - explicit Array() {} - explicit Array(std::initializer_list Elements) { + explicit ArrayExpr() {} + explicit ArrayExpr(std::initializer_list Elements) { reserve(Elements.size()); for (const Expr &V : Elements) emplace_back(std::move(V)); }; - template explicit Array(const Collection &C) { + template explicit ArrayExpr(const Collection &C) { for (const auto &V : C) emplace_back(V); } @@ -215,23 +309,50 @@ private: mutable llvm::AlignedCharArrayUnion + std::string, ArrayExpr, ObjectExpr> Union; }; -struct Expr::Object::KV { +bool operator==(const Expr &, const Expr &); +inline bool operator!=(const Expr &L, const Expr &R) { return !(L == R); } +inline bool operator==(const Expr::ObjectKey &L, const Expr::ObjectKey &R) { + return llvm::StringRef(L) == llvm::StringRef(R); +} +inline bool operator!=(const Expr::ObjectKey &L, const Expr::ObjectKey &R) { + return 
!(L == R); +} + +struct Expr::ObjectExpr::KV { ObjectKey K; Expr V; }; -inline Expr::Object::Object(std::initializer_list Properties) { +inline Expr::ObjectExpr::ObjectExpr(std::initializer_list Properties) { for (const auto &P : Properties) emplace(std::move(P.K), std::move(P.V)); } // Give Expr::{Object,Array} more convenient names for literal use. -using obj = Expr::Object; -using ary = Expr::Array; +using obj = Expr::ObjectExpr; +using ary = Expr::ArrayExpr; + +llvm::Expected parse(llvm::StringRef JSON); + +class ParseError : public llvm::ErrorInfo { + const char *Msg; + unsigned Line, Column, Offset; + +public: + static char ID; + ParseError(const char *Msg, unsigned Line, unsigned Column, unsigned Offset) + : Msg(Msg), Line(Line), Column(Column), Offset(Offset) {} + void log(llvm::raw_ostream &OS) const override { + OS << llvm::formatv("[{0}:{1}, byte={2}]: {3}", Line, Column, Offset, Msg); + } + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; } // namespace json } // namespace clangd Index: clangd/JSONExpr.cpp =================================================================== --- clangd/JSONExpr.cpp +++ clangd/JSONExpr.cpp @@ -22,10 +22,10 @@ create(M.as()); break; case T_Object: - create(M.as()); + create(M.as()); break; case T_Array: - create(M.as()); + create(M.as()); break; } } @@ -46,11 +46,11 @@ M.Type = T_Null; break; case T_Object: - create(std::move(M.as())); + create(std::move(M.as())); M.Type = T_Null; break; case T_Array: - create(std::move(M.as())); + create(std::move(M.as())); M.Type = T_Null; break; } @@ -69,14 +69,318 @@ as().~basic_string(); break; case T_Object: - as().~Object(); + as().~ObjectExpr(); break; case T_Array: - as().~Array(); + as().~ArrayExpr(); break; } } +namespace { +// Simple recursive-descent JSON parser. 
+class Parser {
+public:
+  Parser(StringRef JSON)
+      : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
+
+  bool parseExpr(Expr &Out);
+
+  bool assertEnd() {
+    eatWhitespace();
+    if (P == End)
+      return true;
+    return parseError("Text after end of document");
+  }
+
+  llvm::Error takeError() { // Qualified: the member below is also named Error.
+    assert(Error);
+    return std::move(*Error);
+  }
+
+private:
+  void eatWhitespace() {
+    while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
+      ++P;
+  }
+
+  // On invalid syntax, parseX() functions return false and set Error.
+  bool parseNumber(char First, double &Out);
+  bool parseString(std::string &Out);
+  bool parseUnicode(std::string &Out);
+  bool parseError(const char *Msg); // always returns false
+
+  char next() { return P == End ? 0 : *P++; } // 0 signals end of input.
+  char peek() { return P == End ? 0 : *P; }
+  static bool isNumber(char C) {
+    return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
+           C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
+           C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
+  }
+  static void encodeUtf8(uint32_t Rune, std::string &Out);
+
+  Optional<llvm::Error> Error; // Populated by parseError().
+  const char *Start, *P, *End;
+};
+
+bool Parser::parseExpr(Expr &Out) {
+  eatWhitespace();
+  if (P == End)
+    return parseError("Unexpected EOF");
+  switch (char C = next()) {
+  // Bare null/true/false are easy - first char identifies them.
+  case 'n':
+    Out = nullptr;
+    return (next() == 'u' && next() == 'l' && next() == 'l') ||
+           parseError("Invalid bareword");
+  case 't':
+    Out = true;
+    return (next() == 'r' && next() == 'u' && next() == 'e') ||
+           parseError("Invalid bareword");
+  case 'f':
+    Out = false;
+    return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
+           parseError("Invalid bareword");
+  case '"': {
+    std::string S;
+    if (parseString(S)) {
+      Out = std::move(S);
+      return true;
+    }
+    return false;
+  }
+  case '[': {
+    Out = json::ary{};
+    json::ary &A = *Out.array();
+    eatWhitespace();
+    if (peek() == ']') { // Empty array.
+      ++P;
+      return true;
+    }
+    for (;;) {
+      A.emplace_back(nullptr); // Slot the recursive call parses into.
+      if (!parseExpr(A.back()))
+        return false;
+      eatWhitespace();
+      switch (next()) {
+      case ',':
+        eatWhitespace();
+        continue;
+      case ']':
+        return true;
+      default:
+        return parseError("Expected , or ] after array element");
+      }
+    }
+  }
+  case '{': {
+    Out = json::obj{};
+    json::obj &O = *Out.object();
+    eatWhitespace();
+    if (peek() == '}') { // Empty object.
+      ++P;
+      return true;
+    }
+    for (;;) {
+      if (next() != '"')
+        return parseError("Expected object key");
+      std::string K;
+      if (!parseString(K))
+        return false;
+      eatWhitespace();
+      if (next() != ':')
+        return parseError("Expected : after object key");
+      eatWhitespace();
+      if (!parseExpr(O[std::move(K)])) // Value is parsed into the map slot.
+        return false;
+      eatWhitespace();
+      switch (next()) {
+      case ',':
+        eatWhitespace();
+        continue;
+      case '}':
+        return true;
+      default:
+        return parseError("Expected , or } after object property");
+      }
+    }
+  }
+  default:
+    if (isNumber(C)) {
+      double Num;
+      if (parseNumber(C, Num)) {
+        Out = Num;
+        return true;
+      } else {
+        return false;
+      }
+    }
+    return parseError("Expected JSON value");
+  }
+}
+
+bool Parser::parseNumber(char First, double &Out) {
+  SmallString<24> S;
+  S.push_back(First);
+  while (isNumber(peek())) // Greedily collect number-like characters.
+    S.push_back(next());
+  char *End; // Local; shadows Parser::End.
+  Out = std::strtod(S.c_str(), &End);
+  return End == S.end() || parseError("Invalid number"); // Must use all of S.
+}
+
+bool
Parser::parseString(std::string &Out) {
+  // The leading quote was already consumed by the caller.
+  for (char C = next(); C != '"'; C = next()) {
+    if (LLVM_UNLIKELY(P == End))
+      return parseError("Unterminated string");
+    if (LLVM_UNLIKELY((C & 0x1f) == C)) // True only for 0x00..0x1f.
+      return parseError("Control character in string");
+    if (LLVM_LIKELY(C != '\\')) {
+      Out.push_back(C);
+      continue;
+    }
+    // Handle the escape sequence introduced by the backslash.
+    switch (C = next()) {
+    case '"':
+    case '\\':
+    case '/':
+      Out.push_back(C);
+      break;
+    case 'b':
+      Out.push_back('\b');
+      break;
+    case 'f':
+      Out.push_back('\f');
+      break;
+    case 'n':
+      Out.push_back('\n');
+      break;
+    case 'r':
+      Out.push_back('\r');
+      break;
+    case 't':
+      Out.push_back('\t');
+      break;
+    case 'u':
+      if (!parseUnicode(Out))
+        return false;
+      break;
+    default:
+      return parseError("Invalid escape sequence");
+    }
+  }
+  return true;
+}
+
+void Parser::encodeUtf8(uint32_t Rune, std::string &Out) {
+  if (Rune <= 0x7F) { // 1 byte.
+    Out.push_back(Rune & 0x7F);
+  } else if (Rune <= 0x7FF) { // 2 bytes.
+    uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
+    uint8_t SecondByte = 0x80 | (Rune & 0x3F);
+    Out.push_back(FirstByte);
+    Out.push_back(SecondByte);
+  } else if (Rune <= 0xFFFF) { // 3 bytes.
+    uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
+    uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
+    uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
+    Out.push_back(FirstByte);
+    Out.push_back(SecondByte);
+    Out.push_back(ThirdByte);
+  } else if (Rune <= 0x10FFFF) { // 4 bytes.
+    uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
+    uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
+    uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
+    uint8_t FourthByte = 0x80 | (Rune & 0x3F);
+    Out.push_back(FirstByte);
+    Out.push_back(SecondByte);
+    Out.push_back(ThirdByte);
+    Out.push_back(FourthByte);
+  } else {
+    llvm_unreachable("Invalid codepoint");
+  }
+}
+
+// Parse a \uNNNN escape sequence; the \u has already been consumed.
+// May parse multiple escapes in the presence of surrogate pairs.
+bool Parser::parseUnicode(std::string &Out) {
+  // Note that invalid unicode is not a JSON error. It gets replaced by U+FFFD.
+  auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
+  auto Parse4Hex = [this](uint16_t &Out) { // Reads exactly four hex digits.
+    Out = 0;
+    char Bytes[] = {next(), next(), next(), next()};
+    for (unsigned char C : Bytes) {
+      if (!std::isxdigit(C))
+        return parseError("Invalid \\u escape sequence");
+      Out <<= 4;
+      Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0'); // ~0x20: upcase.
+    }
+    return true;
+  };
+  uint16_t First;
+  if (!Parse4Hex(First))
+    return false;
+
+  // We loop to allow proper surrogate-pair error handling.
+  while (true) {
+    if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) { // BMP.
+      encodeUtf8(First, Out);
+      return true;
+    }
+
+    if (First >= 0xDC00) {
+      Invalid(); // Lone trailing surrogate.
+      return true;
+    }
+
+    // We have a leading surrogate, and need a trailing one.
+    // Don't advance P: a lone surrogate is valid JSON (but invalid unicode)
+    if (P + 2 > End || *P != '\\' || *(P + 1) != 'u') {
+      Invalid(); // Lone leading not followed by \u...
+      return true;
+    }
+    P += 2;
+    uint16_t Second;
+    if (!Parse4Hex(Second))
+      return false;
+    if (Second < 0xDC00 || Second >= 0xE000) { // Not in [0xDC00, 0xE000).
+      Invalid();      // Leading surrogate not followed by trailing.
+      First = Second; // Second escape still needs to be processed.
+      continue;
+    }
+
+    // Valid surrogate pair.
+ encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out); + return true; + } +} + +bool Parser::parseError(const char *Msg) { + int Line = 1; + const char *StartOfLine = Start; + for (const char *X = Start; X < P; ++X) { + if (*X == 0x0A) { + ++Line; + StartOfLine = X + 1; + } + } + Error.emplace( + llvm::make_unique(Msg, Line, P - StartOfLine, P - Start)); + return false; +} +} // namespace + +Expected parse(StringRef JSON) { + Parser P(JSON); + json::Expr E = nullptr; + if (P.parseExpr(E)) + if (P.assertEnd()) + return std::move(E); + return P.takeError(); +} +char ParseError::ID = 0; + } // namespace json } // namespace clangd } // namespace clang @@ -144,7 +448,7 @@ bool Comma = false; OS << '{'; I(Indent); - for (const auto &P : as()) { + for (const auto &P : as()) { if (Comma) OS << ','; Comma = true; @@ -164,7 +468,7 @@ bool Comma = false; OS << '['; I(Indent); - for (const auto &E : as()) { + for (const auto &E : as()) { if (Comma) OS << ','; Comma = true; @@ -187,6 +491,25 @@ E.print(OS, [](IndenterAction A) { /*ignore*/ }); return OS; } + +bool operator==(const Expr &L, const Expr &R) { + if (L.kind() != R.kind()) + return false; + switch (L.kind()) { + case Expr::Null: + return L.null() == R.null(); + case Expr::Boolean: + return L.boolean() == R.boolean(); + case Expr::Number: + return L.boolean() == R.boolean(); + case Expr::String: + return L.string() == R.string(); + case Expr::Array: + return *L.array() == *R.array(); + case Expr::Object: + return *L.object() == *R.object(); + } +} } // namespace json } // namespace clangd } // namespace clang Index: unittests/clangd/JSONExprTests.cpp =================================================================== --- unittests/clangd/JSONExprTests.cpp +++ unittests/clangd/JSONExprTests.cpp @@ -15,6 +15,9 @@ namespace clang { namespace clangd { namespace json { +void PrintTo(const Expr &E, std::ostream *OS) { + llvm::raw_os_ostream(*OS) << llvm::formatv("{0:2}", E); +} namespace { std::string 
s(const Expr &E) { return llvm::formatv("{0}", E).str(); }
@@ -108,6 +111,77 @@
   }));
 }

+TEST(JSONTest, Parse) {
+  auto Compare = [](llvm::StringRef S, Expr Expected) {
+    if (auto E = parse(S)) {
+      // Compare both string forms and with operator==, in case we have bugs.
+      EXPECT_EQ(*E, Expected);
+      EXPECT_EQ(sp(*E), sp(Expected));
+    } else {
+      handleAllErrors(E.takeError(), [S](const llvm::ErrorInfoBase &E) {
+        FAIL() << "Failed to parse JSON >>> " << S << " <<<: " << E.message();
+      });
+    }
+  };
+
+  Compare(R"(true)", true);
+  Compare(R"(false)", false);
+  Compare(R"(null)", nullptr);
+
+  Compare(R"(42)", 42);
+  Compare(R"(2.5)", 2.5);
+  Compare(R"(2e50)", 2e50);
+  Compare(R"(1.2e3456789)", 1.0 / 0.0);
+
+  Compare(R"("foo")", "foo");
+  Compare(R"("\"\\\b\f\n\r\t")", "\"\\\b\f\n\r\t");
+  Compare(R"("\u0000")", llvm::StringRef("\0", 1));
+  Compare("\"\x7f\"", "\x7f");
+  Compare(R"("\ud801\udc37")", "\U00010437"); // UTF16 surrogate pair escape.
+  Compare("\"\xE2\x82\xAC\xF0\x9D\x84\x9E\"", "\u20ac\U0001d11e"); // UTF8
+  Compare(R"("\ud801")", "\ufffd"); // Invalid codepoint.
+
+  Compare(R"({"":0,"":0})", obj{{"", 0}});
+  Compare(R"({"obj":{},"arr":[]})", obj{{"obj", obj{}}, {"arr", {}}});
+  Compare(R"({"\n":{"\u0000":[[[[]]]]}})",
+          obj{{"\n", obj{
+                         {llvm::StringRef("\0", 1), {{{{}}}}},
+                     }}});
+  Compare("\r[\n\t] ", {});
+}
+
+TEST(JSONTest, ParseErrors) {
+  auto ExpectErr = [](llvm::StringRef Msg, llvm::StringRef S) {
+    if (auto E = parse(S)) {
+      // Parsing succeeded, but this input is expected to be rejected.
+ FAIL() << "Parsed JSON >>> " << S << " <<< but wanted error: " << Msg; + } else { + handleAllErrors(E.takeError(), [S, Msg](const llvm::ErrorInfoBase &E) { + EXPECT_THAT(E.message(), testing::HasSubstr(Msg)) << S; + }); + } + }; + ExpectErr("Unexpected EOF", ""); + ExpectErr("Unexpected EOF", "["); + ExpectErr("Text after end of document", "[][]"); + ExpectErr("Text after end of document", "[][]"); + ExpectErr("Invalid bareword", "fuzzy"); + ExpectErr("Expected , or ]", "[2?]"); + ExpectErr("Expected object key", "{a:2}"); + ExpectErr("Expected : after object key", R"({"a",2})"); + ExpectErr("Expected , or } after object property", R"({"a":2 "b":3})"); + ExpectErr("Expected JSON value", R"([&%!])"); + ExpectErr("Invalid number", "1e1.0"); + ExpectErr("Unterminated string", R"("abc\"def)"); + ExpectErr("Control character in string", "\"abc\ndef\""); + ExpectErr("Invalid escape sequence", R"("\030")"); + ExpectErr("Invalid \\u escape sequence", R"("\usuck")"); + ExpectErr("[3:3, byte=19]", R"({ + "valid": 1, + invalid: 2 +})"); +} + } // namespace } // namespace json } // namespace clangd