Index: include/llvm/Support/JSON.h
===================================================================
--- include/llvm/Support/JSON.h
+++ include/llvm/Support/JSON.h
@@ -53,12 +53,37 @@
 
 namespace llvm {
 namespace json {
+
+// === String encodings ===
+//
+// JSON strings are character sequences (not byte sequences like std::string).
+// We need to know the encoding, and for simplicity only support UTF-8.
+//
+//   - When parsing, invalid UTF-8 is a syntax error like any other
+//
+//   - When creating Values from strings, callers must ensure they are UTF-8.
+//        with asserts on, invalid UTF-8 will crash the program
+//        with asserts off, we'll substitute the replacement character (U+FFFD)
+//     Callers can use json::isUTF8() and json::fixUTF8() for validation.
+//
+//   - When retrieving strings from Values (e.g. asString()), the result will
+//     always be valid UTF-8.
+
+/// Returns true if \p S is valid UTF-8, which is required for use as JSON.
+/// Doesn't check for invalid codepoints, which don't cause problems for us.
+/// If invalid, \p ErrOffset is set to a byte offset near the first error.
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
+/// Replaces invalid UTF-8 sequences in \p S with the replacement character
+/// (U+FFFD). The returned string is valid UTF-8.
+std::string fixUTF8(llvm::StringRef S);
+
 class Array;
 class ObjectKey;
 class Value;
 
 /// An Object is a JSON object, which maps strings to heterogenous JSON values.
 /// The string keys may be owned or references.
+/// They must be valid UTF-8, validated by caller! See comment above.
 class Object : public std::map<ObjectKey, Value> {
 public:
   explicit Object() {}
@@ -179,6 +204,7 @@
 /// And parsed:
 ///   Expected<Value> E = json::parse("[1, 2, null]");
 ///   assert(E && E->kind() == Value::Array);
+///
 class Value {
 public:
   enum Kind {
@@ -200,16 +226,26 @@
   Value(json::Object &&Properties) : Type(T_Object) {
     create<json::Object>(std::move(Properties));
   }
-  // Strings: types with value semantics.
-  Value(std::string &&V) : Type(T_String) { create<std::string>(std::move(V)); }
-  Value(const std::string &V) : Type(T_String) { create<std::string>(V); }
-  Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) {
-    create<std::string>(V.begin(), V.end());
+  // Strings: types with value semantics. Must be valid UTF-8.
+  Value(std::string V) : Type(T_String) {
+    if (!LLVM_LIKELY(isUTF8(V))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      V = fixUTF8(V); // Builds a repaired copy, then replaces V with it.
+    }
+    create<std::string>(std::move(V));
+  }
+  Value(const llvm::SmallVectorImpl<char> &V)
+      : Value(std::string(V.begin(), V.end())) {}
   Value(const llvm::formatv_object_base &V) : Value(V.str()){};
-  // Strings: types with reference semantics.
-  Value(llvm::StringRef V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
-  Value(const char *V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
+  // Strings: types with reference semantics. Must be valid UTF-8.
+  Value(StringRef V) : Type(T_StringRef) {
+    create<llvm::StringRef>(V);
+    if (!LLVM_LIKELY(isUTF8(V))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *this = Value(fixUTF8(V)); // Repaired copy is owned, not a reference.
+    }
+  }
+  Value(const char *V) : Value(StringRef(V)) {}
   Value(std::nullptr_t) : Type(T_Null) {}
   // Prevent implicit conversions to boolean.
   template <typename T, typename = typename std::enable_if<
@@ -360,13 +396,23 @@
 /// ObjectKey is a used to capture keys in Object. Like Value but:
 ///   - only strings are allowed
 ///   - it's optimized for the string literal case (Owned == nullptr)
+/// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
 class ObjectKey {
 public:
-  ObjectKey(const char *S) : Data(S) {}
-  ObjectKey(llvm::StringRef S) : Data(S) {}
-  ObjectKey(std::string &&V)
-      : Owned(new std::string(std::move(V))), Data(*Owned) {}
-  ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {}
+  ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
+  ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
+    if (!LLVM_LIKELY(isUTF8(*Owned))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *Owned = fixUTF8(*Owned); // Builds a repaired copy, then stores it.
+    }
+    Data = *Owned; // Point Data at the (possibly repaired) owned string.
+  }
+  ObjectKey(llvm::StringRef S) : Data(S) {
+    if (!LLVM_LIKELY(isUTF8(Data))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *this = ObjectKey(fixUTF8(S)); // Switch to an owned, repaired key.
+    }
+  }
   ObjectKey(const llvm::SmallVectorImpl<char> &V)
       : ObjectKey(std::string(V.begin(), V.end())) {}
   ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}
@@ -538,6 +584,7 @@
     return llvm::inconvertibleErrorCode();
   }
 };
+
 } // namespace json
 
 /// Allow printing json::Value with formatv().
Index: lib/Support/JSON.cpp
===================================================================
--- lib/Support/JSON.cpp
+++ lib/Support/JSON.cpp
@@ -205,6 +205,14 @@
     Parser(StringRef JSON)
         : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
 
+    bool checkUTF8() {
+      size_t ErrOffset; // Written by isUTF8 only when it returns false.
+      if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
+        return true;
+      P = Start + ErrOffset; // For line/column calculation.
+      return parseError("Invalid UTF-8 sequence");
+    }
+
     bool parseValue(Value &Out);
 
     bool assertEnd() {
@@ -499,13 +507,90 @@
 Expected<Value> parse(StringRef JSON) {
   Parser P(JSON);
   Value E = nullptr;
-  if (P.parseValue(E))
-    if (P.assertEnd())
-      return std::move(E);
+  if (P.checkUTF8())
+    if (P.parseValue(E))
+      if (P.assertEnd())
+        return std::move(E);
   return P.takeError();
 }
 char ParseError::ID = 0;
 
+// Advances I past the character starting at S[I].
+// Returns its length in bytes, or 0 if invalid; I is then past the bad run.
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static int measureChar(llvm::StringRef S, size_t &I) {
+  unsigned char C = S[I++];
+  if (LLVM_LIKELY(!(C & 0x80))) // ASCII
+    return 1;
+
+  auto EatTrailing = [&]() -> unsigned char {
+    if (LLVM_LIKELY(I < S.size() && ((unsigned char)S[I] & 0xC0) == 0x80))
+      return S[I++];
+    return 0;
+  };
+  // Multi-byte character length is encoded in the leading ones of C.
+  // We'll return if valid and break if invalid.
+  switch (countLeadingOnes(C)) {
+  case 1: // 10xxxxxx is a continuation byte, which can't start a character.
+    break;
+  case 2: // 110xxxxx 10xxxxxx.
+    // U+80 = C2 80 is the first two-byte character; C0 and C1 are overlong.
+    if (C < 0xC2 || !EatTrailing())
+      break;
+    return 2;
+  case 3: // 1110xxxx 10xxxxxx 10xxxxxx.
+    // U+800 = E0 A0 80 is the first three-byte character.
+    if (C == 0xE0) {
+      if (EatTrailing() < 0xA0 || !EatTrailing())
+        break;
+    } else if (!EatTrailing() || !EatTrailing()) {
+      break;
+    }
+    return 3;
+  case 4: // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+    // U+10000 = F0 90 80 80 is the first four-byte character.
+    if (C == 0xF0) {
+      if (EatTrailing() < 0x90 || !EatTrailing() || !EatTrailing())
+        break;
+    } else if (!EatTrailing() || !EatTrailing() || !EatTrailing()) {
+      break;
+    }
+    return 4;
+  default: // 11111xxx is invalid.
+    break;
+  }
+  // If we fell off the end of the switch, the character is invalid.
+  while (EatTrailing()) // Advance to the next plausible character.
+    ;
+  return 0;
+}
+
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
+  // We could optimize for the all-ASCII case, but this already compiles down
+  // to a tight loop in that case. Begin tracks where the current char starts.
+  for (size_t I = 0, Begin = 0; I < S.size(); Begin = I)
+    if (!measureChar(S, I)) {
+      if (ErrOffset)
+        *ErrOffset = Begin; // measureChar moved I past the bad sequence.
+      return false;
+    }
+  return true;
+}
+
+// Copies valid characters through unchanged and emits one U+FFFD for each
+// invalid sequence that measureChar reports (the offending byte plus any
+// continuation bytes that follow it).
+std::string fixUTF8(llvm::StringRef S) {
+  std::string Result;
+  Result.reserve(S.size()); // Lower bound; exact when S is already valid.
+  for (size_t I = 0; I < S.size();)
+    if (int Len = measureChar(S, I))
+      Result.append(S.data() + I - Len, Len); // I is already past the char.
+    else
+      Result.append("\xEF\xBF\xBD"); // UTF-8 encoded U+FFFD.
+  return Result;
+}
+
 } // namespace json
 } // namespace llvm
 
Index: unittests/Support/JSONTest.cpp
===================================================================
--- unittests/Support/JSONTest.cpp
+++ unittests/Support/JSONTest.cpp
@@ -27,6 +27,14 @@
   EXPECT_EQ(R"("foo")", s("foo"));
   EXPECT_EQ("[1,2,3]", s({1, 2, 3}));
   EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}}));
+
+#ifdef NDEBUG
+  EXPECT_EQ(R"("�")", s("\xC0\x80"));
+  EXPECT_EQ(R"({"�":0})", s(Object{{"\xC0\x80", 0}}));
+#elif defined(GTEST_HAS_DEATH_TEST)
+  EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8");
+  EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8");
+#endif
 }
 
 TEST(JSONTest, Constructors) {
@@ -179,6 +187,31 @@
   "valid": 1,
   invalid: 2
 })");
+  ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // Overlong-encoded NUL
+}
+
+// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
+TEST(JSONTest, UTF8) {
+  for (const char *Valid : {
+           "this is ASCII text",
+           "thïs tëxt häs BMP chäräctërs",
+           "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃",
+           "invalid \xED\xA0\x80 codepoints \xF4\x90\x80\x80 not checked",
+       }) {
+    EXPECT_TRUE(isUTF8(Valid)) << Valid;
+    EXPECT_EQ(fixUTF8(Valid), Valid);
+  }
+  for (auto Invalid : std::vector<std::pair<const char *, const char *>>{
+           {"lone trailing \x81\x82 bytes", "lone trailing � bytes"},
+           {"missing trailing \xD0 bytes", "missing trailing � bytes"},
+           {"truncated character \xD0", "truncated character �"},
+           {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding",
+            "not � the � shortest � encoding"},
+           {"too \xF9\x80\x80\x80\x80 long", "too � long"},
+       }) {
+    EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first;
+    EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second);
+  }
 }
 
 TEST(JSONTest, Inspection) {