diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -200,13 +200,12 @@ StringRef::iterator End = Range.end(); // 1 byte: [0x00, 0x7f] // Bit pattern: 0xxxxxxx - if ((*Position & 0x80) == 0) { - return std::make_pair(*Position, 1); + if (Position < End && (*Position & 0x80) == 0) { + return std::make_pair(*Position, 1); } // 2 bytes: [0x80, 0x7ff] // Bit pattern: 110xxxxx 10xxxxxx - if (Position + 1 != End && - ((*Position & 0xE0) == 0xC0) && + if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && ((*(Position + 1) & 0xC0) == 0x80)) { uint32_t codepoint = ((*Position & 0x1F) << 6) | (*(Position + 1) & 0x3F); @@ -215,8 +214,7 @@ } // 3 bytes: [0x8000, 0xffff] // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx - if (Position + 2 != End && - ((*Position & 0xF0) == 0xE0) && + if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && ((*(Position + 1) & 0xC0) == 0x80) && ((*(Position + 2) & 0xC0) == 0x80)) { uint32_t codepoint = ((*Position & 0x0F) << 12) | @@ -230,8 +228,7 @@ } // 4 bytes: [0x10000, 0x10FFFF] // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if (Position + 3 != End && - ((*Position & 0xF8) == 0xF0) && + if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && ((*(Position + 1) & 0xC0) == 0x80) && ((*(Position + 2) & 0xC0) == 0x80) && ((*(Position + 3) & 0xC0) == 0x80)) { @@ -773,7 +770,7 @@ IsSimpleKeyAllowed = true; Failed = false; std::unique_ptr InputBufferOwner = - MemoryBuffer::getMemBuffer(Buffer); + MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); } @@ -1036,7 +1033,7 @@ } void Scanner::skipComment() { - if (*Current != '#') + if (Current == End || *Current != '#') return; while (true) { // This may skip more than one byte, thus Column is only incremented @@ -1051,7 +1048,7 @@ void Scanner::scanToNextToken() { while (true) { - while (*Current == ' ' || *Current == '\t') { + while (Current != End && (*Current == ' ' || *Current == '\t')) { skip(1); } @@ -1286,7 +1283,7 @@ && wasEscaped(Start + 1, Current)); } else { skip(1); - while (true) { + while (Current != End) { // Skip a ' followed by another '. if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { skip(2); @@ -1334,13 +1331,14 @@ unsigned LeadingBlanks = 0; assert(Indent >= -1 && "Indent must be >= -1 !"); unsigned indent = static_cast(Indent + 1); - while (true) { + while (Current != End) { if (*Current == '#') break; - while (!isBlankOrBreak(Current)) { - if ( FlowLevel && *Current == ':' - && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { + while (Current != End && !isBlankOrBreak(Current)) { + if (FlowLevel && *Current == ':' && + (Current + 1 == End || + !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) { setError("Found unexpected ':' while scanning a plain scalar", Current); return false; } @@ -1410,7 +1408,7 @@ StringRef::iterator Start = Current; unsigned ColStart = Column; skip(1); - while(true) { + while (Current != End) { if ( *Current == '[' || *Current == ']' || *Current == '{' || *Current == '}' || *Current == ',' diff --git a/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp b/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp --- a/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp +++ b/llvm/tools/llvm-yaml-parser-fuzzer/yaml-parser-fuzzer.cpp @@ -21,12 +21,32 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { std::vector Input(Data, Data + Size); + // Ensure we don't crash on any arbitrary byte string. + isValidYaml(Input.data(), Input.size()); + + // Ensure we don't crash on byte strings with no null characters. + Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end()); + Input.shrink_to_fit(); + bool IsValidWithout0s = isValidYaml(Input.data(), Input.size()); + // Ensure we don't crash on byte strings where the only null character is // one-past-the-end of the actual input to the parser. - Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end()); Input.push_back(0); Input.shrink_to_fit(); - isValidYaml(Input.data(), Input.size() - 1); + bool IsValidWhen0Terminated = isValidYaml(Input.data(), Input.size() - 1); + + // Ensure we don't crash on byte strings with no null characters, but with + // an invalid character one-past-the-end of the actual input to the parser. + Input.back() = 1; + bool IsValidWhen1Terminated = isValidYaml(Input.data(), Input.size() - 1); + + // The parser should either accept all of these inputs, or reject all of + // them, because the parser sees an identical byte string in each case. This + // should hopefully catch some cases where the parser is sensitive to what is + // present one-past-the-end of the actual input. + if (IsValidWithout0s != IsValidWhen0Terminated || + IsValidWhen0Terminated != IsValidWhen1Terminated) + LLVM_BUILTIN_TRAP; return 0; } diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp --- a/llvm/unittests/Support/YAMLIOTest.cpp +++ b/llvm/unittests/Support/YAMLIOTest.cpp @@ -3111,5 +3111,95 @@ TEST(YAMLIO, TestEmptyAnchor) { Input yin("*"); EXPECT_FALSE(yin.setCurrentDocument()); +} + +TEST(YAMLIO, TestScannerNoNullEmpty) { + std::vector str{}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_FALSE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullSequenceOfNull) { + std::vector str{'-'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_FALSE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullSimpleSequence) { + std::vector str{'-', ' ', 'a'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_FALSE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullUnbalancedMap) { + std::vector str{'{'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_TRUE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullEmptyMap) { + std::vector str{'{', '}'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_FALSE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullUnbalancedSequence) { + std::vector str{'['}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_TRUE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullEmptySequence) { + std::vector str{'[', ']'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_FALSE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullScalarUnbalancedDoubleQuote) { + std::vector str{'"'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_TRUE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullScalarUnbalancedSingleQuote) { + std::vector str{'\''}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_TRUE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullEmptyAlias) { + std::vector str{'&'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_TRUE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullEmptyAnchor) { + std::vector str{'*'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_TRUE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullDecodeInvalidUTF8) { + std::vector str{'\xef'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); + EXPECT_TRUE(yin.error()); +} + +TEST(YAMLIO, TestScannerNoNullScanPlainScalarInFlow) { + std::vector str{'{', 'a', ':'}; + Input yin(llvm::StringRef(str.data(), str.size())); + yin.setCurrentDocument(); EXPECT_TRUE(yin.error()); }