Index: include/llvm/Support/YAMLParser.h =================================================================== --- include/llvm/Support/YAMLParser.h +++ include/llvm/Support/YAMLParser.h @@ -107,6 +107,7 @@ enum NodeKind { NK_Null, NK_Scalar, + NK_BlockScalar, NK_KeyValue, NK_Mapping, NK_Sequence, @@ -222,6 +223,36 @@ SmallVectorImpl<char> &Storage) const; }; +/// \brief A block scalar node is an opaque datum that can be presented as a +/// series of zero or more Unicode scalar values. +/// +/// Example: +/// | +/// Hello +/// World +class BlockScalarNode : public Node { + void anchor() override; + +public: + BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, + std::string &Value, StringRef RawVal) + : Node(NK_BlockScalar, D, Anchor, Tag), Value(std::move(Value)) { + SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); + SMLoc End = SMLoc::getFromPointer(RawVal.end()); + SourceRange = SMRange(Start, End); + } + + /// \brief Gets the value of this node as a StringRef. + StringRef getValue() const { return Value; } + + static inline bool classof(const Node *N) { + return N->getType() == NK_BlockScalar; + } + +private: + std::string Value; +}; + /// \brief A key and value pair. While not technically a Node under the YAML /// representation graph, it is easier to treat them this way.
/// @@ -252,8 +283,10 @@ Node *getValue(); void skip() override { - getKey()->skip(); - getValue()->skip(); + if (Node *Key = getKey()) + Key->skip(); + if (Node *Val = getValue()) + Val->skip(); } static inline bool classof(const Node *N) { Index: lib/Support/YAMLParser.cpp =================================================================== --- lib/Support/YAMLParser.cpp +++ lib/Support/YAMLParser.cpp @@ -101,6 +101,7 @@ void Node::anchor() {} void NullNode::anchor() {} void ScalarNode::anchor() {} +void BlockScalarNode::anchor() {} void KeyValueNode::anchor() {} void MappingNode::anchor() {} void SequenceNode::anchor() {} @@ -128,6 +129,7 @@ TK_Key, TK_Value, TK_Scalar, + TK_BlockScalar, TK_Alias, TK_Anchor, TK_Tag @@ -137,6 +139,9 @@ /// of the token in the input. StringRef Range; + /// The value of a block scalar node. + std::string Value; + Token() : Kind(TK_Error) {} }; } @@ -348,6 +353,14 @@ /// b-break. StringRef::iterator skip_b_break(StringRef::iterator Position); + /// @brief Skip a single s-space[31] starting at Position. + /// + /// An s-space is 0x20 + /// + /// @returns The code unit after the s-space, or Position if it's not a + /// s-space. + StringRef::iterator skip_s_space(StringRef::iterator Position); + /// @brief Skip a single s-white[33] starting at Position. /// /// A s-white is 0x20 | 0x9 @@ -417,6 +430,9 @@ , Token::TokenKind Kind , TokenQueueT::iterator InsertPoint); + /// @brief Skip a single-line comment if it's present on the current line. + void skipComment(); + /// @brief Skip whitespace and comments until the start of the next token. 
void scanToNextToken(); @@ -608,6 +624,9 @@ case Token::TK_Scalar: OS << "Scalar: "; break; + case Token::TK_BlockScalar: + OS << "Block Scalar: "; + break; case Token::TK_Alias: OS << "Alias: "; break; @@ -812,6 +831,13 @@ return Position; } +StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { + if (Position == End) + return Position; + if (*Position == ' ') + return Position + 1; + return Position; +} StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { if (Position == End) @@ -967,24 +993,28 @@ return true; } +void Scanner::skipComment() { + // Current may be End, e.g. after a block scalar header at EOF. + if (Current == End || *Current != '#') + return; + while (true) { + // This may skip more than one byte, thus Column is only incremented + // for code points. + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) + break; + Current = i; + ++Column; + } +} + void Scanner::scanToNextToken() { while (true) { while (*Current == ' ' || *Current == '\t') { skip(1); } - // Skip comment. - if (*Current == '#') { - while (true) { - // This may skip more than one byte, thus Column is only incremented - // for code points. - StringRef::iterator i = skip_nb_char(Current); - if (i == Current) - break; - Current = i; - ++Column; - } - } + skipComment(); // Skip EOL. StringRef::iterator i = skip_b_break(Current); @@ -1368,37 +1398,198 @@ } bool Scanner::scanBlockScalar(bool IsLiteral) { - StringRef::iterator Start = Current; - skip(1); // Eat | or > - while(true) { - StringRef::iterator i = skip_nb_char(Current); - if (i == Current) { - if (Column == 0) - break; - i = skip_b_break(Current); - if (i != Current) { - // We got a line break. - Column = 0; - ++Line; - Current = i; - continue; - } else { - // There was an error, which should already have been printed out.
- return false; - } - } - Current = i; - ++Column; + std::string Str; + raw_string_ostream OS(Str); + auto BlockStart = Current; + // If this is true, then the scanner has to look for a first non-blank line + // to determine the indentation of the block scalar. + bool LookingForIndent = true; + unsigned BlockIndent = 0; + unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; + unsigned LineBreaks = 0; + + unsigned MaxAllSpaceLineCharacters = 0; + StringRef::iterator LongestAllSpaceLine; + + bool FoldNextLine = false; + + skip(1); // Eat '|' or '>' + + // Block Header ([162] c-b-block-header) + // Chomping indicator ([164] c-chomping-indicator) + char ChompingIndicator = ' '; + if (Current != End && (*Current == '+' || *Current == '-')) { + ChompingIndicator = *Current; + skip(1); + } + // Indentation indicator ([163] c-indentation-indicator) + if (Current != End && (*Current >= '1' && *Current <= '9')) { + BlockIndent = unsigned(*Current - '0'); + LookingForIndent = false; + skip(1); } + // Check for chomping indicator once again. + if (Current != End && (*Current == '+' || *Current == '-')) { + ChompingIndicator = *Current; + skip(1); + } + Current = skip_while(&Scanner::skip_s_white, Current); + skipComment(); - if (Start == Current) { - setError("Got empty block scalar", Start); + if (Current == End) { + // End of file after block header, we have an empty scalar.
+ Token T; + T.Kind = Token::TK_BlockScalar; + T.Range = StringRef(BlockStart, Current - BlockStart); + TokenQueue.push_back(T); + return true; + } + auto I = skip_b_break(Current); + if (I == Current) { + setError("Expected a line break after block scalar header", Current); return false; } + Current = I; + Column = 0; + ++Line; + BlockStart = Current; + + while (true) { + // Handle the block's indentation + if (LookingForIndent) { + while (true) { + auto I = skip_s_space(Current); + if (I == Current) + break; + Current = I; + ++Column; + } + // If there is a non breaking character next, then this is a non empty + // line, thus we can use the discovered indentation as the block's + // indentation. + if (skip_nb_char(Current) != Current) { + if (Column <= BlockExitIndent) { + // This is the end of the block literal, so we exit the loop. + break; + } else { + BlockIndent = Column; + LookingForIndent = false; + // So here we enforce the rule that the leading all space lines can't + // have more characters than the block's indentation level + if (MaxAllSpaceLineCharacters > BlockIndent) { + setError("A leading all-space line must not have too many spaces", + LongestAllSpaceLine); + return false; + } + } + } else if (skip_b_break(Current) != Current) { + // This is an all space line, so we have to record the amount of + // space characters it has so that later when we find the first + // text line that defines the indentation level we can make sure + // that all previous all space lines don't have more space + // characters than the indentation level. + if (Column > MaxAllSpaceLineCharacters) { + MaxAllSpaceLineCharacters = Column; + LongestAllSpaceLine = Current; + } + } + } else { + while (Column < BlockIndent) { + auto I = skip_s_space(Current); + if (I == Current) + break; + Current = I; + ++Column; + } + // If this line isn't empty then we have to check the indentation to + // see if the block scalar ends.
+ if (skip_nb_char(Current) != Current) { + if (Column <= BlockExitIndent) { + // This is the end of the block literal, exit the loop. + break; + } else if (Column < BlockIndent) { + if (Current != End && *Current == '#') { + // This is a trailing comment, exit the loop. + break; + } else { + setError("A text line is less indented than the block scalar", + Current); + return false; + } + } + } + } + + // Fold this line if necessary, by removing an additional line break. + // Folding is applied to text lines which follow other text lines and don't + // have any extra spaces (or tabs) after the indentation spaces. + bool IsLineFolded = false; + if (!IsLiteral && FoldNextLine && Column == BlockIndent && + skip_s_white(Current) == Current && skip_nb_char(Current) != Current && + LineBreaks) { + IsLineFolded = true; + --LineBreaks; + } + + // Parse the current line. + auto Start = Current; + Current = skip_while(&Scanner::skip_nb_char, Current); + Column += (Current - Start); + if (Start != Current) { + for (unsigned I = 0; I < LineBreaks; ++I) + OS << "\n"; + if (IsLineFolded && !LineBreaks) + OS << ' '; + LineBreaks = 0; + auto Line = StringRef(Start, Current - Start); + OS << Line; + // Don't fold the next line when this line ends with a whitespace + // character. + FoldNextLine = Line.back() != ' ' && Line.back() != '\t'; + } else { + FoldNextLine = false; + } + + // Check for EOF. + if (Current == End) { + // Ensure that there is at least one line break before the end of file. + if (!LineBreaks) + LineBreaks = 1; + break; + } + auto I = skip_b_break(Current); + if (I == Current) { + // There was an error, which should already have been printed out. + return false; + } + // We've got a line break + Column = 0; + ++Line; + Current = I; + ++LineBreaks; + } + + // Don't output any trailing new lines if the stripping chomping behaviour is + // specified. + if (ChompingIndicator == '-') + LineBreaks = 0; + // Clip trailing lines (default chomping behaviour).
The final line break + // is preserved as long as the string isn't empty, but the other trailing + // lines aren't kept. + else if (ChompingIndicator != '+') + LineBreaks = OS.str().empty() ? 0 : 1; + + for (unsigned I = 0; I < LineBreaks; ++I) + OS << "\n"; + + // New lines may start a simple key. + if (!FlowLevel) + IsSimpleKeyAllowed = true; Token T; - T.Kind = Token::TK_Scalar; - T.Range = StringRef(Start, Current - Start); + T.Kind = Token::TK_BlockScalar; + T.Range = StringRef(BlockStart, Current - BlockStart); + T.Value = std::move(OS.str()); TokenQueue.push_back(T); return true; } @@ -1600,6 +1791,7 @@ case NK_Null: return "tag:yaml.org,2002:null"; case NK_Scalar: + case NK_BlockScalar: // TODO: Tag resolution. return "tag:yaml.org,2002:str"; case NK_Mapping: @@ -2131,6 +2323,11 @@ , AnchorInfo.Range.substr(1) , TagInfo.Range , T.Range); + case Token::TK_BlockScalar: + getNext(); + return new (NodeAllocator) + BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), + TagInfo.Range, T.Value, T.Range); case Token::TK_Key: // Don't eat the TK_Key, KeyValueNode expects it. return new (NodeAllocator) Index: test/YAMLParser/spec-09-14.test =================================================================== --- test/YAMLParser/spec-09-14.test +++ test/YAMLParser/spec-09-14.test @@ -1,9 +1,6 @@ -# RUN: yaml-bench -canonical %s 2>&1 | FileCheck %s +# RUN: not yaml-bench -canonical %s 2>&1 | FileCheck %s # -# FIXME: This test should actually fail. Yaml bench should report an error that -# says that the '---' and '...' document start/end markers must not be specified -# as the first content line of a non-indented plain scalar.
-# CHECK: !!str +# CHECK: error: Expected a line break after block scalar header --- --- ||| : foo Index: test/YAMLParser/spec-09-18.test =================================================================== --- test/YAMLParser/spec-09-18.test +++ test/YAMLParser/spec-09-18.test @@ -1,4 +1,8 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "literal\n" +# CHECK: !!str " folded\n" +# CHECK: !!str "keep\n\n" +# CHECK: !!str " strip" - | # Just the style literal Index: test/YAMLParser/spec-09-19.test =================================================================== --- test/YAMLParser/spec-09-19.test +++ test/YAMLParser/spec-09-19.test @@ -1,4 +1,6 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "literal\n" +# CHECK: !!str "folded\n" - | literal Index: test/YAMLParser/spec-09-20.test =================================================================== --- test/YAMLParser/spec-09-20.test +++ test/YAMLParser/spec-09-20.test @@ -1,4 +1,8 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "detected\n" +# CHECK: !!str "\n\n# detected\n" +# CHECK: !!str " explicit\n" +# CHECK: !!str "\t\ndetected\n" - | detected Index: test/YAMLParser/spec-09-21.test =================================================================== --- test/YAMLParser/spec-09-21.test +++ test/YAMLParser/spec-09-21.test @@ -9,4 +9,4 @@ - |1 text -# CHECK: error +# CHECK: 8:2: error: A text line is less indented than the block scalar Index: test/YAMLParser/spec-09-22.test =================================================================== --- test/YAMLParser/spec-09-22.test +++ test/YAMLParser/spec-09-22.test @@ -1,6 +1,12 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "text" +# CHECK: !!str "text\n" +# CHECK: !!str "text\n\n" strip: |- - text
clip: | - text…keep: |+ - text
 + text +clip: | + text +keep: |+ + text + Index: test/YAMLParser/spec-09-24.test =================================================================== --- test/YAMLParser/spec-09-24.test +++ test/YAMLParser/spec-09-24.test @@ -1,8 +1,13 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: ? !!str "strip" +# CHECK: : !!str "" +# CHECK: ? !!str "clip" +# CHECK: : !!str "" +# CHECK: ? !!str "keep" +# CHECK: : !!str "\n" strip: >- clip: > keep: |+ - Index: test/YAMLParser/spec-09-25.test =================================================================== --- test/YAMLParser/spec-09-25.test +++ test/YAMLParser/spec-09-25.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "literal\n\ttext\n" | # Simple block scalar literal Index: test/YAMLParser/spec-09-26.test =================================================================== --- test/YAMLParser/spec-09-26.test +++ test/YAMLParser/spec-09-26.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "\n\nliteral\n\ntext\n" | Index: test/YAMLParser/spec-09-27.test =================================================================== --- test/YAMLParser/spec-09-27.test +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: yaml-bench -canonical %s - -| - - - literal - - text - - # Comment Index: test/YAMLParser/spec-09-28.test =================================================================== --- test/YAMLParser/spec-09-28.test +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: yaml-bench -canonical %s - -| - - - literal - - text - - # Comment Index: test/YAMLParser/spec-09-29.test =================================================================== --- test/YAMLParser/spec-09-29.test +++ test/YAMLParser/spec-09-29.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "folded text\n\tlines\n" > # Simple folded scalar folded 
Index: test/YAMLParser/spec-09-30.test =================================================================== --- test/YAMLParser/spec-09-30.test +++ test/YAMLParser/spec-09-30.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "folded line\n\nnext line\n\n * bullet\n * list\n\nlast line\n" > folded Index: test/YAMLParser/spec-09-31.test =================================================================== --- test/YAMLParser/spec-09-31.test +++ test/YAMLParser/spec-09-31.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "folded line\n\nnext line\n\n * bullet\n * list\n\nlast line\n" > folded Index: test/YAMLParser/spec-09-32.test =================================================================== --- test/YAMLParser/spec-09-32.test +++ test/YAMLParser/spec-09-32.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "folded line\n\nnext line\n\n * bullet\n * list\n\nlast line\n" > folded Index: test/YAMLParser/spec-09-33.test =================================================================== --- test/YAMLParser/spec-09-33.test +++ test/YAMLParser/spec-09-33.test @@ -1,4 +1,5 @@ -# RUN: yaml-bench -canonical %s +# RUN: yaml-bench -canonical %s | FileCheck %s +# CHECK: !!str "folded line\n\nnext line\n\n * bullet\n * list\n\nlast line\n" > folded Index: unittests/Support/YAMLParserTest.cpp =================================================================== --- unittests/Support/YAMLParserTest.cpp +++ unittests/Support/YAMLParserTest.cpp @@ -130,6 +130,33 @@ ExpectParseSuccess("Array of arrays", "[[]]"); } +TEST(YAMLParser, ParsesBlockLiteralScalars) { + ExpectParseSuccess("Block literal scalar", "test: |\n Hello\n World\n"); + ExpectParseSuccess("Block literal scalar EOF", "test: |\n Hello\n World"); + ExpectParseSuccess("Empty block literal scalar header EOF", "test: | "); + 
ExpectParseSuccess("Empty block literal scalar", "test: |\ntest2: 20"); + ExpectParseSuccess("Empty block literal scalar 2", "- | \n \n\n \n- 42"); + ExpectParseSuccess("Block literal scalar in sequence", + "- |\n Testing\n Out\n\n- 22"); + ExpectParseSuccess("Block literal scalar in document", + "--- |\n Document\n..."); + ExpectParseSuccess("Empty non indented lines still count", + "- |\n First line\n \n\n Another line\n\n- 2"); + ExpectParseSuccess("Comment in block literal scalar header", + "test: | # Comment \n No Comment\ntest 2: | # Void"); + ExpectParseSuccess("Chomping indicators in block literal scalar header", + "test: |- \n Hello\n\ntest 2: |+ \n\n World\n\n\n"); + ExpectParseSuccess("Indent indicators in block literal scalar header", + "test: |1 \n \n Hello \n World\n"); + ExpectParseSuccess("Chomping and indent indicators in block literals", + "test: |-1\n Hello\ntest 2: |9+\n World"); + ExpectParseSuccess("Trailing comments in block literals", + "test: |\n Content\n # Trailing\n #Comment\ntest 2: 3"); + ExpectParseError("Invalid block scalar header", "test: | failure"); + ExpectParseError("Invalid line indentation", "test: |\n First line\n Error"); + ExpectParseError("Long leading space line", "test: |\n \n Test\n"); +} + TEST(YAMLParser, HandlesEndOfFileGracefully) { ExpectParseError("In string starting with EOF", "[\""); ExpectParseError("In string hitting EOF", "[\" "); Index: utils/yaml-bench/YAMLBench.cpp =================================================================== --- utils/yaml-bench/YAMLBench.cpp +++ utils/yaml-bench/YAMLBench.cpp @@ -96,6 +96,8 @@ SmallString<32> Storage; StringRef Val = sn->getValue(Storage); outs() << prettyTag(n) << " \"" << yaml::escape(Val) << "\""; + } else if (yaml::BlockScalarNode *BN = dyn_cast<yaml::BlockScalarNode>(n)) { + outs() << prettyTag(n) << " \"" << yaml::escape(BN->getValue()) << "\""; } else if (yaml::SequenceNode *sn = dyn_cast<yaml::SequenceNode>(n)) { outs() << prettyTag(n) << " [\n"; ++Indent;