diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
@@ -0,0 +1,100 @@
+//===- Markup.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the log symbolizer markup data model and parser.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+#define LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+
+#include <utility>
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Regex.h"
+
+namespace llvm {
+namespace symbolize {
+
+/// An element of symbolizer markup.
+///
+/// If only the Text field is set, this represents a region of text outside a
+/// markup element. ANSI SGR control codes are also reported this way; if
+/// detected, then the control code will be the entirety of the Text field, and
+/// any surrounding text will be reported as preceding and following elements.
+struct MarkupElement {
+  /// The full text of this element in the input.
+  StringRef Text;
+
+  /// If this represents a tag, the tag itself. Otherwise, empty.
+  StringRef Tag;
+
+  /// If this represents a tag with fields, a list of the field contents.
+  /// Otherwise, empty.
+  SmallVector<StringRef> Fields;
+
+  bool operator==(const MarkupElement &Other) const {
+    return Text == Other.Text && Tag == Other.Tag && Fields == Other.Fields;
+  }
+  bool operator!=(const MarkupElement &Other) const {
+    return !(*this == Other);
+  }
+};
+
+/// Parses a log containing symbolizer markup into a sequence of elements.
+class MarkupParser {
+public:
+  MarkupParser();
+
+  /// Parses an individual \p Line of input.
+  ///
+  /// After parseLine() is called, it must not be called again until
+  /// nextElement() returns None. The markup elements returned by nextElement()
+  /// may reference the input string, so it must be retained by the caller until
+  /// the last use.
+  void parseLine(StringRef Line);
+
+  /// Returns the next element in the input sequence.
+  ///
+  /// This is either a markup element or a region of text. The next call to
+  /// nextElement() may invalidate the contents of the element returned by the
+  /// previous call.
+  ///
+  /// \returns the next markup element or None if none remain.
+  Optional<MarkupElement> nextElement() {
+    if (!NextIdx)
+      NextIdx = 0;
+    if (*NextIdx == Buffer.size()) {
+      NextIdx.reset();
+      Buffer.clear();
+      return None;
+    }
+    return std::move(Buffer[(*NextIdx)++]);
+  }
+
+private:
+  Optional<MarkupElement> parseElement(StringRef Line);
+  void parseTextOutsideMarkup(StringRef Text);
+
+  // Buffer for elements parsed from the current line.
+  SmallVector<MarkupElement> Buffer;
+
+  // Next buffer index to return or None if nextElement has not yet been called.
+  Optional<size_t> NextIdx;
+
+  // Regular expression matching supported ANSI SGR escape sequences.
+  const Regex SGRSyntax;
+};
+
+} // end namespace symbolize
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
diff --git a/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt b/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
--- a/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMSymbolize
   DIFetcher.cpp
   DIPrinter.cpp
+  Markup.cpp
   SymbolizableObjectFile.cpp
   Symbolize.cpp
 
diff --git a/llvm/lib/DebugInfo/Symbolize/Markup.cpp b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
@@ -0,0 +1,113 @@
+//===- lib/DebugInfo/Symbolize/Markup.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the log symbolizer markup data model and parser.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/Markup.h"
+
+#include "llvm/ADT/StringExtras.h"
+
+namespace llvm {
+namespace symbolize {
+
+// Matches the following:
+//   "\033[0m"
+//   "\033[1m"
+//   "\033[30m" -- "\033[37m"
+static const char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m";
+
+MarkupParser::MarkupParser() : SGRSyntax(SGRSyntaxStr) {}
+
+static StringRef takeTo(StringRef Str, StringRef::iterator Pos) {
+  return Str.take_front(Pos - Str.begin());
+}
+static void advanceTo(StringRef &Str, StringRef::iterator Pos) {
+  Str = Str.drop_front(Pos - Str.begin());
+}
+
+void MarkupParser::parseLine(StringRef Line) {
+  assert(Buffer.empty() &&
+         "Cannot call parseLine before all elements have been extracted.");
+  while (!Line.empty()) {
+    // Find the first valid markup element, if any.
+    if (Optional<MarkupElement> Element = parseElement(Line)) {
+      parseTextOutsideMarkup(takeTo(Line, Element->Text.begin()));
+      advanceTo(Line, Element->Text.end()); // Read Text before the move below.
+      Buffer.push_back(std::move(*Element));
+    } else {
+      // The line doesn't contain any more markup elements, so emit it as text.
+      parseTextOutsideMarkup(Line);
+      return;
+    }
+  }
+}
+
+// Finds and returns the next valid markup element in the given line. Returns
+// None if the line contains no valid elements.
+Optional<MarkupElement> MarkupParser::parseElement(StringRef Line) {
+  while (true) {
+    // Find next element using begin and end markers.
+    size_t BeginPos = Line.find("{{{");
+    if (BeginPos == StringRef::npos)
+      return None;
+    size_t EndPos = Line.find("}}}", BeginPos + 3);
+    if (EndPos == StringRef::npos)
+      return None;
+    EndPos += 3;
+    MarkupElement Element;
+    Element.Text = Line.slice(BeginPos, EndPos);
+    Line = Line.substr(EndPos);
+
+    // Parse tag.
+    StringRef Content = Element.Text.drop_front(3).drop_back(3);
+    StringRef FieldsContent;
+    std::tie(Element.Tag, FieldsContent) = Content.split(':');
+    if (Element.Tag.empty())
+      continue;
+    if (!llvm::all_of(Element.Tag, [](char C) { return 'a' <= C && C <= 'z'; }))
+      continue;
+
+    // Parse fields.
+    if (!FieldsContent.empty())
+      FieldsContent.split(Element.Fields, ":");
+    else if (Content.back() == ':')
+      Element.Fields.push_back(FieldsContent);
+
+    return Element;
+  }
+}
+
+static MarkupElement textElement(StringRef Text) {
+  MarkupElement Element;
+  Element.Text = Text;
+  return Element;
+}
+
+// Parses a region of text known to be outside any markup elements. SGR control
+// codes within it are emitted as separate elements from the surrounding text.
+void MarkupParser::parseTextOutsideMarkup(StringRef Text) {
+  if (Text.empty())
+    return;
+  SmallVector<StringRef> Matches;
+  while (SGRSyntax.match(Text, &Matches)) {
+    // Emit any text before the SGR element.
+    if (Matches.begin()->begin() != Text.begin())
+      Buffer.push_back(textElement(takeTo(Text, Matches.begin()->begin())));
+
+    Buffer.push_back(textElement(*Matches.begin()));
+    advanceTo(Text, Matches.begin()->end());
+  }
+  if (!Text.empty())
+    Buffer.push_back(textElement(Text));
+}
+
+} // end namespace symbolize
+} // end namespace llvm
diff --git a/llvm/unittests/DebugInfo/CMakeLists.txt b/llvm/unittests/DebugInfo/CMakeLists.txt
--- a/llvm/unittests/DebugInfo/CMakeLists.txt
+++ b/llvm/unittests/DebugInfo/CMakeLists.txt
@@ -3,3 +3,4 @@
 add_subdirectory(GSYM)
 add_subdirectory(MSF)
 add_subdirectory(PDB)
+add_subdirectory(Symbolizer)
diff --git a/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt b/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(LLVM_LINK_COMPONENTS Symbolize)
+add_llvm_unittest(DebugInfoSymbolizerTests MarkupTest.cpp)
+target_link_libraries(DebugInfoSymbolizerTests PRIVATE LLVMTestingSupport)
diff --git
a/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
@@ -0,0 +1,159 @@
+//===- unittest/DebugInfo/Symbolizer/MarkupTest.cpp - Markup parser tests -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/Markup.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using namespace llvm;
+using namespace llvm::symbolize;
+using namespace testing;
+
+// Returns a matcher for a MarkupElement with the given Text and Tag whose
+// Fields satisfy the given matcher. By default, Tag and Fields must be empty.
+Matcher<MarkupElement>
+isElement(StringRef Text, StringRef Tag = "",
+          Matcher<SmallVector<StringRef>> Fields = IsEmpty()) {
+  return AllOf(Field("Text", &MarkupElement::Text, Text),
+               Field("Tag", &MarkupElement::Tag, Tag),
+               Field("Fields", &MarkupElement::Fields, Fields));
+}
+
+// A parser that has been given no input yields no elements.
+TEST(SymbolizerMarkup, NoLines) {
+  EXPECT_EQ(MarkupParser{}.nextElement(), None);
+}
+
+// Malformed or unsupported markup and SGR codes are passed through as text.
+TEST(SymbolizerMarkup, LinesWithoutMarkup) {
+  MarkupParser Parser;
+
+  Parser.parseLine("text");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("text")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{}}")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{}}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{}}}")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tag:");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{tag:")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tag:}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{tag:}}")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tag:field}}");
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{tag:field}}")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{t2g}}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{t2g}}}")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tAg}}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{tAg}}}")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("a\033[2mb");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("a\033[2mb")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("a\033[38mb");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("a\033[38mb")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("a\033[4mb");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("a\033[4mb")));
+  EXPECT_THAT(Parser.nextElement(), None);
+}
+
+// Valid elements and supported SGR codes are split out from surrounding text.
+TEST(SymbolizerMarkup, LinesWithMarkup) {
+  MarkupParser Parser;
+
+  Parser.parseLine("{{{tag}}}");
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tag:f1:f2:f3}}}");
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{tag:f1:f2:f3}}}", "tag",
+                                          ElementsAre("f1", "f2", "f3"))));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tag:}}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement(
+                                        "{{{tag:}}}", "tag", ElementsAre(""))));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("a{{{b}}}c{{{d}}}e");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("a")));
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{b}}}", "b")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("c")));
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{d}}}", "d")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("e")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{}}}{{{tag}}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{}}}")));
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{t2g}}}{{{tag}}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{t2g}}}")));
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tAg}}}{{{tag}}}");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("{{{tAg}}}")));
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("\033[0mA\033[1mB\033[30mC\033[37m");
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("\033[0m")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("A")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("\033[1m")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("B")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("\033[30m")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("C")));
+  EXPECT_THAT(Parser.nextElement(), testing::Optional(isElement("\033[37m")));
+  EXPECT_THAT(Parser.nextElement(), None);
+
+  Parser.parseLine("{{{tag:\033[0m}}}");
+  EXPECT_THAT(Parser.nextElement(),
+              testing::Optional(isElement("{{{tag:\033[0m}}}", "tag",
+                                          ElementsAre("\033[0m"))));
+  EXPECT_THAT(Parser.nextElement(), None);
+}
+
+} // namespace