diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3509,6 +3509,35 @@ source_filename = "/path/to/source.c" +.. _structured_data: + +Structured Data +--------------- + +Dictionaries of key-value pairs are used in some cases to represent data in an +easily extendable, human-readable manner. + +The labels used in key-value pairs are identifiers followed immediately by a +colon (':'), like the label of a named basic block. + +:Syntax: + +:: + + sdata ::= '{' (sdata_field ',')* sdata_field? '}' + sdata_field ::= label sdata_value + sdata_value ::= 'type' type + ::= 'iN' integer + ::= 'i1' 'true' | 'i1' 'false' + +:Examples: + +:: + + {} + { layout: type float, } + { foo: i1 true, bar: i32 10 } + .. _typesystem: Type System diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -20,6 +20,7 @@ #include "llvm/IR/FMF.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/IR/StructuredData.h" #include #include @@ -165,6 +166,8 @@ // Map of module ID to path. std::map ModuleIdMap; + sdata::SymbolTableLockGuard SymbolTableLock; + /// Only the llvm-as tool may set this to false to bypass /// UpgradeDebuginfo so it can generate broken bitcode. 
bool UpgradeDebugInfo; @@ -558,6 +561,10 @@ bool parseGlobalObjectMetadataAttachment(GlobalObject &GO); bool parseOptionalFunctionMetadata(Function &F); + bool parseStructuredData( + function_ref + ParseField); + template bool parseMDField(LocTy Loc, StringRef Name, FieldTy &Result); template bool parseMDField(StringRef Name, FieldTy &Result); diff --git a/llvm/include/llvm/IR/StructuredData.h b/llvm/include/llvm/IR/StructuredData.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/IR/StructuredData.h @@ -0,0 +1,212 @@ +//===- llvm/IR/StructuredData.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides structured data objects that are used as an intermediate +// abstraction for (de)serializing extensible IR objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_STRUCTUREDDATA_H +#define LLVM_IR_STRUCTUREDDATA_H + +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +namespace llvm { + +class LLVMContext; +class MDNode; +class Type; + +namespace sdata { + +class RegisterSymbol; +class SymbolTableLockGuard; + +/// A symbol is a unique'd well-known string, like the key of a field in a +/// structured data dictionary, or the name of an enum value. +/// +/// Use @ref RegisterSymbol to register symbol names. +/// +/// WARNING: Do not use symbols for user-provided strings. Should the need to +/// store strings in structured data arise, an explicit string type +/// should be added to @ref Value. 
+class Symbol {
+private:
+  friend class RegisterSymbol;
+  friend class SymbolTableLockGuard;
+
+  /// Id handed out on registration; 0 marks a symbol that is not registered.
+  unsigned Id = 0;
+  StringRef String;
+
+public:
+  Symbol() = default;
+  Symbol(const RegisterSymbol &RS);
+
+  StringRef getAsString() const { return String; }
+
+  bool operator==(const RegisterSymbol &RHS) const;
+  bool operator!=(const RegisterSymbol &RHS) const { return !(*this == RHS); }
+
+  /// Compare by id when both symbols are registered, by string otherwise.
+  bool operator==(const Symbol &RHS) const {
+    if (Id != 0 && RHS.Id != 0)
+      return Id == RHS.Id;
+    return String == RHS.String;
+  }
+  bool operator!=(const Symbol &RHS) const { return !(*this == RHS); }
+};
+
+/// Register a constant known string as a "symbol" for use in structured data.
+///
+/// Symbols must be registered before creating/reading structured data that
+/// uses them.
+///
+/// Symbols are registered and unique'd globally. They should be constructed
+/// lazily with a static lifetime as needed, e.g. using the function-local
+/// static variable pattern below.
+///
+/// Example:
+/// @code
+///   struct MySymbols {
+///     sdata::RegisterSymbol MyKeyword{"mykeyword"};
+///     sdata::RegisterSymbol Foo{"foo"};
+///     sdata::RegisterSymbol Bar{"bar"};
+///     // ...
+///
+///     static MySymbols &get() {
+///       static MySymbols S;
+///       return S;
+///     }
+///   };
+///
+///   void registerMySymbols() {
+///     (void)MySymbols::get();
+///   }
+/// @endcode
+class RegisterSymbol {
+public:
+  explicit RegisterSymbol(StringRef Str);
+
+  /// Return the symbol that was created by the registration.
+  Symbol get() const { return S; }
+
+private:
+  Symbol S;
+};
+
+/// Thread-safe access to the table of registered symbols.
+///
+/// A read lock on the symbol table is held for the life-time of this object.
+///
+/// WARNING: This mechanism should *only* be used by the IR parser and bitcode
+///          reader! Everything else should use @ref RegisterSymbol instead.
+class SymbolTableLockGuard {
+public:
+  SymbolTableLockGuard();
+  ~SymbolTableLockGuard();
+
+  // Not copyable: the destructor of a copy would release a read lock that the
+  // copy never acquired.
+  SymbolTableLockGuard(const SymbolTableLockGuard &) = delete;
+  SymbolTableLockGuard &operator=(const SymbolTableLockGuard &) = delete;
+
+  /// Look up \p String among the registered symbols. Names that were never
+  /// registered still yield a symbol; it carries the string but no id.
+  Symbol getSymbol(LLVMContext &Context, StringRef String) const;
+};
+
+/// A value of structured data.
+class Value {
+private:
+  // NOTE(review): the template argument list was lost from this copy of the
+  // patch and has been reconstructed from the accessors below; confirm the
+  // exact list of alternatives (in particular std::monostate) against the
+  // upstream header.
+  using Storage = std::variant<std::monostate, Type *, APInt>;
+
+  Storage S;
+
+public:
+  Value() = default;
+  explicit Value(Type *T) : S(T) { assert(T); }
+  explicit Value(bool B) : S(APInt(1, B ? 1 : 0)) {}
+  explicit Value(APInt I) : S(I) {}
+
+  Value &operator=(Type *T) {
+    assert(T);
+    S = T;
+    return *this;
+  }
+  Value &operator=(bool B) {
+    S = APInt(1, B ? 1 : 0);
+    return *this;
+  }
+  Value &operator=(APInt I) {
+    S = I;
+    return *this;
+  }
+
+  bool isAPInt() const { return std::holds_alternative<APInt>(S); }
+  /// A boolean is represented as an APInt of bit width 1.
+  bool isBool() const {
+    return isAPInt() && std::get<APInt>(S).getBitWidth() == 1;
+  }
+  bool isType() const { return std::holds_alternative<Type *>(S); }
+
+  const APInt &getAPInt() const {
+    assert(isAPInt());
+    return std::get<APInt>(S);
+  }
+  bool getBool() const {
+    assert(isBool());
+    return std::get<APInt>(S).getZExtValue() != 0;
+  }
+  Type *getType() const {
+    assert(isType());
+    return std::get<Type *>(S);
+  }
+};
+
+/// Describes the "schema" of a field of structured data.
+///
+/// This is used to describe structures for bitcode abbreviation.
+class SchemaField {
+public:
+  enum class Type {
+    /// Fixed-width APInt (possibly a boolean). TypeData is the number of bits.
+    Int,
+
+    /// LLVM type
+    Type,
+  };
+
+private:
+  Symbol TheKey;
+  Type TheType;
+  unsigned TypeData;
+
+public:
+  /// \p TD is the bit width and must be non-zero when \p T is Type::Int.
+  SchemaField(Symbol K, Type T, unsigned TD = 0)
+      : TheKey(K), TheType(T), TypeData(TD) {
+    assert((T != Type::Int || TD != 0) &&
+           "integer schema types must have a bit width");
+  }
+
+  Symbol getKey() const { return TheKey; }
+  Type getType() const { return TheType; }
+  unsigned getTypeBitWidth() const {
+    assert(TheType == Type::Int);
+    return TypeData;
+  }
+};
+
+// Convenience function to create an Error object when an error is encountered
+// while deserializing structured data.
+Error makeDeserializeError(const Twine &Msg);
+
+inline Symbol::Symbol(const RegisterSymbol &RS) { *this = RS.get(); }
+
+inline bool Symbol::operator==(const RegisterSymbol &RHS) const {
+  // Use the assumption that symbols are registered before structured data is
+  // created.
+  return Id == RHS.get().Id;
+}
+
+} // end namespace sdata
+
+} // end namespace llvm
+
+#endif
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4190,6 +4190,86 @@
   return false;
 }
 
+/// parseStructuredData
+///   ::= '{' (key value (',' key value)*)? ','? '}'
+///
+/// value ::= 'type' type
+///       ::= 'i1' 'true' | 'i1' 'false'
+///       ::= 'iN' integer
+///
+/// ParseField is invoked once per field with the location and symbol of the
+/// key followed by the location and parsed value. Parsing stops and true is
+/// returned as soon as ParseField returns true.
+bool LLParser::parseStructuredData(
+    function_ref<bool(LocTy, sdata::Symbol, LocTy, sdata::Value)> ParseField) {
+  if (parseToken(lltok::lbrace, "expected '{' here"))
+    return true;
+
+  while (Lex.getKind() != lltok::rbrace) {
+    if (Lex.getKind() != lltok::LabelStr)
+      return tokError("expected '}' or field label here");
+
+    LocTy KeyLoc = Lex.getLoc();
+    sdata::Symbol Key = SymbolTableLock.getSymbol(Context, Lex.getStrVal());
+    Lex.Lex();
+
+    LocTy ValueLoc = Lex.getLoc();
+    sdata::Value V;
+    switch (Lex.getKind()) {
+    case lltok::kw_type: {
+      Lex.Lex(); // eat 'type'
+
+      Type *T = nullptr;
+      if (parseType(T, /*AllowVoid=*/true))
+        return true;
+
+      V = sdata::Value(T);
+      break;
+    }
+    case lltok::Type: {
+      Type *Ty = Lex.getTyVal();
+      if (auto *IntTy = dyn_cast<IntegerType>(Ty)) {
+        Lex.Lex();
+
+        switch (Lex.getKind()) {
+        case lltok::APSInt:
+          V = sdata::Value(Lex.getAPSIntVal().extOrTrunc(IntTy->getBitWidth()));
+          Lex.Lex();
+          break;
+        case lltok::kw_true:
+        case lltok::kw_false:
+          if (IntTy->getBitWidth() != 1)
+            return tokError("true/false can only be used with i1");
+          V = sdata::Value(Lex.getKind() == lltok::kw_true);
+          Lex.Lex();
+          break;
+        default:
+          return tokError("expected an integer value");
+        }
+
+        break;
+      }
+
+      return tokError("only integer types are supported in structured data");
+    }
+
+    default:
+      return tokError("expected structured data value");
+    }
+
+    if (ParseField(KeyLoc, Key, ValueLoc, V))
+      return true;
+
+    // A field is followed either by the closing brace or by a comma; the
+    // comma may itself be followed directly by the closing brace.
+    if (Lex.getKind() == lltok::rbrace)
+      break;
+    if (parseToken(lltok::comma, "expected ',' or '}' here"))
+      return true;
+  }
+
+  Lex.Lex(); // eat the '}'
+  return false;
+}
+
 bool LLParser::parseMDTuple(MDNode *&MD, bool IsDistinct) {
   SmallVector<Metadata *, 16> Elts;
   if (parseMDNodeVector(Elts))
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/StructuredData.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/TypeFinder.h"
 #include "llvm/IR/TypedPointerType.h"
@@ -2633,6 +2634,9 @@
   printConstVCalls(const std::vector<FunctionSummary::ConstVCall> &VCallList,
                    const char *Tag);
 
+  void
+  printStructuredData(ArrayRef<std::pair<sdata::Symbol, sdata::Value>> Fields);
+
 private:
   /// Print out metadata attachments.
   void printMetadataAttachments(
@@ -4583,6 +4587,32 @@
     printUseListOrder(Pair.first, Pair.second);
 }
 
+/// Print \p Fields as a structured data dictionary: an opening brace, one
+/// "key: value," line per field, and a closing brace.
+void AssemblyWriter::printStructuredData(
+    ArrayRef<std::pair<sdata::Symbol, sdata::Value>> Fields) {
+  Out << "{\n";
+  for (const auto &[Key, Val] : Fields) {
+    Out << " " << Key.getAsString() << ": ";
+    if (Val.isBool()) {
+      if (Val.getBool())
+        Out << "i1 true";
+      else
+        Out << "i1 false";
+    } else if (Val.isAPInt()) {
+      const APInt &I = Val.getAPInt();
+      Out << 'i' << I.getBitWidth() << ' ' << I;
+    } else if (Val.isType()) {
+      Out << "type ";
+      TypePrinter.print(Val.getType(), Out);
+    } else {
+      llvm_unreachable("unhandled sdata::Value type");
+    }
+    Out << ",\n";
+  }
+  Out << "}\n";
+}
+
 //===----------------------------------------------------------------------===//
 // External Interface declarations
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
---
a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -57,6 +57,7 @@
   ReplaceConstant.cpp
   Statepoint.cpp
   StructuralHash.cpp
+  StructuredData.cpp
   Type.cpp
   TypedPointerType.cpp
   TypeFinder.cpp
diff --git a/llvm/lib/IR/StructuredData.cpp b/llvm/lib/IR/StructuredData.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/IR/StructuredData.cpp
@@ -0,0 +1,106 @@
+//===- StructuredData.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/StructuredData.h"
+
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/RWMutex.h"
+#include "llvm/Support/StringSaver.h"
+
+using namespace llvm;
+using namespace sdata;
+
+namespace {
+
+/// Global table of registered symbols.
+///
+/// Symbol ids are 1-based indices into IdToName; id 0 is reserved for names
+/// that are not registered.
+struct SymbolTable {
+  sys::RWMutex Mutex;
+  BumpPtrAllocator Allocator;
+  StringSaver Saver{Allocator};
+  std::vector<StringRef> IdToName;
+  DenseMap<StringRef, unsigned> NameToId;
+
+  static SymbolTable &instance() {
+    static SymbolTable Map;
+    return Map;
+  }
+};
+
+enum class DeserializeErrorCode : int {
+  Generic = 1,
+};
+
+class DeserializeErrorCategory : public std::error_category {
+public:
+  const char *name() const noexcept override {
+    return "Structured Data Deserialize Error";
+  }
+
+  std::string message(int condition) const override {
+    return "Error while deserializing structured data";
+  }
+
+  static DeserializeErrorCategory &get() {
+    static DeserializeErrorCategory TheCategory;
+    return TheCategory;
+  }
+};
+
+} // anonymous namespace
+
+/// Register Str in the global symbol table under the write lock, or reuse the
+/// entry of an earlier registration of the same string.
+sdata::RegisterSymbol::RegisterSymbol(StringRef Str) {
+  SymbolTable &ST = SymbolTable::instance();
+  sys::ScopedWriter Lock(ST.Mutex);
+  auto I = ST.NameToId.find(Str);
+  if (I == ST.NameToId.end()) {
+    StringRef Saved = ST.Saver.save(Str);
+    ST.IdToName.push_back(Saved);
+    I = ST.NameToId.try_emplace(Saved, ST.IdToName.size()).first;
+  }
+
+  S.Id = I->second;
+  S.String = ST.IdToName[S.Id - 1];
+}
+
+/// Acquire a shared (read) lock on the symbol table.
+SymbolTableLockGuard::SymbolTableLockGuard() {
+  SymbolTable::instance().Mutex.lock_shared();
+}
+
+/// Release the shared lock taken by the constructor.
+SymbolTableLockGuard::~SymbolTableLockGuard() {
+  SymbolTable::instance().Mutex.unlock_shared();
+}
+
+/// Look up String in the symbol table. Unregistered names get id 0 and a copy
+/// of the string saved in the context. No lock is taken here; the read lock is
+/// the one acquired by the constructor.
+Symbol SymbolTableLockGuard::getSymbol(LLVMContext &Ctx,
+                                       StringRef String) const {
+  SymbolTable &ST = SymbolTable::instance();
+  Symbol S;
+  S.Id = ST.NameToId.lookup(String);
+  if (S.Id != 0)
+    S.String = ST.IdToName[S.Id - 1];
+  else
+    S.String = Ctx.pImpl->Saver.save(String);
+  return S;
+}
+
+Error llvm::sdata::makeDeserializeError(const Twine &Msg) {
+  return createStringError(
+      std::error_code(static_cast<int>(DeserializeErrorCode::Generic),
+                      DeserializeErrorCategory::get()),
+      Msg);
+}