Index: cfe/trunk/include/clang/AST/Expr.h =================================================================== --- cfe/trunk/include/clang/AST/Expr.h +++ cfe/trunk/include/clang/AST/Expr.h @@ -1568,98 +1568,131 @@ /// char X[2] = "foobar"; /// In this case, getByteLength() will return 6, but the string literal will /// have type "char[2]". -class StringLiteral : public Expr { +class StringLiteral final + : public Expr, + private llvm::TrailingObjects { + friend class ASTStmtReader; + friend TrailingObjects; + + /// StringLiteral is followed by several trailing objects. They are in order: + /// + /// * A single unsigned storing the length in characters of this string. The + /// length in bytes is this length times the width of a single character. + /// Always present and stored as a trailing object because storing it in + /// StringLiteral would increase the size of StringLiteral by sizeof(void *) + /// due to alignment requirements. If you add some data to StringLiteral, + /// consider moving it inside StringLiteral. + /// + /// * An array of getNumConcatenated() SourceLocation, one for each of the + /// tokens this string is made of. + /// + /// * An array of getByteLength() char used to store the string data. 
+ public: enum StringKind { Ascii, Wide, UTF8, UTF16, UTF32 }; private: - friend class ASTStmtReader; + unsigned numTrailingObjects(OverloadToken) const { return 1; } + unsigned numTrailingObjects(OverloadToken) const { + return getNumConcatenated(); + } - union { - const char *asChar; - const uint16_t *asUInt16; - const uint32_t *asUInt32; - } StrData; - unsigned Length; - unsigned CharByteWidth : 4; - unsigned Kind : 3; - unsigned IsPascal : 1; - unsigned NumConcatenated; - SourceLocation TokLocs[1]; - - StringLiteral(QualType Ty) : - Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false, - false) {} + unsigned numTrailingObjects(OverloadToken) const { + return getByteLength(); + } + + char *getStrDataAsChar() { return getTrailingObjects(); } + const char *getStrDataAsChar() const { return getTrailingObjects(); } + + const uint16_t *getStrDataAsUInt16() const { + return reinterpret_cast(getTrailingObjects()); + } + + const uint32_t *getStrDataAsUInt32() const { + return reinterpret_cast(getTrailingObjects()); + } + + /// Build a string literal. + StringLiteral(const ASTContext &Ctx, StringRef Str, StringKind Kind, + bool Pascal, QualType Ty, const SourceLocation *Loc, + unsigned NumConcatenated); + + /// Build an empty string literal. + StringLiteral(EmptyShell Empty, unsigned NumConcatenated, unsigned Length, + unsigned CharByteWidth); /// Map a target and string kind to the appropriate character width. static unsigned mapCharByteWidth(TargetInfo const &Target, StringKind SK); + /// Set the source location of one of the string literal tokens. + void setStrTokenLoc(unsigned TokNum, SourceLocation L) { + assert(TokNum < getNumConcatenated() && "Invalid tok number"); + getTrailingObjects()[TokNum] = L; + } + public: /// This is the "fully general" constructor that allows representation of /// strings formed from multiple concatenated tokens. 
- static StringLiteral *Create(const ASTContext &C, StringRef Str, + static StringLiteral *Create(const ASTContext &Ctx, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, unsigned NumStrs); + const SourceLocation *Loc, + unsigned NumConcatenated); /// Simple constructor for string literals made from one token. - static StringLiteral *Create(const ASTContext &C, StringRef Str, + static StringLiteral *Create(const ASTContext &Ctx, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, SourceLocation Loc) { - return Create(C, Str, Kind, Pascal, Ty, &Loc, 1); + return Create(Ctx, Str, Kind, Pascal, Ty, &Loc, 1); } /// Construct an empty string literal. - static StringLiteral *CreateEmpty(const ASTContext &C, unsigned NumStrs); + static StringLiteral *CreateEmpty(const ASTContext &Ctx, + unsigned NumConcatenated, unsigned Length, + unsigned CharByteWidth); StringRef getString() const { - assert(CharByteWidth==1 - && "This function is used in places that assume strings use char"); - return StringRef(StrData.asChar, getByteLength()); + assert(getCharByteWidth() == 1 && + "This function is used in places that assume strings use char"); + return StringRef(getStrDataAsChar(), getByteLength()); } /// Allow access to clients that need the byte representation, such as /// ASTWriterStmt::VisitStringLiteral(). StringRef getBytes() const { // FIXME: StringRef may not be the right type to use as a result for this. 
- if (CharByteWidth == 1) - return StringRef(StrData.asChar, getByteLength()); - if (CharByteWidth == 4) - return StringRef(reinterpret_cast(StrData.asUInt32), - getByteLength()); - assert(CharByteWidth == 2 && "unsupported CharByteWidth"); - return StringRef(reinterpret_cast(StrData.asUInt16), - getByteLength()); + return StringRef(getStrDataAsChar(), getByteLength()); } void outputString(raw_ostream &OS) const; uint32_t getCodeUnit(size_t i) const { - assert(i < Length && "out of bounds access"); - if (CharByteWidth == 1) - return static_cast(StrData.asChar[i]); - if (CharByteWidth == 4) - return StrData.asUInt32[i]; - assert(CharByteWidth == 2 && "unsupported CharByteWidth"); - return StrData.asUInt16[i]; - } - - unsigned getByteLength() const { return CharByteWidth*Length; } - unsigned getLength() const { return Length; } - unsigned getCharByteWidth() const { return CharByteWidth; } - - /// Sets the string data to the given string data. - void setString(const ASTContext &C, StringRef Str, - StringKind Kind, bool IsPascal); - - StringKind getKind() const { return static_cast(Kind); } - - - bool isAscii() const { return Kind == Ascii; } - bool isWide() const { return Kind == Wide; } - bool isUTF8() const { return Kind == UTF8; } - bool isUTF16() const { return Kind == UTF16; } - bool isUTF32() const { return Kind == UTF32; } - bool isPascal() const { return IsPascal; } + assert(i < getLength() && "out of bounds access"); + switch (getCharByteWidth()) { + case 1: + return static_cast(getStrDataAsChar()[i]); + case 2: + return getStrDataAsUInt16()[i]; + case 4: + return getStrDataAsUInt32()[i]; + } + llvm_unreachable("Unsupported character width!"); + } + + unsigned getByteLength() const { return getCharByteWidth() * getLength(); } + unsigned getLength() const { return *getTrailingObjects(); } + unsigned getCharByteWidth() const { return StringLiteralBits.CharByteWidth; } + + StringKind getKind() const { + return static_cast(StringLiteralBits.Kind); + } + + bool 
isAscii() const { return getKind() == Ascii; } + bool isWide() const { return getKind() == Wide; } + bool isUTF8() const { return getKind() == UTF8; } + bool isUTF16() const { return getKind() == UTF16; } + bool isUTF32() const { return getKind() == UTF32; } + bool isPascal() const { return StringLiteralBits.IsPascal; } bool containsNonAscii() const { for (auto c : getString()) @@ -1677,15 +1710,14 @@ /// getNumConcatenated - Get the number of string literal tokens that were /// concatenated in translation phase #6 to form this string literal. - unsigned getNumConcatenated() const { return NumConcatenated; } + unsigned getNumConcatenated() const { + return StringLiteralBits.NumConcatenated; + } + /// Get the source location of one of the string literal tokens. SourceLocation getStrTokenLoc(unsigned TokNum) const { - assert(TokNum < NumConcatenated && "Invalid tok number"); - return TokLocs[TokNum]; - } - void setStrTokenLoc(unsigned TokNum, SourceLocation L) { - assert(TokNum < NumConcatenated && "Invalid tok number"); - TokLocs[TokNum] = L; + assert(TokNum < getNumConcatenated() && "Invalid tok number"); + return getTrailingObjects()[TokNum]; } /// getLocationOfByte - Return a source location that points to the specified @@ -1702,14 +1734,18 @@ unsigned *StartTokenByteOffset = nullptr) const; typedef const SourceLocation *tokloc_iterator; - tokloc_iterator tokloc_begin() const { return TokLocs; } - tokloc_iterator tokloc_end() const { return TokLocs + NumConcatenated; } - SourceLocation getBeginLoc() const LLVM_READONLY { return TokLocs[0]; } - SourceLocation getEndLoc() const LLVM_READONLY { - return TokLocs[NumConcatenated - 1]; + tokloc_iterator tokloc_begin() const { + return getTrailingObjects(); + } + + tokloc_iterator tokloc_end() const { + return getTrailingObjects() + getNumConcatenated(); } + SourceLocation getBeginLoc() const LLVM_READONLY { return *tokloc_begin(); } + SourceLocation getEndLoc() const LLVM_READONLY { return *(tokloc_end() - 1); } + static bool classof(const 
Stmt *T) { return T->getStmtClass() == StringLiteralClass; } Index: cfe/trunk/include/clang/AST/Stmt.h =================================================================== --- cfe/trunk/include/clang/AST/Stmt.h +++ cfe/trunk/include/clang/AST/Stmt.h @@ -366,6 +366,28 @@ unsigned IsExact : 1; }; + class StringLiteralBitfields { + friend class ASTStmtReader; + friend class StringLiteral; + + unsigned : NumExprBits; + + /// The kind of this string literal. + /// One of the enumeration values of StringLiteral::StringKind. + unsigned Kind : 3; + + /// The width of a single character in bytes. Only values of 1, 2, + /// and 4 bytes are supported. StringLiteral::mapCharByteWidth maps + /// the target + string kind to the appropriate CharByteWidth. + unsigned CharByteWidth : 3; + + unsigned IsPascal : 1; + + /// The number of concatenated tokens this string is made of. + /// This is the number of trailing SourceLocation. + unsigned NumConcatenated; + }; + class CharacterLiteralBitfields { friend class CharacterLiteral; @@ -566,6 +588,7 @@ PredefinedExprBitfields PredefinedExprBits; DeclRefExprBitfields DeclRefExprBits; FloatingLiteralBitfields FloatingLiteralBits; + StringLiteralBitfields StringLiteralBits; CharacterLiteralBitfields CharacterLiteralBits; UnaryOperatorBitfields UnaryOperatorBits; UnaryExprOrTypeTraitExprBitfields UnaryExprOrTypeTraitExprBits; Index: cfe/trunk/lib/AST/Expr.cpp =================================================================== --- cfe/trunk/lib/AST/Expr.cpp +++ cfe/trunk/lib/AST/Expr.cpp @@ -912,42 +912,80 @@ return CharByteWidth; } -StringLiteral *StringLiteral::Create(const ASTContext &C, StringRef Str, - StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, - unsigned NumStrs) { - assert(C.getAsConstantArrayType(Ty) && +StringLiteral::StringLiteral(const ASTContext &Ctx, StringRef Str, + StringKind Kind, bool Pascal, QualType Ty, + const SourceLocation *Loc, + unsigned NumConcatenated) + : Expr(StringLiteralClass, Ty, 
VK_LValue, OK_Ordinary, false, false, false, + false) { + assert(Ctx.getAsConstantArrayType(Ty) && "StringLiteral must be of constant array type!"); + unsigned CharByteWidth = mapCharByteWidth(Ctx.getTargetInfo(), Kind); + unsigned ByteLength = Str.size(); + assert((ByteLength % CharByteWidth == 0) && + "The size of the data must be a multiple of CharByteWidth!"); + + // Avoid the expensive division. The compiler should be able to figure it + // out by itself. However as of clang 7, even with the appropriate + // llvm_unreachable added just here, it is not able to do so. + unsigned Length; + switch (CharByteWidth) { + case 1: + Length = ByteLength; + break; + case 2: + Length = ByteLength / 2; + break; + case 4: + Length = ByteLength / 4; + break; + default: + llvm_unreachable("Unsupported character width!"); + } - // Allocate enough space for the StringLiteral plus an array of locations for - // any concatenated string tokens. - void *Mem = - C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1), - alignof(StringLiteral)); - StringLiteral *SL = new (Mem) StringLiteral(Ty); + StringLiteralBits.Kind = Kind; + StringLiteralBits.CharByteWidth = CharByteWidth; + StringLiteralBits.IsPascal = Pascal; + StringLiteralBits.NumConcatenated = NumConcatenated; + *getTrailingObjects() = Length; - // OPTIMIZE: could allocate this appended to the StringLiteral. - SL->setString(C,Str,Kind,Pascal); + // Initialize the trailing array of SourceLocation. + // This is safe since SourceLocation is POD-like. + std::memcpy(getTrailingObjects(), Loc, + NumConcatenated * sizeof(SourceLocation)); - SL->TokLocs[0] = Loc[0]; - SL->NumConcatenated = NumStrs; + // Initialize the trailing array of char holding the string data. 
+ std::memcpy(getTrailingObjects(), Str.data(), ByteLength); +} - if (NumStrs != 1) - memcpy(&SL->TokLocs[1], Loc+1, sizeof(SourceLocation)*(NumStrs-1)); - return SL; +StringLiteral::StringLiteral(EmptyShell Empty, unsigned NumConcatenated, + unsigned Length, unsigned CharByteWidth) + : Expr(StringLiteralClass, Empty) { + StringLiteralBits.CharByteWidth = CharByteWidth; + StringLiteralBits.NumConcatenated = NumConcatenated; + *getTrailingObjects() = Length; } -StringLiteral *StringLiteral::CreateEmpty(const ASTContext &C, - unsigned NumStrs) { - void *Mem = - C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1), - alignof(StringLiteral)); - StringLiteral *SL = - new (Mem) StringLiteral(C.adjustStringLiteralBaseType(QualType())); - SL->CharByteWidth = 0; - SL->Length = 0; - SL->NumConcatenated = NumStrs; - return SL; +StringLiteral *StringLiteral::Create(const ASTContext &Ctx, StringRef Str, + StringKind Kind, bool Pascal, QualType Ty, + const SourceLocation *Loc, + unsigned NumConcatenated) { + void *Mem = Ctx.Allocate(totalSizeToAlloc( + 1, NumConcatenated, Str.size()), + alignof(StringLiteral)); + return new (Mem) + StringLiteral(Ctx, Str, Kind, Pascal, Ty, Loc, NumConcatenated); +} + +StringLiteral *StringLiteral::CreateEmpty(const ASTContext &Ctx, + unsigned NumConcatenated, + unsigned Length, + unsigned CharByteWidth) { + void *Mem = Ctx.Allocate(totalSizeToAlloc( + 1, NumConcatenated, Length * CharByteWidth), + alignof(StringLiteral)); + return new (Mem) + StringLiteral(EmptyShell(), NumConcatenated, Length, CharByteWidth); } void StringLiteral::outputString(raw_ostream &OS) const { @@ -1046,42 +1084,6 @@ OS << '"'; } -void StringLiteral::setString(const ASTContext &C, StringRef Str, - StringKind Kind, bool IsPascal) { - //FIXME: we assume that the string data comes from a target that uses the same - // code unit size and endianness for the type of string. 
- this->Kind = Kind; - this->IsPascal = IsPascal; - - CharByteWidth = mapCharByteWidth(C.getTargetInfo(),Kind); - assert((Str.size()%CharByteWidth == 0) - && "size of data must be multiple of CharByteWidth"); - Length = Str.size()/CharByteWidth; - - switch(CharByteWidth) { - case 1: { - char *AStrData = new (C) char[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asChar = AStrData; - break; - } - case 2: { - uint16_t *AStrData = new (C) uint16_t[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asUInt16 = AStrData; - break; - } - case 4: { - uint32_t *AStrData = new (C) uint32_t[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asUInt32 = AStrData; - break; - } - default: - llvm_unreachable("unsupported CharByteWidth"); - } -} - /// getLocationOfByte - Return a source location that points to the specified /// byte of this string literal. /// Index: cfe/trunk/lib/Serialization/ASTReaderStmt.cpp =================================================================== --- cfe/trunk/lib/Serialization/ASTReaderStmt.cpp +++ cfe/trunk/lib/Serialization/ASTReaderStmt.cpp @@ -595,22 +595,35 @@ void ASTStmtReader::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - unsigned Len = Record.readInt(); - assert(Record.peekInt() == E->getNumConcatenated() && - "Wrong number of concatenated tokens!"); - Record.skipInts(1); - auto kind = static_cast(Record.readInt()); - bool isPascal = Record.readInt(); - // Read string data - auto B = &Record.peekInt(); - SmallString<16> Str(B, B + Len); - E->setString(Record.getContext(), Str, kind, isPascal); - Record.skipInts(Len); + // NumConcatenated, Length and CharByteWidth are set by the empty + // ctor since they are needed to allocate storage for the trailing objects. 
+ unsigned NumConcatenated = Record.readInt(); + unsigned Length = Record.readInt(); + unsigned CharByteWidth = Record.readInt(); + assert((NumConcatenated == E->getNumConcatenated()) && + "Wrong number of concatenated tokens!"); + assert((Length == E->getLength()) && "Wrong Length!"); + assert((CharByteWidth == E->getCharByteWidth()) && "Wrong character width!"); + E->StringLiteralBits.Kind = Record.readInt(); + E->StringLiteralBits.IsPascal = Record.readInt(); + + // The character width is originally computed via mapCharByteWidth. + // Check that the deserialized character width is consistent with the result + // of calling mapCharByteWidth. + assert((CharByteWidth == + StringLiteral::mapCharByteWidth(Record.getContext().getTargetInfo(), + E->getKind())) && + "Wrong character width!"); - // Read source locations - for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) + // Deserialize the trailing array of SourceLocation. + for (unsigned I = 0; I < NumConcatenated; ++I) E->setStrTokenLoc(I, ReadSourceLocation()); + + // Deserialize the trailing array of char holding the string data. 
+ char *StrData = E->getStrDataAsChar(); + for (unsigned I = 0; I < Length * CharByteWidth; ++I) + StrData[I] = Record.readInt(); } void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) { @@ -2423,8 +2436,11 @@ break; case EXPR_STRING_LITERAL: - S = StringLiteral::CreateEmpty(Context, - Record[ASTStmtReader::NumExprFields + 1]); + S = StringLiteral::CreateEmpty( + Context, + /* NumConcatenated=*/Record[ASTStmtReader::NumExprFields + 0], + /* Length=*/Record[ASTStmtReader::NumExprFields + 1], + /* CharByteWidth=*/Record[ASTStmtReader::NumExprFields + 2]); break; case EXPR_CHARACTER_LITERAL: Index: cfe/trunk/lib/Serialization/ASTWriterStmt.cpp =================================================================== --- cfe/trunk/lib/Serialization/ASTWriterStmt.cpp +++ cfe/trunk/lib/Serialization/ASTWriterStmt.cpp @@ -518,17 +518,23 @@ void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - Record.push_back(E->getByteLength()); + + // Store the various bits of data of StringLiteral. Record.push_back(E->getNumConcatenated()); + Record.push_back(E->getLength()); + Record.push_back(E->getCharByteWidth()); Record.push_back(E->getKind()); Record.push_back(E->isPascal()); - // FIXME: String data should be stored as a blob at the end of the - // StringLiteral. However, we can't do so now because we have no - // provision for coping with abbreviations when we're jumping around - // the AST file during deserialization. - Record.append(E->getBytes().begin(), E->getBytes().end()); + + // Store the trailing array of SourceLocation. for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) Record.AddSourceLocation(E->getStrTokenLoc(I)); + + // Store the trailing array of char holding the string data. + StringRef StrData = E->getBytes(); + for (unsigned I = 0, N = E->getByteLength(); I != N; ++I) + Record.push_back(StrData[I]); + Code = serialization::EXPR_STRING_LITERAL; }