Index: include/clang/AST/Expr.h =================================================================== --- include/clang/AST/Expr.h +++ include/clang/AST/Expr.h @@ -1554,11 +1554,12 @@ }; /// StringLiteral - This represents a string literal expression, e.g. "foo" -/// or L"bar" (wide strings). The actual string is returned by getBytes() -/// is NOT null-terminated, and the length of the string is determined by -/// calling getByteLength(). The C type for a string is always a -/// ConstantArrayType. In C++, the char type is const qualified, in C it is -/// not. +/// or L"bar" (wide strings). The actual string data can be obtained with +/// getBytes() and is NOT null-terminated. The length of the string data is +/// determined by calling getByteLength(). +/// +/// The C type for a string is always a ConstantArrayType. In C++, the char +/// type is const qualified, in C it is not. /// /// Note that strings in C can be formed by concatenation of multiple string /// literal pptokens in translation phase #6. This keeps track of the locations @@ -1569,131 +1570,156 @@ /// char X[2] = "foobar"; /// In this case, getByteLength() will return 6, but the string literal will /// have type "char[2]". -class StringLiteral : public Expr { +class StringLiteral final + : public Expr, + private llvm::TrailingObjects { + friend class ASTStmtReader; + friend TrailingObjects; + + /// StringLiteral is followed by several trailing objects. They are in order: + /// + /// * A single unsigned storing the length in characters of this string. The + /// length in bytes is this length times the width of a single character. + /// Always present and stored as a trailing objects because storing it in + /// StringLiteral would increase the size of StringLiteral by sizeof(void *) + /// due to alignment requirements. If you add some data to StringLiteral, + /// consider moving it inside StringLiteral. 
+ /// + /// * An array of getNumConcatenated() SourceLocation, one for each of the + /// tokens this string is made of. + /// + /// * An array of getByteLength() char used to store the string data. + public: - enum StringKind { - Ascii, - Wide, - UTF8, - UTF16, - UTF32 - }; + enum StringKind { Ascii, Wide, UTF8, UTF16, UTF32 }; private: - friend class ASTStmtReader; + unsigned numTrailingObjects(OverloadToken) const { return 1; } + unsigned numTrailingObjects(OverloadToken) const { + return getNumConcatenated(); + } - union { - const char *asChar; - const uint16_t *asUInt16; - const uint32_t *asUInt32; - } StrData; - unsigned Length; - unsigned CharByteWidth : 4; - unsigned Kind : 3; - unsigned IsPascal : 1; - unsigned NumConcatenated; - SourceLocation TokLocs[1]; - - StringLiteral(QualType Ty) : - Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false, - false) {} - - static int mapCharByteWidth(TargetInfo const &target,StringKind k); + unsigned numTrailingObjects(OverloadToken) const { + return getByteLength(); + } + + char *getStrDataAsChar() { return getTrailingObjects(); } + const char *getStrDataAsChar() const { return getTrailingObjects(); } + + const uint16_t *getStrDataAsUInt16() const { + return reinterpret_cast(getTrailingObjects()); + } + + const uint32_t *getStrDataAsUInt32() const { + return reinterpret_cast(getTrailingObjects()); + } + + /// Build a string literal. + StringLiteral(const ASTContext &Ctx, StringRef Str, StringKind Kind, + bool Pascal, QualType Ty, const SourceLocation *Loc, + unsigned NumConcatenated); + + /// Build an empty string literal. + StringLiteral(EmptyShell Empty, unsigned NumConcatenated, unsigned Length, + unsigned CharByteWidth); + + /// Map a target and string kind to the appropriate character width. + static unsigned mapCharByteWidth(TargetInfo const &Target, StringKind SK); + + /// Set the location of one of the string literal tokens. 
+ void setStrTokenLoc(unsigned TokNum, SourceLocation L) { + assert(TokNum < getNumConcatenated() && "Invalid tok number"); + getTrailingObjects()[TokNum] = L; + } public: /// This is the "fully general" constructor that allows representation of /// strings formed from multiple concatenated tokens. - static StringLiteral *Create(const ASTContext &C, StringRef Str, + static StringLiteral *Create(const ASTContext &Ctx, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, unsigned NumStrs); + const SourceLocation *Loc, + unsigned NumConcatenated); /// Simple constructor for string literals made from one token. - static StringLiteral *Create(const ASTContext &C, StringRef Str, + static StringLiteral *Create(const ASTContext &Ctx, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, SourceLocation Loc) { - return Create(C, Str, Kind, Pascal, Ty, &Loc, 1); + return Create(Ctx, Str, Kind, Pascal, Ty, &Loc, 1); } /// Construct an empty string literal. - static StringLiteral *CreateEmpty(const ASTContext &C, unsigned NumStrs); + static StringLiteral *CreateEmpty(const ASTContext &Ctx, + unsigned NumConcatenated, unsigned Length, + unsigned CharByteWidth); StringRef getString() const { - assert(CharByteWidth==1 - && "This function is used in places that assume strings use char"); - return StringRef(StrData.asChar, getByteLength()); + assert(getCharByteWidth() == 1 && + "This function is used in places that assume strings use char"); + return StringRef(getStrDataAsChar(), getByteLength()); } /// Allow access to clients that need the byte representation, such as /// ASTWriterStmt::VisitStringLiteral(). StringRef getBytes() const { // FIXME: StringRef may not be the right type to use as a result for this. 
- if (CharByteWidth == 1) - return StringRef(StrData.asChar, getByteLength()); - if (CharByteWidth == 4) - return StringRef(reinterpret_cast(StrData.asUInt32), - getByteLength()); - assert(CharByteWidth == 2 && "unsupported CharByteWidth"); - return StringRef(reinterpret_cast(StrData.asUInt16), - getByteLength()); + return StringRef(getStrDataAsChar(), getByteLength()); } void outputString(raw_ostream &OS) const; uint32_t getCodeUnit(size_t i) const { - assert(i < Length && "out of bounds access"); - if (CharByteWidth == 1) - return static_cast(StrData.asChar[i]); - if (CharByteWidth == 4) - return StrData.asUInt32[i]; - assert(CharByteWidth == 2 && "unsupported CharByteWidth"); - return StrData.asUInt16[i]; + assert(i < getLength() && "out of bounds access"); + switch (getCharByteWidth()) { + case 1: + return static_cast(getStrDataAsChar()[i]); + case 2: + return getStrDataAsUInt16()[i]; + case 4: + return getStrDataAsUInt32()[i]; + } + llvm_unreachable("Unsupported character width!"); } - unsigned getByteLength() const { return CharByteWidth*Length; } - unsigned getLength() const { return Length; } - unsigned getCharByteWidth() const { return CharByteWidth; } - - /// Sets the string data to the given string data. 
- void setString(const ASTContext &C, StringRef Str, - StringKind Kind, bool IsPascal); - - StringKind getKind() const { return static_cast(Kind); } + unsigned getByteLength() const { return getCharByteWidth() * getLength(); } + unsigned getLength() const { return *getTrailingObjects(); } + unsigned getCharByteWidth() const { return StringLiteralBits.CharByteWidth; } + StringKind getKind() const { + return static_cast(StringLiteralBits.Kind); + } - bool isAscii() const { return Kind == Ascii; } - bool isWide() const { return Kind == Wide; } - bool isUTF8() const { return Kind == UTF8; } - bool isUTF16() const { return Kind == UTF16; } - bool isUTF32() const { return Kind == UTF32; } - bool isPascal() const { return IsPascal; } + bool isAscii() const { return getKind() == Ascii; } + bool isWide() const { return getKind() == Wide; } + bool isUTF8() const { return getKind() == UTF8; } + bool isUTF16() const { return getKind() == UTF16; } + bool isUTF32() const { return getKind() == UTF32; } + bool isPascal() const { return StringLiteralBits.IsPascal; } bool containsNonAscii() const { - StringRef Str = getString(); - for (unsigned i = 0, e = Str.size(); i != e; ++i) - if (!isASCII(Str[i])) + for (auto c : getString()) + if (!isASCII(c)) return true; return false; } bool containsNonAsciiOrNull() const { - StringRef Str = getString(); - for (unsigned i = 0, e = Str.size(); i != e; ++i) - if (!isASCII(Str[i]) || !Str[i]) + for (auto c : getString()) + if (!isASCII(c) || !c) return true; return false; } /// getNumConcatenated - Get the number of string literal tokens that were /// concatenated in translation phase #6 to form this string literal. - unsigned getNumConcatenated() const { return NumConcatenated; } + unsigned getNumConcatenated() const { + return StringLiteralBits.NumConcatenated; + } + /// Get the location of one of the string literal tokens. 
SourceLocation getStrTokenLoc(unsigned TokNum) const { - assert(TokNum < NumConcatenated && "Invalid tok number"); - return TokLocs[TokNum]; - } - void setStrTokenLoc(unsigned TokNum, SourceLocation L) { - assert(TokNum < NumConcatenated && "Invalid tok number"); - TokLocs[TokNum] = L; + assert(TokNum < getNumConcatenated() && "Invalid tok number"); + return getTrailingObjects()[TokNum]; } /// getLocationOfByte - Return a source location that points to the specified @@ -1710,14 +1736,18 @@ unsigned *StartTokenByteOffset = nullptr) const; typedef const SourceLocation *tokloc_iterator; - tokloc_iterator tokloc_begin() const { return TokLocs; } - tokloc_iterator tokloc_end() const { return TokLocs + NumConcatenated; } - SourceLocation getBeginLoc() const LLVM_READONLY { return TokLocs[0]; } - SourceLocation getEndLoc() const LLVM_READONLY { - return TokLocs[NumConcatenated - 1]; + tokloc_iterator tokloc_begin() const { + return getTrailingObjects(); } + tokloc_iterator tokloc_end() const { + return getTrailingObjects() + getNumConcatenated(); + } + + SourceLocation getBeginLoc() const LLVM_READONLY { return *tokloc_begin(); } + SourceLocation getEndLoc() const LLVM_READONLY { return *(tokloc_end() - 1); } + static bool classof(const Stmt *T) { return T->getStmtClass() == StringLiteralClass; } Index: include/clang/AST/Stmt.h =================================================================== --- include/clang/AST/Stmt.h +++ include/clang/AST/Stmt.h @@ -338,6 +338,28 @@ SourceLocation Loc; }; + class StringLiteralBitfields { + friend class ASTStmtReader; + friend class StringLiteral; + + unsigned : NumExprBits; + + /// The kind of this string literal. + /// One of the enumeration values of StringLiteral::StringKind. + unsigned Kind : 3; + + /// The width of a single character in bytes. Only values of 1, 2, + /// and 4 bytes are supported. StringLiteral::mapCharByteWidth maps + /// the target + string kind to the appropriate CharByteWidth. 
+ unsigned CharByteWidth : 3; + + unsigned IsPascal : 1; + + /// The number of concatenated tokens this string is made of. + /// This is the number of trailing SourceLocation objects. + unsigned NumConcatenated; + }; + class CharacterLiteralBitfields { friend class CharacterLiteral; @@ -564,6 +586,7 @@ // Expressions ExprBitfields ExprBits; PredefinedExprBitfields PredefinedExprBits; + StringLiteralBitfields StringLiteralBits; CharacterLiteralBitfields CharacterLiteralBits; FloatingLiteralBitfields FloatingLiteralBits; UnaryExprOrTypeTraitExprBitfields UnaryExprOrTypeTraitExprBits; Index: lib/AST/Expr.cpp =================================================================== --- lib/AST/Expr.cpp +++ lib/AST/Expr.cpp @@ -887,66 +887,105 @@ return V.convertToDouble(); } -int StringLiteral::mapCharByteWidth(TargetInfo const &target,StringKind k) { - int CharByteWidth = 0; - switch(k) { - case Ascii: - case UTF8: - CharByteWidth = target.getCharWidth(); - break; - case Wide: - CharByteWidth = target.getWCharWidth(); - break; - case UTF16: - CharByteWidth = target.getChar16Width(); - break; - case UTF32: - CharByteWidth = target.getChar32Width(); - break; +unsigned StringLiteral::mapCharByteWidth(TargetInfo const &Target, + StringKind SK) { + unsigned CharByteWidth = 0; + switch (SK) { + case Ascii: + case UTF8: + CharByteWidth = Target.getCharWidth(); + break; + case Wide: + CharByteWidth = Target.getWCharWidth(); + break; + case UTF16: + CharByteWidth = Target.getChar16Width(); + break; + case UTF32: + CharByteWidth = Target.getChar32Width(); + break; } assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple"); CharByteWidth /= 8; - assert((CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4) - && "character byte widths supported are 1, 2, and 4 only"); + assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) && + "The only supported character byte widths are 1,2 and 4!"); return CharByteWidth; } -StringLiteral 
*StringLiteral::Create(const ASTContext &C, StringRef Str, - StringKind Kind, bool Pascal, QualType Ty, - const SourceLocation *Loc, - unsigned NumStrs) { - assert(C.getAsConstantArrayType(Ty) && +StringLiteral::StringLiteral(const ASTContext &Ctx, StringRef Str, + StringKind Kind, bool Pascal, QualType Ty, + const SourceLocation *Loc, + unsigned NumConcatenated) + : Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false, + false) { + assert(Ctx.getAsConstantArrayType(Ty) && "StringLiteral must be of constant array type!"); + unsigned CharByteWidth = mapCharByteWidth(Ctx.getTargetInfo(), Kind); + unsigned ByteLength = Str.size(); + assert((ByteLength % CharByteWidth == 0) && + "The size of the data must be a multiple of CharByteWidth!"); + + // Avoid the expensive division. The compiler should be able to figure it + // out by itself. However as of clang 7, even with the appropriate + // llvm_unreachable added just here, it is not able to do so. + unsigned Length; + switch (CharByteWidth) { + case 1: + Length = ByteLength; + break; + case 2: + Length = ByteLength / 2; + break; + case 4: + Length = ByteLength / 4; + break; + default: + llvm_unreachable("Unsupported character width!"); + } - // Allocate enough space for the StringLiteral plus an array of locations for - // any concatenated string tokens. - void *Mem = - C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1), - alignof(StringLiteral)); - StringLiteral *SL = new (Mem) StringLiteral(Ty); + StringLiteralBits.Kind = Kind; + StringLiteralBits.CharByteWidth = CharByteWidth; + StringLiteralBits.IsPascal = Pascal; + StringLiteralBits.NumConcatenated = NumConcatenated; + *getTrailingObjects() = Length; - // OPTIMIZE: could allocate this appended to the StringLiteral. - SL->setString(C,Str,Kind,Pascal); + // Initialize the trailing array of SourceLocation. + // This is safe since SourceLocation is POD-like. 
+ std::memcpy(getTrailingObjects(), Loc, + NumConcatenated * sizeof(SourceLocation)); - SL->TokLocs[0] = Loc[0]; - SL->NumConcatenated = NumStrs; + // Initialize the trailing array of char holding the string data. + std::memcpy(getTrailingObjects(), Str.data(), ByteLength); +} - if (NumStrs != 1) - memcpy(&SL->TokLocs[1], Loc+1, sizeof(SourceLocation)*(NumStrs-1)); - return SL; +StringLiteral::StringLiteral(EmptyShell Empty, unsigned NumConcatenated, + unsigned Length, unsigned CharByteWidth) + : Expr(StringLiteralClass, Empty) { + StringLiteralBits.CharByteWidth = CharByteWidth; + StringLiteralBits.NumConcatenated = NumConcatenated; + *getTrailingObjects() = Length; } -StringLiteral *StringLiteral::CreateEmpty(const ASTContext &C, - unsigned NumStrs) { - void *Mem = - C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1), - alignof(StringLiteral)); - StringLiteral *SL = - new (Mem) StringLiteral(C.adjustStringLiteralBaseType(QualType())); - SL->CharByteWidth = 0; - SL->Length = 0; - SL->NumConcatenated = NumStrs; - return SL; +StringLiteral *StringLiteral::Create(const ASTContext &Ctx, StringRef Str, + StringKind Kind, bool Pascal, QualType Ty, + const SourceLocation *Loc, + unsigned NumConcatenated) { + void *Mem = Ctx.Allocate(totalSizeToAlloc( + 1, NumConcatenated, Str.size()), + alignof(StringLiteral)); + return new (Mem) + StringLiteral(Ctx, Str, Kind, Pascal, Ty, Loc, NumConcatenated); +} + +StringLiteral *StringLiteral::CreateEmpty(const ASTContext &Ctx, + unsigned NumConcatenated, + unsigned Length, + unsigned CharByteWidth) { + void *Mem = Ctx.Allocate(totalSizeToAlloc( + 1, NumConcatenated, Length * CharByteWidth), + alignof(StringLiteral)); + return new (Mem) + StringLiteral(EmptyShell(), NumConcatenated, Length, CharByteWidth); } void StringLiteral::outputString(raw_ostream &OS) const { @@ -1045,42 +1084,6 @@ OS << '"'; } -void StringLiteral::setString(const ASTContext &C, StringRef Str, - StringKind Kind, bool IsPascal) { - 
//FIXME: we assume that the string data comes from a target that uses the same - // code unit size and endianness for the type of string. - this->Kind = Kind; - this->IsPascal = IsPascal; - - CharByteWidth = mapCharByteWidth(C.getTargetInfo(),Kind); - assert((Str.size()%CharByteWidth == 0) - && "size of data must be multiple of CharByteWidth"); - Length = Str.size()/CharByteWidth; - - switch(CharByteWidth) { - case 1: { - char *AStrData = new (C) char[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asChar = AStrData; - break; - } - case 2: { - uint16_t *AStrData = new (C) uint16_t[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asUInt16 = AStrData; - break; - } - case 4: { - uint32_t *AStrData = new (C) uint32_t[Length]; - std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData)); - StrData.asUInt32 = AStrData; - break; - } - default: - llvm_unreachable("unsupported CharByteWidth"); - } -} - /// getLocationOfByte - Return a source location that points to the specified /// byte of this string literal. /// @@ -1102,7 +1105,8 @@ const LangOptions &Features, const TargetInfo &Target, unsigned *StartToken, unsigned *StartTokenByteOffset) const { - assert((Kind == StringLiteral::Ascii || Kind == StringLiteral::UTF8) && + assert((getKind() == StringLiteral::Ascii || + getKind() == StringLiteral::UTF8) && "Only narrow string literals are currently supported"); // Loop over all of the tokens in this string until we find the one that @@ -1170,8 +1174,6 @@ } } - - /// getOpcodeStr - Turn an Opcode enum value into the punctuation char it /// corresponds to, e.g. "sizeof" or "[pre]++". 
StringRef UnaryOperator::getOpcodeStr(Opcode Op) { Index: lib/Serialization/ASTReaderStmt.cpp =================================================================== --- lib/Serialization/ASTReaderStmt.cpp +++ lib/Serialization/ASTReaderStmt.cpp @@ -602,22 +602,35 @@ void ASTStmtReader::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - unsigned Len = Record.readInt(); - assert(Record.peekInt() == E->getNumConcatenated() && - "Wrong number of concatenated tokens!"); - Record.skipInts(1); - auto kind = static_cast(Record.readInt()); - bool isPascal = Record.readInt(); - // Read string data - auto B = &Record.peekInt(); - SmallString<16> Str(B, B + Len); - E->setString(Record.getContext(), Str, kind, isPascal); - Record.skipInts(Len); - - // Read source locations - for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) + // NumConcatenated, Length and CharByteWidth are set by the empty + // ctor since they are needed to allocate storage for the trailing objects. + unsigned NumConcatenated = Record.readInt(); + unsigned Length = Record.readInt(); + unsigned CharByteWidth = Record.readInt(); + assert((NumConcatenated == E->getNumConcatenated()) && + "Wrong number of concatenated tokens!"); + assert((Length == E->getLength()) && "Wrong Length!"); + assert((CharByteWidth == E->getCharByteWidth()) && "Wrong character width!"); + E->StringLiteralBits.Kind = Record.readInt(); + E->StringLiteralBits.IsPascal = Record.readInt(); + + // The character width is originally computed via mapCharByteWidth. + // Check that the deserialized character width is consistent with the result + // of calling mapCharByteWidth. + assert((CharByteWidth == + StringLiteral::mapCharByteWidth(Record.getContext().getTargetInfo(), + E->getKind())) && + "Wrong character width!"); + + // Deserialize the trailing array of SourceLocation. 
+ for (unsigned I = 0; I < NumConcatenated; ++I) E->setStrTokenLoc(I, ReadSourceLocation()); + + // Deserialize the trailing array of char holding the string data. + char *StrData = E->getStrDataAsChar(); + for (unsigned I = 0; I < Length * CharByteWidth; ++I) + StrData[I] = Record.readInt(); } void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) { @@ -2433,8 +2446,11 @@ break; case EXPR_STRING_LITERAL: - S = StringLiteral::CreateEmpty(Context, - Record[ASTStmtReader::NumExprFields + 1]); + S = StringLiteral::CreateEmpty( + Context, + /* NumConcatenated=*/Record[ASTStmtReader::NumExprFields + 0], + /* Length=*/Record[ASTStmtReader::NumExprFields + 1], + /* CharByteWidth=*/Record[ASTStmtReader::NumExprFields + 2]); break; case EXPR_CHARACTER_LITERAL: Index: lib/Serialization/ASTWriterStmt.cpp =================================================================== --- lib/Serialization/ASTWriterStmt.cpp +++ lib/Serialization/ASTWriterStmt.cpp @@ -527,17 +527,23 @@ void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) { VisitExpr(E); - Record.push_back(E->getByteLength()); + + // Store the various bits of data of StringLiteral. Record.push_back(E->getNumConcatenated()); + Record.push_back(E->getLength()); + Record.push_back(E->getCharByteWidth()); Record.push_back(E->getKind()); Record.push_back(E->isPascal()); - // FIXME: String data should be stored as a blob at the end of the - // StringLiteral. However, we can't do so now because we have no - // provision for coping with abbreviations when we're jumping around - // the AST file during deserialization. - Record.append(E->getBytes().begin(), E->getBytes().end()); + + // Store the trailing array of SourceLocation. for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I) Record.AddSourceLocation(E->getStrTokenLoc(I)); + + // Store the trailing array of char holding the string data. 
+ StringRef StrData = E->getBytes(); + for (unsigned I = 0, N = E->getByteLength(); I != N; ++I) + Record.push_back(StrData[I]); + Code = serialization::EXPR_STRING_LITERAL; }