Index: include/clang/AST/Expr.h
===================================================================
--- include/clang/AST/Expr.h
+++ include/clang/AST/Expr.h
@@ -1468,10 +1468,13 @@
     const uint32_t *asUInt32;
   } StrData;
   unsigned Length;
-  unsigned CharByteWidth : 4;
+  unsigned CharByteWidth : 3;
   unsigned Kind : 3;
   unsigned IsPascal : 1;
+  unsigned IsRaw : 1;
   unsigned NumConcatenated;
+  /// NUL-terminated raw string delimiter, or null when there is none.
+  const char *RawPrefix;
   SourceLocation TokLocs[1];
 
   StringLiteral(QualType Ty) :
@@ -1484,14 +1487,16 @@
   /// This is the "fully general" constructor that allows representation of
   /// strings formed from multiple concatenated tokens.
   static StringLiteral *Create(const ASTContext &C, StringRef Str,
-                               StringKind Kind, bool Pascal, QualType Ty,
+                               StringKind Kind, bool Pascal, bool Raw,
+                               StringRef RawPrefix, QualType Ty,
                                const SourceLocation *Loc, unsigned NumStrs);
 
   /// Simple constructor for string literals made from one token.
   static StringLiteral *Create(const ASTContext &C, StringRef Str,
-                               StringKind Kind, bool Pascal, QualType Ty,
+                               StringKind Kind, bool Pascal, bool Raw,
+                               StringRef RawPrefix, QualType Ty,
                                SourceLocation Loc) {
-    return Create(C, Str, Kind, Pascal, Ty, &Loc, 1);
+    return Create(C, Str, Kind, Pascal, Raw, RawPrefix, Ty, &Loc, 1);
   }
 
   /// \brief Construct an empty string literal.
@@ -1535,7 +1540,8 @@
 
   /// \brief Sets the string data to the given string data.
   void setString(const ASTContext &C, StringRef Str,
-                 StringKind Kind, bool IsPascal);
+                 StringKind Kind, bool IsPascal, bool IsRaw,
+                 StringRef RawPrefix);
 
   StringKind getKind() const { return static_cast<StringKind>(Kind); }
 
@@ -1547,6 +1553,21 @@
   bool isUTF32() const { return Kind == UTF32; }
   bool isPascal() const { return IsPascal; }
 
+  /// \brief A string literal is considered to be a raw string literal only
+  /// when all of the string tokens (before translation phase 6 concatenation)
+  /// are raw strings with the same prefix. So, for instance:
+  ///   R"(a)" R"(b)" -> R"(ab)"
+  ///   R"test(a)test" R"test(b)test" -> R"test(ab)test"
+  ///   R"test(a)test" R"ing(b)ing" -> "ab"
+  /// FIXME: This is unfortunate because we lose information in the AST.
+  bool isRaw() const { return IsRaw; }
+
+  /// \brief Gets the prefix used when creating a raw string literal; note that
+  /// the prefix is optional, so this may return an empty StringRef.
+  StringRef getRawPrefix() const {
+    return RawPrefix ? StringRef(RawPrefix) : StringRef();
+  }
+
   bool containsNonAsciiOrNull() const {
     StringRef Str = getString();
     for (unsigned i = 0, e = Str.size(); i != e; ++i)
Index: include/clang/Lex/LiteralSupport.h
===================================================================
--- include/clang/Lex/LiteralSupport.h
+++ include/clang/Lex/LiteralSupport.h
@@ -196,6 +196,8 @@
   SmallString<32> UDSuffixBuf;
   unsigned UDSuffixToken;
   unsigned UDSuffixOffset;
+  SmallString<8> RawPrefix;
+
 public:
   StringLiteralParser(ArrayRef<Token> StringToks,
                       Preprocessor &PP, bool Complain = true);
@@ -211,8 +213,12 @@
 
 
   bool hadError;
-  bool Pascal;
+  bool Pascal, Raw;
 
+  StringRef GetRawStringPrefix() const {
+    return StringRef(RawPrefix.data(), RawPrefix.size());
+  }
+
   StringRef GetString() const {
     return StringRef(ResultBuf.data(), GetStringLength());
   }
Index: lib/AST/Expr.cpp
===================================================================
--- lib/AST/Expr.cpp
+++ lib/AST/Expr.cpp
@@ -824,7 +824,8 @@
 }
 
 StringLiteral *StringLiteral::Create(const ASTContext &C, StringRef Str,
-                                     StringKind Kind, bool Pascal, QualType Ty,
+                                     StringKind Kind, bool Pascal, bool Raw,
+                                     StringRef RawPrefix, QualType Ty,
                                      const SourceLocation *Loc,
                                      unsigned NumStrs) {
   assert(C.getAsConstantArrayType(Ty) &&
@@ -838,7 +839,7 @@
   StringLiteral *SL = new (Mem) StringLiteral(Ty);
 
   // OPTIMIZE: could allocate this appended to the StringLiteral.
-  SL->setString(C,Str,Kind,Pascal);
+  SL->setString(C, Str, Kind, Pascal, Raw, RawPrefix);
 
   SL->TokLocs[0] = Loc[0];
   SL->NumConcatenated = NumStrs;
@@ -857,6 +858,8 @@
   SL->CharByteWidth = 0;
   SL->Length = 0;
   SL->NumConcatenated = NumStrs;
+  SL->RawPrefix = nullptr;
+  SL->IsRaw = false;
   return SL;
 }
 
@@ -868,97 +871,138 @@
   case UTF16: OS << 'u'; break;
   case UTF32: OS << 'U'; break;
   }
+
+  // Only print the raw form when every code unit survives the narrowing to
+  // 'char' in the loop below. Wide, UTF-16 and UTF-32 literals holding
+  // non-ASCII code units fall back to the escaped form rather than being
+  // silently truncated.
+  bool PrintRaw = IsRaw;
+  if (CharByteWidth > 1)
+    for (unsigned I = 0, N = getLength(); PrintRaw && I != N; ++I)
+      PrintRaw = getCodeUnit(I) < 0x80;
+
+  if (PrintRaw)
+    OS << "R";
   OS << '"';
+  if (PrintRaw && RawPrefix)
+    OS << RawPrefix;
+  if (PrintRaw)
+    OS << "(";
+
   static const char Hex[] = "0123456789ABCDEF";
 
   unsigned LastSlashX = getLength();
   for (unsigned I = 0, N = getLength(); I != N; ++I) {
-    switch (uint32_t Char = getCodeUnit(I)) {
-    default:
-      // FIXME: Convert UTF-8 back to codepoints before rendering.
+    if (PrintRaw) {
+      // For raw strings, print the contents directly to the stream without
+      // converting to a more human-readable format.
+      OS << (char)getCodeUnit(I);
+    } else {
+      switch (uint32_t Char = getCodeUnit(I)) {
+      default:
+        // FIXME: Convert UTF-8 back to codepoints before rendering.
 
-      // Convert UTF-16 surrogate pairs back to codepoints before rendering.
-      // Leave invalid surrogates alone; we'll use \x for those.
-      if (getKind() == UTF16 && I != N - 1 && Char >= 0xd800 &&
-          Char <= 0xdbff) {
-        uint32_t Trail = getCodeUnit(I + 1);
-        if (Trail >= 0xdc00 && Trail <= 0xdfff) {
-          Char = 0x10000 + ((Char - 0xd800) << 10) + (Trail - 0xdc00);
-          ++I;
+        // Convert UTF-16 surrogate pairs back to codepoints before rendering.
+        // Leave invalid surrogates alone; we'll use \x for those.
+        if (getKind() == UTF16 && I != N - 1 && Char >= 0xd800 &&
+            Char <= 0xdbff) {
+          uint32_t Trail = getCodeUnit(I + 1);
+          if (Trail >= 0xdc00 && Trail <= 0xdfff) {
+            Char = 0x10000 + ((Char - 0xd800) << 10) + (Trail - 0xdc00);
+            ++I;
+          }
         }
-      }
 
-      if (Char > 0xff) {
-        // If this is a wide string, output characters over 0xff using \x
-        // escapes. Otherwise, this is a UTF-16 or UTF-32 string, and Char is a
-        // codepoint: use \x escapes for invalid codepoints.
-        if (getKind() == Wide ||
-            (Char >= 0xd800 && Char <= 0xdfff) || Char >= 0x110000) {
-          // FIXME: Is this the best way to print wchar_t?
-          OS << "\\x";
-          int Shift = 28;
-          while ((Char >> Shift) == 0)
-            Shift -= 4;
-          for (/**/; Shift >= 0; Shift -= 4)
-            OS << Hex[(Char >> Shift) & 15];
-          LastSlashX = I;
+        if (Char > 0xff) {
+          // If this is a wide string, output characters over 0xff using \x
+          // escapes. Otherwise, this is a UTF-16 or UTF-32 string, and Char is
+          // a codepoint: use \x escapes for invalid codepoints.
+          if (getKind() == Wide ||
+              (Char >= 0xd800 && Char <= 0xdfff) || Char >= 0x110000) {
+            // FIXME: Is this the best way to print wchar_t?
+            OS << "\\x";
+            int Shift = 28;
+            while ((Char >> Shift) == 0)
+              Shift -= 4;
+            for (/**/; Shift >= 0; Shift -= 4)
+              OS << Hex[(Char >> Shift) & 15];
+            LastSlashX = I;
+            break;
+          }
+
+          if (Char > 0xffff)
+            OS << "\\U00"
+               << Hex[(Char >> 20) & 15]
+               << Hex[(Char >> 16) & 15];
+          else
+            OS << "\\u";
+          OS << Hex[(Char >> 12) & 15]
+             << Hex[(Char >>  8) & 15]
+             << Hex[(Char >>  4) & 15]
+             << Hex[(Char >>  0) & 15];
           break;
         }
 
-        if (Char > 0xffff)
-          OS << "\\U00"
-             << Hex[(Char >> 20) & 15]
-             << Hex[(Char >> 16) & 15];
-        else
-          OS << "\\u";
-        OS << Hex[(Char >> 12) & 15]
-           << Hex[(Char >>  8) & 15]
-           << Hex[(Char >>  4) & 15]
-           << Hex[(Char >>  0) & 15];
-        break;
-      }
-
-      // If we used \x... for the previous character, and this character is a
-      // hexadecimal digit, prevent it being slurped as part of the \x.
-      if (LastSlashX + 1 == I) {
-        switch (Char) {
+        // If we used \x... for the previous character, and this character is a
+        // hexadecimal digit, prevent it being slurped as part of the \x.
+        if (LastSlashX + 1 == I) {
+          switch (Char) {
           case '0': case '1': case '2': case '3': case '4':
           case '5': case '6': case '7': case '8': case '9':
           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
             OS << "\"\"";
+          }
         }
-      }
 
-      assert(Char <= 0xff &&
-             "Characters above 0xff should already have been handled.");
+        assert(Char <= 0xff &&
+               "Characters above 0xff should already have been handled.");
 
-      if (isPrintable(Char))
-        OS << (char)Char;
-      else  // Output anything hard as an octal escape.
-        OS << '\\'
-           << (char)('0' + ((Char >> 6) & 7))
-           << (char)('0' + ((Char >> 3) & 7))
-           << (char)('0' + ((Char >> 0) & 7));
-      break;
-    // Handle some common non-printable cases to make dumps prettier.
-    case '\\': OS << "\\\\"; break;
-    case '"': OS << "\\\""; break;
-    case '\n': OS << "\\n"; break;
-    case '\t': OS << "\\t"; break;
-    case '\a': OS << "\\a"; break;
-    case '\b': OS << "\\b"; break;
+        if (isPrintable(Char))
+          OS << (char)Char;
+        else  // Output anything hard as an octal escape.
+          OS << '\\'
+             << (char)('0' + ((Char >> 6) & 7))
+             << (char)('0' + ((Char >> 3) & 7))
+             << (char)('0' + ((Char >> 0) & 7));
+        break;
+      // Handle some common non-printable cases to make dumps prettier.
+      case '\\': OS << "\\\\"; break;
+      case '"': OS << "\\\""; break;
+      case '\n': OS << "\\n"; break;
+      case '\t': OS << "\\t"; break;
+      case '\a': OS << "\\a"; break;
+      case '\b': OS << "\\b"; break;
+      }
     }
   }
+  if (PrintRaw)
+    OS << ")";
+  if (PrintRaw && RawPrefix)
+    OS << RawPrefix;
   OS << '"';
 }
 
 void StringLiteral::setString(const ASTContext &C, StringRef Str,
-                              StringKind Kind, bool IsPascal) {
+                              StringKind Kind, bool IsPascal, bool IsRaw,
+                              StringRef RawPrefix) {
   //FIXME: we assume that the string data comes from a target that uses the same
   // code unit size and endianess for the type of string.
   this->Kind = Kind;
   this->IsPascal = IsPascal;
+  this->IsRaw = IsRaw;
+
+  if (!RawPrefix.empty()) {
+    assert(IsRaw && "Nonempty raw prefix for a string that is not raw?");
+    // Copy the prefix into ASTContext-allocated storage and NUL-terminate it.
+    size_t PrefixLen = RawPrefix.size();
+    char *PrefixData = new (C) char[PrefixLen + 1];
+    std::memcpy(PrefixData, RawPrefix.data(), PrefixLen);
+    PrefixData[PrefixLen] = 0;
+    this->RawPrefix = PrefixData;
+  } else {
+    this->RawPrefix = nullptr;
+  }
 
   CharByteWidth = mapCharByteWidth(C.getTargetInfo(),Kind);
   assert((Str.size()%CharByteWidth == 0)
Index: lib/Frontend/Rewrite/RewriteModernObjC.cpp
===================================================================
--- lib/Frontend/Rewrite/RewriteModernObjC.cpp
+++ lib/Frontend/Rewrite/RewriteModernObjC.cpp
@@ -605,7 +605,8 @@
           Context->CharTy, llvm::APInt(32, Str.size() + 1), ArrayType::Normal,
           0);
       return StringLiteral::Create(*Context, Str, StringLiteral::Ascii,
-                                   /*Pascal=*/false, StrType, SourceLocation());
+                                   /*Pascal=*/false, /*Raw=*/false, StringRef(),
+                                   StrType, SourceLocation());
     }
   };
 
Index: lib/Frontend/Rewrite/RewriteObjC.cpp
===================================================================
--- lib/Frontend/Rewrite/RewriteObjC.cpp
+++ lib/Frontend/Rewrite/RewriteObjC.cpp
@@ -499,7 +499,8 @@
           Context->CharTy, llvm::APInt(32, Str.size() + 1), ArrayType::Normal,
           0);
       return StringLiteral::Create(*Context, Str, StringLiteral::Ascii,
-                                   /*Pascal=*/false, StrType, SourceLocation());
+                                   /*Pascal=*/false, /*Raw=*/false, StringRef(),
+                                   StrType, SourceLocation());
     }
   };
 
Index: lib/Lex/LiteralSupport.cpp
===================================================================
--- lib/Lex/LiteralSupport.cpp
+++ lib/Lex/LiteralSupport.cpp
@@ -1255,7 +1255,7 @@
   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
-    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
+    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false), Raw(false) {
   init(StringToks);
 }
 
@@ -1332,6 +1332,7 @@
   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
 
   Pascal = false;
+  Raw = false;
 
   SourceLocation UDSuffixTokLoc;
 
@@ -1411,6 +1412,22 @@
       const char *Prefix = ThisTokBuf;
       while (ThisTokBuf[0] != '(')
         ++ThisTokBuf;
+
+      // The raw prefix is everything between the R" and the (. However, we only
+      // track whether the literal is raw when all of the components of the
+      // literal are raw and use the same prefix.
+      // NOTE(review): a non-raw token that follows a raw one does not pass
+      // through this code, so Raw presumably stays set for R"(a)" "b" --
+      // confirm that the ordinary-token path clears Raw and RawPrefix.
+      StringRef PrefStr(Prefix, ThisTokBuf - Prefix);
+      if (0 == i || (Raw && !RawPrefix.compare(PrefStr))) {
+        Raw = true;
+        RawPrefix.assign(PrefStr);
+      } else {
+        Raw = false;
+        RawPrefix.clear();
+      }
+
       ++ThisTokBuf; // skip '('
 
       // Remove same number of characters from the end
Index: lib/Sema/SemaExpr.cpp
===================================================================
--- lib/Sema/SemaExpr.cpp
+++ lib/Sema/SemaExpr.cpp
@@ -1586,8 +1586,9 @@
 
   // Pass &StringTokLocs[0], StringTokLocs.size() to factory!
StringLiteral *Lit = StringLiteral::Create(Context, Literal.GetString(), - Kind, Literal.Pascal, StrTy, - &StringTokLocs[0], + Kind, Literal.Pascal, Literal.Raw, + Literal.GetRawStringPrefix(), + StrTy, &StringTokLocs[0], StringTokLocs.size()); if (Literal.getUDSuffix().empty()) return Lit; @@ -3024,13 +3025,15 @@ ResTy = Context.getConstantArrayType(ResTy, LengthI, ArrayType::Normal, /*IndexTypeQuals*/ 0); SL = StringLiteral::Create(Context, RawChars, StringLiteral::Wide, - /*Pascal*/ false, ResTy, Loc); + /*Pascal*/ false, /*Raw*/ false, StringRef(), + ResTy, Loc); } else { ResTy = Context.CharTy.withConst(); ResTy = Context.getConstantArrayType(ResTy, LengthI, ArrayType::Normal, /*IndexTypeQuals*/ 0); SL = StringLiteral::Create(Context, Str, StringLiteral::Ascii, - /*Pascal*/ false, ResTy, Loc); + /*Pascal*/ false, /*Raw*/ false, StringRef(), + ResTy, Loc); } } @@ -3263,7 +3266,7 @@ ArrayType::Normal, 0); Expr *Lit = StringLiteral::Create( Context, StringRef(TokSpelling.data(), Length), StringLiteral::Ascii, - /*Pascal*/false, StrTy, &TokLoc, 1); + /*Pascal*/false, /*Raw*/false, StringRef(), StrTy, &TokLoc, 1); return BuildLiteralOperatorCall(R, OpNameInfo, Lit, TokLoc); } Index: lib/Sema/SemaExprObjC.cpp =================================================================== --- lib/Sema/SemaExprObjC.cpp +++ lib/Sema/SemaExprObjC.cpp @@ -70,10 +70,10 @@ CAT->getElementType(), llvm::APInt(32, StrBuf.size() + 1), CAT->getSizeModifier(), CAT->getIndexTypeCVRQualifiers()); S = StringLiteral::Create(Context, StrBuf, StringLiteral::Ascii, - /*Pascal=*/false, StrTy, &StrLocs[0], - StrLocs.size()); + /*Pascal=*/false, /*Raw=*/false, StringRef(), + StrTy, &StrLocs[0], StrLocs.size()); } - + return BuildObjCStringLiteral(AtLocs[0], S); } Index: lib/Serialization/ASTReaderStmt.cpp =================================================================== --- lib/Serialization/ASTReaderStmt.cpp +++ lib/Serialization/ASTReaderStmt.cpp @@ -506,10 +506,14 @@ StringLiteral::StringKind 
kind = static_cast(Record[Idx++]); bool isPascal = Record[Idx++]; + bool isRaw = Record[Idx++]; + unsigned RawPrefixLen = Record[Idx++]; + SmallString<8> RawPrefix(&Record[Idx], &Record[Idx] + RawPrefixLen); + Idx += RawPrefixLen; // Read string data SmallString<16> Str(&Record[Idx], &Record[Idx] + Len); - E->setString(Reader.getContext(), Str, kind, isPascal); + E->setString(Reader.getContext(), Str, kind, isPascal, isRaw, RawPrefix); Idx += Len; // Read source locations Index: lib/Serialization/ASTWriterStmt.cpp =================================================================== --- lib/Serialization/ASTWriterStmt.cpp +++ lib/Serialization/ASTWriterStmt.cpp @@ -429,6 +429,11 @@ Record.push_back(E->getNumConcatenated()); Record.push_back(E->getKind()); Record.push_back(E->isPascal()); + Record.push_back(E->isRaw()); + StringRef RawPrefix = E->getRawPrefix(); + Record.push_back(RawPrefix.size()); + Record.append(RawPrefix.begin(), RawPrefix.end()); + // FIXME: String data should be stored as a blob at the end of the // StringLiteral. 
However, we can't do so now because we have no // provision for coping with abbreviations when we're jumping around Index: test/Misc/ast-print-string-literal.cpp =================================================================== --- test/Misc/ast-print-string-literal.cpp +++ test/Misc/ast-print-string-literal.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -ast-print -std=c++1z %s -o - | FileCheck %s + +const char *S = R"T(This is a test)T"; + +const wchar_t *WS = LR"Teehee(This is a test +with a newline in it)Teehee"; + +const char *T = u8R"(This is also a test)"; + +const char *U = R"(This test has \b and a tab )"; + +const char *V = R"(This is a test )" R"test(that should concat to a non-raw string)test"; + +const char *W = R"test(This is a test )test" R"test(that should concat to a raw string)test"; + +const char *X = "This is a test " R"(that should concat to a non-raw string)"; + +// CHECK: const char *S = R"T(This is a test)T"; +// CHECK: const wchar_t *WS = LR"Teehee(This is a test +// CHECK-NEXT: with a newline in it)Teehee"; +// CHECK: const char *T = u8R"(This is also a test)"; +// CHECK: const char *U = R"(This test has \b and a tab )"; +// CHECK: const char *V = "This is a test that should concat to a non-raw string"; +// CHECK: const char *W = R"test(This is a test that should concat to a raw string)test"; +// CHECK: const char *X = "This is a test that should concat to a non-raw string"; Index: test/PCH/cxx-string-literal.cpp =================================================================== --- test/PCH/cxx-string-literal.cpp +++ test/PCH/cxx-string-literal.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -emit-pch -std=c++1z -o %t %s +// RUN: %clang_cc1 -std=c++1z -x ast -ast-print %t | FileCheck %s + +const char *S = R"T(This is a test)T"; +// CHECK: const char *S = R"T(This is a test)T"; + +const wchar_t *WS = LR"Teehee(This is a test +with a newline in it)Teehee"; +// CHECK: const wchar_t *WS = LR"Teehee(This is a test +// CHECK-NEXT: with a newline in 
it)Teehee"; + +const char *T = u8R"(This is also a test)"; +// CHECK: const char *T = u8R"(This is also a test)"; Index: test/SemaCXX/cxx11-ast-print.cpp =================================================================== --- test/SemaCXX/cxx11-ast-print.cpp +++ test/SemaCXX/cxx11-ast-print.cpp @@ -20,7 +20,7 @@ // CHECK: const char *p1 = "bar1"_foo; const char *p1 = "bar1"_foo; -// CHECK: const char *p2 = "bar2"_foo; +// CHECK: const char *p2 = R"x(bar2)x"_foo; const char *p2 = R"x(bar2)x"_foo; // CHECK: const char *p3 = u8"bar3"_foo; const char *p3 = u8"bar3"_foo;