Index: clang/lib/Analysis/FormatString.cpp =================================================================== --- clang/lib/Analysis/FormatString.cpp +++ clang/lib/Analysis/FormatString.cpp @@ -266,14 +266,15 @@ if (SpecifierBegin + 1 >= FmtStrEnd) return false; - const UTF8 *SB = reinterpret_cast(SpecifierBegin + 1); - const UTF8 *SE = reinterpret_cast(FmtStrEnd); + const llvm::UTF8 *SB = + reinterpret_cast(SpecifierBegin + 1); + const llvm::UTF8 *SE = reinterpret_cast(FmtStrEnd); const char FirstByte = *SB; // If the invalid specifier is a multibyte UTF-8 string, return the // total length accordingly so that the conversion specifier can be // properly updated to reflect a complete UTF-8 specifier. - unsigned NumBytes = getNumBytesForUTF8(FirstByte); + unsigned NumBytes = llvm::getNumBytesForUTF8(FirstByte); if (NumBytes == 1) return false; if (SB + NumBytes > SE) Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -3136,13 +3136,12 @@ // Otherwise, convert the UTF8 literals into a string of shorts. IsUTF16 = true; - SmallVector ToBuf(NumBytes + 1); // +1 for ending nulls. - const UTF8 *FromPtr = (const UTF8 *)String.data(); - UTF16 *ToPtr = &ToBuf[0]; + SmallVector ToBuf(NumBytes + 1); // +1 for ending nulls. + const llvm::UTF8 *FromPtr = (const llvm::UTF8 *)String.data(); + llvm::UTF16 *ToPtr = &ToBuf[0]; - (void)ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, - &ToPtr, ToPtr + NumBytes, - strictConversion); + (void)llvm::ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, &ToPtr, + ToPtr + NumBytes, llvm::strictConversion); // ConvertUTF8toUTF16 returns the length in ToPtr. StringLength = ToPtr - &ToBuf[0]; Index: clang/lib/Format/Encoding.h =================================================================== --- clang/lib/Format/Encoding.h +++ clang/lib/Format/Encoding.h @@ -33,16 +33,17 @@ /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, /// it is considered UTF8, otherwise we treat it as some 8-bit encoding. inline Encoding detectEncoding(StringRef Text) { - const UTF8 *Ptr = reinterpret_cast(Text.begin()); - const UTF8 *BufEnd = reinterpret_cast(Text.end()); - if (::isLegalUTF8String(&Ptr, BufEnd)) + const llvm::UTF8 *Ptr = reinterpret_cast(Text.begin()); + const llvm::UTF8 *BufEnd = reinterpret_cast(Text.end()); + if (llvm::isLegalUTF8String(&Ptr, BufEnd)) return Encoding_UTF8; return Encoding_Unknown; } inline unsigned getCodePointCountUTF8(StringRef Text) { unsigned CodePoints = 0; - for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { + for (size_t i = 0, e = Text.size(); i < e; + i += llvm::getNumBytesForUTF8(Text[i])) { ++CodePoints; } return CodePoints; @@ -97,7 +98,7 @@ inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { switch (Encoding) { case Encoding_UTF8: - return getNumBytesForUTF8(FirstChar); + return llvm::getNumBytesForUTF8(FirstChar); default: return 1; } @@ -136,7 +137,7 @@ ++I; return I; } - return 1 + getNumBytesForUTF8(Text[1]); + return 1 + llvm::getNumBytesForUTF8(Text[1]); } } Index: clang/lib/Frontend/TextDiagnostic.cpp =================================================================== --- clang/lib/Frontend/TextDiagnostic.cpp +++ clang/lib/Frontend/TextDiagnostic.cpp @@ -119,16 +119,17 @@ begin = reinterpret_cast(&*(SourceLine.begin() + *i)); end = begin + (SourceLine.size() - *i); - if (isLegalUTF8Sequence(begin, end)) { - UTF32 c; - UTF32 *cptr = &c; + if (llvm::isLegalUTF8Sequence(begin, end)) { + llvm::UTF32 c; + llvm::UTF32 *cptr = &c; unsigned char const *original_begin = begin; - unsigned char const *cp_end = begin+getNumBytesForUTF8(SourceLine[*i]); + unsigned char const *cp_end = + begin + llvm::getNumBytesForUTF8(SourceLine[*i]); - ConversionResult res = ConvertUTF8toUTF32(&begin, cp_end, &cptr, cptr+1, - strictConversion); + llvm::ConversionResult res = llvm::ConvertUTF8toUTF32( + &begin, cp_end, &cptr, cptr + 1, llvm::strictConversion); (void)res; - assert(conversionOK==res); + assert(llvm::conversionOK == res); assert(0 < begin-original_begin && "we must be further along in the string now"); *i += begin-original_begin; Index: clang/lib/Lex/Lexer.cpp =================================================================== --- clang/lib/Lex/Lexer.cpp +++ clang/lib/Lex/Lexer.cpp @@ -1485,13 +1485,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { const char *UnicodePtr = CurPtr; - UTF32 CodePoint; - ConversionResult Result = - llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr, - (const UTF8 *)BufferEnd, + llvm::UTF32 CodePoint; + llvm::ConversionResult Result = + llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, + (const llvm::UTF8 *)BufferEnd, &CodePoint, - strictConversion); - if (Result != conversionOK || + llvm::strictConversion); + if (Result != llvm::conversionOK || !isAllowedIDChar(static_cast(CodePoint), LangOpts)) return false; @@ -3625,17 +3625,17 @@ break; } - UTF32 CodePoint; + llvm::UTF32 CodePoint; // We can't just reset CurPtr to BufferPtr because BufferPtr may point to // an escaped newline. --CurPtr; - ConversionResult Status = - llvm::convertUTF8Sequence((const UTF8 **)&CurPtr, - (const UTF8 *)BufferEnd, + llvm::ConversionResult Status = + llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, + (const llvm::UTF8 *)BufferEnd, &CodePoint, - strictConversion); - if (Status == conversionOK) { + llvm::strictConversion); + if (Status == llvm::conversionOK) { if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) return true; // KeepWhitespaceMode Index: clang/lib/Lex/LiteralSupport.cpp =================================================================== --- clang/lib/Lex/LiteralSupport.cpp +++ clang/lib/Lex/LiteralSupport.cpp @@ -402,7 +402,7 @@ if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF32 *ResultPtr = reinterpret_cast(ResultBuf); + llvm::UTF32 *ResultPtr = reinterpret_cast(ResultBuf); *ResultPtr = UcnVal; ResultBuf += 4; return; @@ -411,7 +411,7 @@ if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF16 *ResultPtr = reinterpret_cast(ResultBuf); + llvm::UTF16 *ResultPtr = reinterpret_cast(ResultBuf); if (UcnVal <= (UTF32)0xFFFF) { *ResultPtr = UcnVal; @@ -1114,11 +1114,11 @@ char const *tmp_in_start = start; uint32_t *tmp_out_start = buffer_begin; - ConversionResult res = - ConvertUTF8toUTF32(reinterpret_cast(&start), - reinterpret_cast(begin), - &buffer_begin, buffer_end, strictConversion); - if (res != conversionOK) { + llvm::ConversionResult res = + llvm::ConvertUTF8toUTF32(reinterpret_cast(&start), + reinterpret_cast(begin), + &buffer_begin, buffer_end, llvm::strictConversion); + if (res != llvm::conversionOK) { // If we see bad encoding for unprefixed character literals, warn and // simply copy the byte values, for compatibility with gcc and // older versions of clang. @@ -1510,13 +1510,13 @@ if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF32 *ResultWidePtr = reinterpret_cast(ResultPtr); + llvm::UTF32 *ResultWidePtr = reinterpret_cast(ResultPtr); *ResultWidePtr = ResultChar; ResultPtr += 4; } else if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF16 *ResultWidePtr = reinterpret_cast(ResultPtr); + llvm::UTF16 *ResultWidePtr = reinterpret_cast(ResultPtr); *ResultWidePtr = ResultChar & 0xFFFF; ResultPtr += 2; } else { @@ -1531,12 +1531,12 @@ if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF32 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); + llvm::UTF32 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); ResultWidePtr[0] = GetNumStringChars() - 1; } else if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF16 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); + llvm::UTF16 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); ResultWidePtr[0] = GetNumStringChars() - 1; } else { assert(CharByteWidth == 1 && "Unexpected char width"); @@ -1570,7 +1570,7 @@ static const char *resyncUTF8(const char *Err, const char *End) { if (Err == End) return End; - End = Err + std::min(getNumBytesForUTF8(*Err), End-Err); + End = Err + std::min(llvm::getNumBytesForUTF8(*Err), End-Err); while (++Err != End && (*Err & 0xC0) == 0x80) ; return Err; @@ -1582,7 +1582,7 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok, const char *TokBegin, StringRef Fragment) { - const UTF8 *ErrorPtrTmp; + const llvm::UTF8 *ErrorPtrTmp; if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) return false; Index: clang/lib/Sema/SemaChecking.cpp =================================================================== --- clang/lib/Sema/SemaChecking.cpp +++ clang/lib/Sema/SemaChecking.cpp @@ -3262,15 +3262,15 @@ if (Literal->containsNonAsciiOrNull()) { StringRef String = Literal->getString(); unsigned NumBytes = String.size(); - SmallVector ToBuf(NumBytes); - const UTF8 *FromPtr = (const UTF8 *)String.data(); - UTF16 *ToPtr = &ToBuf[0]; - - ConversionResult Result = ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, - &ToPtr, ToPtr + NumBytes, - strictConversion); + SmallVector ToBuf(NumBytes); + const llvm::UTF8 *FromPtr = (const llvm::UTF8 *)String.data(); + llvm::UTF16 *ToPtr = &ToBuf[0]; + + llvm::ConversionResult Result = + llvm::ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, &ToPtr, + ToPtr + NumBytes, llvm::strictConversion); // Check for conversion failure. - if (Result != conversionOK) + if (Result != llvm::conversionOK) Diag(Arg->getLocStart(), diag::warn_cfstring_truncated) << Arg->getSourceRange(); } @@ -4777,16 +4777,16 @@ // hex value. std::string CodePointStr; if (!llvm::sys::locale::isPrint(*csStart)) { - UTF32 CodePoint; - const UTF8 **B = reinterpret_cast(&csStart); - const UTF8 *E = - reinterpret_cast(csStart + csLen); - ConversionResult Result = - llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion); + llvm::UTF32 CodePoint; + const llvm::UTF8 **B = reinterpret_cast(&csStart); + const llvm::UTF8 *E = + reinterpret_cast(csStart + csLen); + llvm::ConversionResult Result = + llvm::convertUTF8Sequence(B, E, &CodePoint, llvm::strictConversion); - if (Result != conversionOK) { + if (Result != llvm::conversionOK) { unsigned char FirstChar = *csStart; - CodePoint = (UTF32)FirstChar; + CodePoint = (llvm::UTF32)FirstChar; } llvm::raw_string_ostream OS(CodePointStr); Index: clang/lib/Sema/SemaExpr.cpp =================================================================== --- clang/lib/Sema/SemaExpr.cpp +++ clang/lib/Sema/SemaExpr.cpp @@ -3070,8 +3070,9 @@ SmallString<32> &Target) { Target.resize(CharByteWidth * (Source.size() + 1)); char *ResultPtr = &Target[0]; - const UTF8 *ErrorPtr; - bool success = ConvertUTF8toWide(CharByteWidth, Source, ResultPtr, ErrorPtr); + const llvm::UTF8 *ErrorPtr; + bool success = + llvm::ConvertUTF8toWide(CharByteWidth, Source, ResultPtr, ErrorPtr); (void)success; assert(success); Target.resize(ResultPtr - &Target[0]); Index: lldb/source/DataFormatters/StringPrinter.cpp =================================================================== --- lldb/source/DataFormatters/StringPrinter.cpp +++ lldb/source/DataFormatters/StringPrinter.cpp @@ -133,7 +133,7 @@ uint8_t *&next) { StringPrinter::StringPrinterBufferPointer<> retval{nullptr}; - unsigned utf8_encoded_len = getNumBytesForUTF8(*buffer); + unsigned utf8_encoded_len = llvm::getNumBytesForUTF8(*buffer); if (1 + buffer_end - buffer < utf8_encoded_len) { // I don't have enough bytes - print whatever I have left @@ -266,9 +266,10 @@ // use this call if you already have an LLDB-side buffer for the data template static bool DumpUTFBufferToStream( - ConversionResult (*ConvertFunction)(const SourceDataType **, - const SourceDataType *, UTF8 **, UTF8 *, - ConversionFlags), + llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, + const SourceDataType *, + llvm::UTF8 **, llvm::UTF8 *, + llvm::ConversionFlags), const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) { Stream &stream(*dump_options.GetStream()); if (dump_options.GetPrefixToken() != 0) @@ -303,30 +304,29 @@ } lldb::DataBufferSP utf8_data_buffer_sp; - UTF8 *utf8_data_ptr = nullptr; - UTF8 *utf8_data_end_ptr = nullptr; + llvm::UTF8 *utf8_data_ptr = nullptr; + llvm::UTF8 *utf8_data_end_ptr = nullptr; if (ConvertFunction) { utf8_data_buffer_sp.reset(new DataBufferHeap(4 * bufferSPSize, 0)); - utf8_data_ptr = (UTF8 *)utf8_data_buffer_sp->GetBytes(); + utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize(); ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr, - utf8_data_end_ptr, lenientConversion); + utf8_data_end_ptr, llvm::lenientConversion); if (false == zero_is_terminator) utf8_data_end_ptr = utf8_data_ptr; + // needed because the ConvertFunction will change the value of the + // data_ptr. utf8_data_ptr = - (UTF8 *)utf8_data_buffer_sp->GetBytes(); // needed because the - // ConvertFunction will - // change the value of the - // data_ptr + (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); } else { // just copy the pointers - the cast is necessary to make the compiler // happy // but this should only happen if we are reading UTF8 data - utf8_data_ptr = - const_cast(reinterpret_cast(data_ptr)); - utf8_data_end_ptr = - const_cast(reinterpret_cast(data_end_ptr)); + utf8_data_ptr = const_cast( + reinterpret_cast(data_ptr)); + utf8_data_end_ptr = const_cast( + reinterpret_cast(data_end_ptr)); } const bool escape_non_printables = dump_options.GetEscapeNonPrintables(); @@ -512,9 +512,10 @@ template static bool ReadUTFBufferAndDumpToStream( const StringPrinter::ReadStringAndDumpToStreamOptions &options, - ConversionResult (*ConvertFunction)(const SourceDataType **, - const SourceDataType *, UTF8 **, UTF8 *, - ConversionFlags)) { + llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, + const SourceDataType *, + llvm::UTF8 **, llvm::UTF8 *, + llvm::ConversionFlags)) { assert(options.GetStream() && "need a Stream to print the string to"); if (options.GetLocation() == 0 || @@ -591,21 +592,23 @@ bool StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF8>( const ReadStringAndDumpToStreamOptions &options) { - return ReadUTFBufferAndDumpToStream(options, nullptr); + return ReadUTFBufferAndDumpToStream(options, nullptr); } template <> bool StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF16>( const ReadStringAndDumpToStreamOptions &options) { - return ReadUTFBufferAndDumpToStream(options, ConvertUTF16toUTF8); + return ReadUTFBufferAndDumpToStream(options, + llvm::ConvertUTF16toUTF8); } template <> bool StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF32>( const ReadStringAndDumpToStreamOptions &options) { - return ReadUTFBufferAndDumpToStream(options, ConvertUTF32toUTF8); + return ReadUTFBufferAndDumpToStream(options, + llvm::ConvertUTF32toUTF8); } template <> @@ -614,7 +617,7 @@ const ReadBufferAndDumpToStreamOptions &options) { assert(options.GetStream() && "need a Stream to print the string to"); - return DumpUTFBufferToStream(nullptr, options); + return DumpUTFBufferToStream(nullptr, options); } template <> @@ -632,7 +635,7 @@ const ReadBufferAndDumpToStreamOptions &options) { assert(options.GetStream() && "need a Stream to print the string to"); - return DumpUTFBufferToStream(ConvertUTF16toUTF8, options); + return DumpUTFBufferToStream(llvm::ConvertUTF16toUTF8, options); } template <> @@ -641,7 +644,7 @@ const ReadBufferAndDumpToStreamOptions &options) { assert(options.GetStream() && "need a Stream to print the string to"); - return DumpUTFBufferToStream(ConvertUTF32toUTF8, options); + return DumpUTFBufferToStream(llvm::ConvertUTF32toUTF8, options); } } // namespace formatters Index: lldb/source/Plugins/Process/minidump/MinidumpTypes.cpp =================================================================== --- lldb/source/Plugins/Process/minidump/MinidumpTypes.cpp +++ lldb/source/Plugins/Process/minidump/MinidumpTypes.cpp @@ -49,7 +49,7 @@ if (error.Fail() || *source_length > data.size() || *source_length % 2 != 0) return llvm::None; - auto source_start = reinterpret_cast(data.data()); + auto source_start = reinterpret_cast(data.data()); // source_length is the length of the string in bytes // we need the length of the string in UTF-16 characters/code points (16 bits // per char) @@ -57,12 +57,12 @@ const auto source_end = source_start + (*source_length) / 2; // resize to worst case length result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * (*source_length) / 2); - auto result_start = reinterpret_cast(&result[0]); + auto result_start = reinterpret_cast(&result[0]); const auto result_end = result_start + result.size(); - ConvertUTF16toUTF8(&source_start, source_end, &result_start, result_end, - strictConversion); + llvm::ConvertUTF16toUTF8(&source_start, source_end, &result_start, result_end, + llvm::strictConversion); const auto result_size = - std::distance(reinterpret_cast(&result[0]), result_start); + std::distance(reinterpret_cast(&result[0]), result_start); result.resize(result_size); // shrink to actual length return result; Index: llvm/include/llvm/Support/ConvertUTF.h =================================================================== --- llvm/include/llvm/Support/ConvertUTF.h +++ llvm/include/llvm/Support/ConvertUTF.h @@ -90,6 +90,14 @@ #ifndef LLVM_SUPPORT_CONVERTUTF_H #define LLVM_SUPPORT_CONVERTUTF_H +#include +#include + +// Wrap everything in namespace llvm so that programs can link with llvm and +// their own version of the unicode libraries. + +namespace llvm { + /* --------------------------------------------------------------------- The following 4 definitions are compiler-specific. The C standard does not guarantee that wchar_t has at least @@ -127,11 +135,6 @@ lenientConversion } ConversionFlags; -/* This is for C++ and does no harm in C */ -#ifdef __cplusplus -extern "C" { -#endif - ConversionResult ConvertUTF8toUTF16 ( const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); @@ -174,16 +177,9 @@ unsigned getNumBytesForUTF8(UTF8 firstByte); -#ifdef __cplusplus -} - /*************************************************************************/ /* Below are LLVM-specific wrappers of the functions above. */ -#include -#include - -namespace llvm { template class ArrayRef; template class SmallVectorImpl; class StringRef; @@ -292,8 +288,4 @@ } /* end namespace llvm */ -#endif - -/* --------------------------------------------------------------------- */ - #endif Index: llvm/lib/Support/CMakeLists.txt =================================================================== --- llvm/lib/Support/CMakeLists.txt +++ llvm/lib/Support/CMakeLists.txt @@ -40,7 +40,7 @@ COM.cpp CommandLine.cpp Compression.cpp - ConvertUTF.c + ConvertUTF.cpp ConvertUTFWrapper.cpp CrashRecoveryContext.cpp DataExtractor.cpp Index: llvm/lib/Support/ConvertUTF.cpp =================================================================== --- llvm/lib/Support/ConvertUTF.cpp +++ llvm/lib/Support/ConvertUTF.cpp @@ -53,6 +53,8 @@ #endif #include +namespace llvm { + static const int halfShift = 10; /* used for shifting by 10 bits */ static const UTF32 halfBase = 0x0010000UL; @@ -62,8 +64,6 @@ #define UNI_SUR_HIGH_END (UTF32)0xDBFF #define UNI_SUR_LOW_START (UTF32)0xDC00 #define UNI_SUR_LOW_END (UTF32)0xDFFF -#define false 0 -#define true 1 /* --------------------------------------------------------------------- */ @@ -706,3 +706,5 @@ similarly unrolled loops. --------------------------------------------------------------------- */ + +} // namespace llvm