Index: cfe/trunk/lib/Analysis/FormatString.cpp =================================================================== --- cfe/trunk/lib/Analysis/FormatString.cpp +++ cfe/trunk/lib/Analysis/FormatString.cpp @@ -266,14 +266,15 @@ if (SpecifierBegin + 1 >= FmtStrEnd) return false; - const UTF8 *SB = reinterpret_cast(SpecifierBegin + 1); - const UTF8 *SE = reinterpret_cast(FmtStrEnd); + const llvm::UTF8 *SB = + reinterpret_cast(SpecifierBegin + 1); + const llvm::UTF8 *SE = reinterpret_cast(FmtStrEnd); const char FirstByte = *SB; // If the invalid specifier is a multibyte UTF-8 string, return the // total length accordingly so that the conversion specifier can be // properly updated to reflect a complete UTF-8 specifier. - unsigned NumBytes = getNumBytesForUTF8(FirstByte); + unsigned NumBytes = llvm::getNumBytesForUTF8(FirstByte); if (NumBytes == 1) return false; if (SB + NumBytes > SE) Index: cfe/trunk/lib/CodeGen/CodeGenModule.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CodeGenModule.cpp +++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp @@ -3136,13 +3136,12 @@ // Otherwise, convert the UTF8 literals into a string of shorts. IsUTF16 = true; - SmallVector ToBuf(NumBytes + 1); // +1 for ending nulls. - const UTF8 *FromPtr = (const UTF8 *)String.data(); - UTF16 *ToPtr = &ToBuf[0]; - - (void)ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, - &ToPtr, ToPtr + NumBytes, - strictConversion); + SmallVector ToBuf(NumBytes + 1); // +1 for ending nulls. + const llvm::UTF8 *FromPtr = (const llvm::UTF8 *)String.data(); + llvm::UTF16 *ToPtr = &ToBuf[0]; + + (void)llvm::ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, &ToPtr, + ToPtr + NumBytes, llvm::strictConversion); // ConvertUTF8toUTF16 returns the length in ToPtr. StringLength = ToPtr - &ToBuf[0]; Index: cfe/trunk/lib/Format/Encoding.h =================================================================== --- cfe/trunk/lib/Format/Encoding.h +++ cfe/trunk/lib/Format/Encoding.h @@ -33,16 +33,17 @@ /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, /// it is considered UTF8, otherwise we treat it as some 8-bit encoding. inline Encoding detectEncoding(StringRef Text) { - const UTF8 *Ptr = reinterpret_cast(Text.begin()); - const UTF8 *BufEnd = reinterpret_cast(Text.end()); - if (::isLegalUTF8String(&Ptr, BufEnd)) + const llvm::UTF8 *Ptr = reinterpret_cast(Text.begin()); + const llvm::UTF8 *BufEnd = reinterpret_cast(Text.end()); + if (llvm::isLegalUTF8String(&Ptr, BufEnd)) return Encoding_UTF8; return Encoding_Unknown; } inline unsigned getCodePointCountUTF8(StringRef Text) { unsigned CodePoints = 0; - for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { + for (size_t i = 0, e = Text.size(); i < e; + i += llvm::getNumBytesForUTF8(Text[i])) { ++CodePoints; } return CodePoints; @@ -97,7 +98,7 @@ inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { switch (Encoding) { case Encoding_UTF8: - return getNumBytesForUTF8(FirstChar); + return llvm::getNumBytesForUTF8(FirstChar); default: return 1; } @@ -136,7 +137,7 @@ ++I; return I; } - return 1 + getNumBytesForUTF8(Text[1]); + return 1 + llvm::getNumBytesForUTF8(Text[1]); } } Index: cfe/trunk/lib/Frontend/TextDiagnostic.cpp =================================================================== --- cfe/trunk/lib/Frontend/TextDiagnostic.cpp +++ cfe/trunk/lib/Frontend/TextDiagnostic.cpp @@ -119,16 +119,17 @@ begin = reinterpret_cast(&*(SourceLine.begin() + *i)); end = begin + (SourceLine.size() - *i); - if (isLegalUTF8Sequence(begin, end)) { - UTF32 c; - UTF32 *cptr = &c; + if (llvm::isLegalUTF8Sequence(begin, end)) { + llvm::UTF32 c; + llvm::UTF32 *cptr = &c; unsigned char const *original_begin = begin; - unsigned char const *cp_end = begin+getNumBytesForUTF8(SourceLine[*i]); + unsigned char const *cp_end = + begin + llvm::getNumBytesForUTF8(SourceLine[*i]); - ConversionResult res = ConvertUTF8toUTF32(&begin, cp_end, &cptr, cptr+1, - strictConversion); + llvm::ConversionResult res = llvm::ConvertUTF8toUTF32( + &begin, cp_end, &cptr, cptr + 1, llvm::strictConversion); (void)res; - assert(conversionOK==res); + assert(llvm::conversionOK == res); assert(0 < begin-original_begin && "we must be further along in the string now"); *i += begin-original_begin; Index: cfe/trunk/lib/Lex/Lexer.cpp =================================================================== --- cfe/trunk/lib/Lex/Lexer.cpp +++ cfe/trunk/lib/Lex/Lexer.cpp @@ -1485,13 +1485,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { const char *UnicodePtr = CurPtr; - UTF32 CodePoint; - ConversionResult Result = - llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr, - (const UTF8 *)BufferEnd, + llvm::UTF32 CodePoint; + llvm::ConversionResult Result = + llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, + (const llvm::UTF8 *)BufferEnd, &CodePoint, - strictConversion); - if (Result != conversionOK || + llvm::strictConversion); + if (Result != llvm::conversionOK || !isAllowedIDChar(static_cast(CodePoint), LangOpts)) return false; @@ -3625,17 +3625,17 @@ break; } - UTF32 CodePoint; + llvm::UTF32 CodePoint; // We can't just reset CurPtr to BufferPtr because BufferPtr may point to // an escaped newline. --CurPtr; - ConversionResult Status = - llvm::convertUTF8Sequence((const UTF8 **)&CurPtr, - (const UTF8 *)BufferEnd, + llvm::ConversionResult Status = + llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, + (const llvm::UTF8 *)BufferEnd, &CodePoint, - strictConversion); - if (Status == conversionOK) { + llvm::strictConversion); + if (Status == llvm::conversionOK) { if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) return true; // KeepWhitespaceMode Index: cfe/trunk/lib/Lex/LiteralSupport.cpp =================================================================== --- cfe/trunk/lib/Lex/LiteralSupport.cpp +++ cfe/trunk/lib/Lex/LiteralSupport.cpp @@ -402,7 +402,7 @@ if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF32 *ResultPtr = reinterpret_cast(ResultBuf); + llvm::UTF32 *ResultPtr = reinterpret_cast(ResultBuf); *ResultPtr = UcnVal; ResultBuf += 4; return; @@ -411,7 +411,7 @@ if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF16 *ResultPtr = reinterpret_cast(ResultBuf); + llvm::UTF16 *ResultPtr = reinterpret_cast(ResultBuf); if (UcnVal <= (UTF32)0xFFFF) { *ResultPtr = UcnVal; @@ -1114,11 +1114,11 @@ char const *tmp_in_start = start; uint32_t *tmp_out_start = buffer_begin; - ConversionResult res = - ConvertUTF8toUTF32(reinterpret_cast(&start), - reinterpret_cast(begin), - &buffer_begin, buffer_end, strictConversion); - if (res != conversionOK) { + llvm::ConversionResult res = + llvm::ConvertUTF8toUTF32(reinterpret_cast(&start), + reinterpret_cast(begin), + &buffer_begin, buffer_end, llvm::strictConversion); + if (res != llvm::conversionOK) { // If we see bad encoding for unprefixed character literals, warn and // simply copy the byte values, for compatibility with gcc and // older versions of clang. @@ -1510,13 +1510,13 @@ if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF32 *ResultWidePtr = reinterpret_cast(ResultPtr); + llvm::UTF32 *ResultWidePtr = reinterpret_cast(ResultPtr); *ResultWidePtr = ResultChar; ResultPtr += 4; } else if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF16 *ResultWidePtr = reinterpret_cast(ResultPtr); + llvm::UTF16 *ResultWidePtr = reinterpret_cast(ResultPtr); *ResultWidePtr = ResultChar & 0xFFFF; ResultPtr += 2; } else { @@ -1531,12 +1531,12 @@ if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF32 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); + llvm::UTF32 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); ResultWidePtr[0] = GetNumStringChars() - 1; } else if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. - UTF16 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); + llvm::UTF16 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); ResultWidePtr[0] = GetNumStringChars() - 1; } else { assert(CharByteWidth == 1 && "Unexpected char width"); @@ -1570,7 +1570,7 @@ static const char *resyncUTF8(const char *Err, const char *End) { if (Err == End) return End; - End = Err + std::min(getNumBytesForUTF8(*Err), End-Err); + End = Err + std::min(llvm::getNumBytesForUTF8(*Err), End-Err); while (++Err != End && (*Err & 0xC0) == 0x80) ; return Err; @@ -1582,7 +1582,7 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok, const char *TokBegin, StringRef Fragment) { - const UTF8 *ErrorPtrTmp; + const llvm::UTF8 *ErrorPtrTmp; if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) return false; Index: cfe/trunk/lib/Sema/SemaChecking.cpp =================================================================== --- cfe/trunk/lib/Sema/SemaChecking.cpp +++ cfe/trunk/lib/Sema/SemaChecking.cpp @@ -3262,15 +3262,15 @@ if (Literal->containsNonAsciiOrNull()) { StringRef String = Literal->getString(); unsigned NumBytes = String.size(); - SmallVector ToBuf(NumBytes); - const UTF8 *FromPtr = (const UTF8 *)String.data(); - UTF16 *ToPtr = &ToBuf[0]; - - ConversionResult Result = ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, - &ToPtr, ToPtr + NumBytes, - strictConversion); + SmallVector ToBuf(NumBytes); + const llvm::UTF8 *FromPtr = (const llvm::UTF8 *)String.data(); + llvm::UTF16 *ToPtr = &ToBuf[0]; + + llvm::ConversionResult Result = + llvm::ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, &ToPtr, + ToPtr + NumBytes, llvm::strictConversion); // Check for conversion failure. - if (Result != conversionOK) + if (Result != llvm::conversionOK) Diag(Arg->getLocStart(), diag::warn_cfstring_truncated) << Arg->getSourceRange(); } @@ -4777,16 +4777,16 @@ // hex value. std::string CodePointStr; if (!llvm::sys::locale::isPrint(*csStart)) { - UTF32 CodePoint; - const UTF8 **B = reinterpret_cast(&csStart); - const UTF8 *E = - reinterpret_cast(csStart + csLen); - ConversionResult Result = - llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion); + llvm::UTF32 CodePoint; + const llvm::UTF8 **B = reinterpret_cast(&csStart); + const llvm::UTF8 *E = + reinterpret_cast(csStart + csLen); + llvm::ConversionResult Result = + llvm::convertUTF8Sequence(B, E, &CodePoint, llvm::strictConversion); - if (Result != conversionOK) { + if (Result != llvm::conversionOK) { unsigned char FirstChar = *csStart; - CodePoint = (UTF32)FirstChar; + CodePoint = (llvm::UTF32)FirstChar; } llvm::raw_string_ostream OS(CodePointStr); Index: cfe/trunk/lib/Sema/SemaExpr.cpp =================================================================== --- cfe/trunk/lib/Sema/SemaExpr.cpp +++ cfe/trunk/lib/Sema/SemaExpr.cpp @@ -3070,8 +3070,9 @@ SmallString<32> &Target) { Target.resize(CharByteWidth * (Source.size() + 1)); char *ResultPtr = &Target[0]; - const UTF8 *ErrorPtr; - bool success = ConvertUTF8toWide(CharByteWidth, Source, ResultPtr, ErrorPtr); + const llvm::UTF8 *ErrorPtr; + bool success = + llvm::ConvertUTF8toWide(CharByteWidth, Source, ResultPtr, ErrorPtr); (void)success; assert(success); Target.resize(ResultPtr - &Target[0]); Index: lldb/trunk/source/DataFormatters/StringPrinter.cpp =================================================================== --- lldb/trunk/source/DataFormatters/StringPrinter.cpp +++ lldb/trunk/source/DataFormatters/StringPrinter.cpp @@ -133,7 +133,7 @@ uint8_t *&next) { StringPrinter::StringPrinterBufferPointer<> retval{nullptr}; - unsigned utf8_encoded_len = getNumBytesForUTF8(*buffer); + unsigned utf8_encoded_len = llvm::getNumBytesForUTF8(*buffer); if (1 + buffer_end - buffer < utf8_encoded_len) { // I don't have enough bytes - print whatever I have left @@ -266,9 +266,10 @@ // use this call if you already have an LLDB-side buffer for the data template static bool DumpUTFBufferToStream( - ConversionResult (*ConvertFunction)(const SourceDataType **, - const SourceDataType *, UTF8 **, UTF8 *, - ConversionFlags), + llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, + const SourceDataType *, + llvm::UTF8 **, llvm::UTF8 *, + llvm::ConversionFlags), const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) { Stream &stream(*dump_options.GetStream()); if (dump_options.GetPrefixToken() != 0) @@ -303,30 +304,29 @@ } lldb::DataBufferSP utf8_data_buffer_sp; - UTF8 *utf8_data_ptr = nullptr; - UTF8 *utf8_data_end_ptr = nullptr; + llvm::UTF8 *utf8_data_ptr = nullptr; + llvm::UTF8 *utf8_data_end_ptr = nullptr; if (ConvertFunction) { utf8_data_buffer_sp.reset(new DataBufferHeap(4 * bufferSPSize, 0)); - utf8_data_ptr = (UTF8 *)utf8_data_buffer_sp->GetBytes(); + utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize(); ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr, - utf8_data_end_ptr, lenientConversion); + utf8_data_end_ptr, llvm::lenientConversion); if (false == zero_is_terminator) utf8_data_end_ptr = utf8_data_ptr; + // needed because the ConvertFunction will change the value of the + // data_ptr. utf8_data_ptr = - (UTF8 *)utf8_data_buffer_sp->GetBytes(); // needed because the - // ConvertFunction will - // change the value of the - // data_ptr + (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); } else { // just copy the pointers - the cast is necessary to make the compiler // happy // but this should only happen if we are reading UTF8 data - utf8_data_ptr = - const_cast(reinterpret_cast(data_ptr)); - utf8_data_end_ptr = - const_cast(reinterpret_cast(data_end_ptr)); + utf8_data_ptr = const_cast( + reinterpret_cast(data_ptr)); + utf8_data_end_ptr = const_cast( + reinterpret_cast(data_end_ptr)); } const bool escape_non_printables = dump_options.GetEscapeNonPrintables(); @@ -512,9 +512,10 @@ template static bool ReadUTFBufferAndDumpToStream( const StringPrinter::ReadStringAndDumpToStreamOptions &options, - ConversionResult (*ConvertFunction)(const SourceDataType **, - const SourceDataType *, UTF8 **, UTF8 *, - ConversionFlags)) { + llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, + const SourceDataType *, + llvm::UTF8 **, llvm::UTF8 *, + llvm::ConversionFlags)) { assert(options.GetStream() && "need a Stream to print the string to"); if (options.GetLocation() == 0 || @@ -591,21 +592,23 @@ bool StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF8>( const ReadStringAndDumpToStreamOptions &options) { - return ReadUTFBufferAndDumpToStream(options, nullptr); + return ReadUTFBufferAndDumpToStream(options, nullptr); } template <> bool StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF16>( const ReadStringAndDumpToStreamOptions &options) { - return ReadUTFBufferAndDumpToStream(options, ConvertUTF16toUTF8); + return ReadUTFBufferAndDumpToStream(options, + llvm::ConvertUTF16toUTF8); } template <> bool StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF32>( const ReadStringAndDumpToStreamOptions &options) { - return ReadUTFBufferAndDumpToStream(options, ConvertUTF32toUTF8); + return ReadUTFBufferAndDumpToStream(options, + llvm::ConvertUTF32toUTF8); } template <> @@ -614,7 +617,7 @@ const ReadBufferAndDumpToStreamOptions &options) { assert(options.GetStream() && "need a Stream to print the string to"); - return DumpUTFBufferToStream(nullptr, options); + return DumpUTFBufferToStream(nullptr, options); } template <> @@ -632,7 +635,7 @@ const ReadBufferAndDumpToStreamOptions &options) { assert(options.GetStream() && "need a Stream to print the string to"); - return DumpUTFBufferToStream(ConvertUTF16toUTF8, options); + return DumpUTFBufferToStream(llvm::ConvertUTF16toUTF8, options); } template <> @@ -641,7 +644,7 @@ const ReadBufferAndDumpToStreamOptions &options) { assert(options.GetStream() && "need a Stream to print the string to"); - return DumpUTFBufferToStream(ConvertUTF32toUTF8, options); + return DumpUTFBufferToStream(llvm::ConvertUTF32toUTF8, options); } } // namespace formatters Index: lldb/trunk/source/Plugins/Process/minidump/MinidumpTypes.cpp =================================================================== --- lldb/trunk/source/Plugins/Process/minidump/MinidumpTypes.cpp +++ lldb/trunk/source/Plugins/Process/minidump/MinidumpTypes.cpp @@ -49,7 +49,7 @@ if (error.Fail() || *source_length > data.size() || *source_length % 2 != 0) return llvm::None; - auto source_start = reinterpret_cast(data.data()); + auto source_start = reinterpret_cast(data.data()); // source_length is the length of the string in bytes // we need the length of the string in UTF-16 characters/code points (16 bits // per char) @@ -57,12 +57,12 @@ const auto source_end = source_start + (*source_length) / 2; // resize to worst case length result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * (*source_length) / 2); - auto result_start = reinterpret_cast(&result[0]); + auto result_start = reinterpret_cast(&result[0]); const auto result_end = result_start + result.size(); - ConvertUTF16toUTF8(&source_start, source_end, &result_start, result_end, - strictConversion); + llvm::ConvertUTF16toUTF8(&source_start, source_end, &result_start, result_end, + llvm::strictConversion); const auto result_size = - std::distance(reinterpret_cast(&result[0]), result_start); + std::distance(reinterpret_cast(&result[0]), result_start); result.resize(result_size); // shrink to actual length return result; Index: llvm/trunk/include/llvm/Support/ConvertUTF.h =================================================================== --- llvm/trunk/include/llvm/Support/ConvertUTF.h +++ llvm/trunk/include/llvm/Support/ConvertUTF.h @@ -90,6 +90,14 @@ #ifndef LLVM_SUPPORT_CONVERTUTF_H #define LLVM_SUPPORT_CONVERTUTF_H +#include +#include + +// Wrap everything in namespace llvm so that programs can link with llvm and +// their own version of the unicode libraries. + +namespace llvm { + /* --------------------------------------------------------------------- The following 4 definitions are compiler-specific. The C standard does not guarantee that wchar_t has at least @@ -127,11 +135,6 @@ lenientConversion } ConversionFlags; -/* This is for C++ and does no harm in C */ -#ifdef __cplusplus -extern "C" { -#endif - ConversionResult ConvertUTF8toUTF16 ( const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); @@ -174,16 +177,9 @@ unsigned getNumBytesForUTF8(UTF8 firstByte); -#ifdef __cplusplus -} - /*************************************************************************/ /* Below are LLVM-specific wrappers of the functions above. */ -#include -#include - -namespace llvm { template class ArrayRef; template class SmallVectorImpl; class StringRef; @@ -293,7 +289,3 @@ } /* end namespace llvm */ #endif - -/* --------------------------------------------------------------------- */ - -#endif Index: llvm/trunk/lib/Support/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Support/CMakeLists.txt +++ llvm/trunk/lib/Support/CMakeLists.txt @@ -40,7 +40,7 @@ COM.cpp CommandLine.cpp Compression.cpp - ConvertUTF.c + ConvertUTF.cpp ConvertUTFWrapper.cpp CrashRecoveryContext.cpp DataExtractor.cpp Index: llvm/trunk/lib/Support/ConvertUTF.c =================================================================== --- llvm/trunk/lib/Support/ConvertUTF.c +++ llvm/trunk/lib/Support/ConvertUTF.c @@ -1,708 +0,0 @@ -/*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== - * - * The LLVM Compiler Infrastructure - * - * This file is distributed under the University of Illinois Open Source - * License. See LICENSE.TXT for details. - * - *===------------------------------------------------------------------------=*/ -/* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. - */ - -/* --------------------------------------------------------------------- - - Conversions between UTF32, UTF-16, and UTF-8. Source code file. - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Sept 2001: fixed const & error conditions per - mods suggested by S. Parent & A. Lillich. - June 2002: Tim Dodd added detection and handling of incomplete - source sequences, enhanced error detection, added casts - to eliminate compiler warnings. - July 2003: slight mods to back out aggressive FFFE detection. - Jan 2004: updated switches in from-UTF8 conversions. - Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. - - See the header file "ConvertUTF.h" for complete documentation. - ------------------------------------------------------------------------- */ - - -#include "llvm/Support/ConvertUTF.h" -#ifdef CVTUTF_DEBUG -#include -#endif -#include - -static const int halfShift = 10; /* used for shifting by 10 bits */ - -static const UTF32 halfBase = 0x0010000UL; -static const UTF32 halfMask = 0x3FFUL; - -#define UNI_SUR_HIGH_START (UTF32)0xD800 -#define UNI_SUR_HIGH_END (UTF32)0xDBFF -#define UNI_SUR_LOW_START (UTF32)0xDC00 -#define UNI_SUR_LOW_END (UTF32)0xDFFF -#define false 0 -#define true 1 - -/* --------------------------------------------------------------------- */ - -/* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - -/* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - -/* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ -static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -/* --------------------------------------------------------------------- */ - -/* The interface converts a whole buffer to avoid function-call overhead. - * Constants have been gathered. Loops & conditionals have been removed as - * much as possible for efficiency, in favor of drop-through switches. - * (See "Note A" at the bottom of the file for equivalent code.) - * If your compiler supports it, the "isLegalUTF8" call can be turned - * into an inline function. - */ - - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_LEGAL_UTF32) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; - } - *sourceStart = source; - *targetStart = target; -#ifdef CVTUTF_DEBUG -if (result == sourceIllegal) { - fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); - fflush(stderr); -} -#endif - return result; -} -ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion ) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -/* - * Utility routine to tell whether a sequence of bytes is legal UTF-8. - * This must be called with the length pre-determined by the first byte. - * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; - * and the sequence is illegal right away if there aren't that many bytes - * available. - * If presented with a length > 4, this returns false. The Unicode - * definition of UTF-8 goes up to 4-byte sequences. - */ - -static Boolean isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) return false; - return true; -} - -/* --------------------------------------------------------------------- */ - -/* - * Exported function to return whether a UTF-8 sequence is legal or not. - * This is not used here; it's just exported. - */ -Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (length > sourceEnd - source) { - return false; - } - return isLegalUTF8(source, length); -} - -/* --------------------------------------------------------------------- */ - -static unsigned -findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, - const UTF8 *sourceEnd) { - UTF8 b1, b2, b3; - - assert(!isLegalUTF8Sequence(source, sourceEnd)); - - /* - * Unicode 6.3.0, D93b: - * - * Maximal subpart of an ill-formed subsequence: The longest code unit - * subsequence starting at an unconvertible offset that is either: - * a. the initial subsequence of a well-formed code unit sequence, or - * b. a subsequence of length one. - */ - - if (source == sourceEnd) - return 0; - - /* - * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 - * Byte Sequences. - */ - - b1 = *source; - ++source; - if (b1 >= 0xC2 && b1 <= 0xDF) { - /* - * First byte is valid, but we know that this code unit sequence is - * invalid, so the maximal subpart has to end after the first byte. - */ - return 1; - } - - if (source == sourceEnd) - return 1; - - b2 = *source; - ++source; - - if (b1 == 0xE0) { - return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; - } - if (b1 >= 0xE1 && b1 <= 0xEC) { - return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; - } - if (b1 == 0xED) { - return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; - } - if (b1 >= 0xEE && b1 <= 0xEF) { - return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; - } - if (b1 == 0xF0) { - if (b2 >= 0x90 && b2 <= 0xBF) { - if (source == sourceEnd) - return 2; - - b3 = *source; - return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; - } - return 1; - } - if (b1 >= 0xF1 && b1 <= 0xF3) { - if (b2 >= 0x80 && b2 <= 0xBF) { - if (source == sourceEnd) - return 2; - - b3 = *source; - return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; - } - return 1; - } - if (b1 == 0xF4) { - if (b2 >= 0x80 && b2 <= 0x8F) { - if (source == sourceEnd) - return 2; - - b3 = *source; - return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; - } - return 1; - } - - assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); - /* - * There are no valid sequences that start with these bytes. Maximal subpart - * is defined to have length 1 in these cases. - */ - return 1; -} - -/* --------------------------------------------------------------------- */ - -/* - * Exported function to return the total number of bytes in a codepoint - * represented in UTF-8, given the value of the first byte. - */ -unsigned getNumBytesForUTF8(UTF8 first) { - return trailingBytesForUTF8[first] + 1; -} - -/* --------------------------------------------------------------------- */ - -/* - * Exported function to return whether a UTF-8 string is legal or not. - * This is not used here; it's just exported. - */ -Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { - while (*source != sourceEnd) { - int length = trailingBytesForUTF8[**source] + 1; - if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) - return false; - *source += length; - } - return true; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (extraBytesToRead >= sourceEnd - source) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (!isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -static ConversionResult ConvertUTF8toUTF32Impl( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, - Boolean InputIsPartial) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (extraBytesToRead >= sourceEnd - source) { - if (flags == strictConversion || InputIsPartial) { - result = sourceExhausted; - break; - } else { - result = sourceIllegal; - - /* - * Replace the maximal subpart of ill-formed sequence with - * replacement character. - */ - source += findMaximalSubpartOfIllFormedUTF8Sequence(source, - sourceEnd); - *target++ = UNI_REPLACEMENT_CHAR; - continue; - } - } - if (target >= targetEnd) { - result = targetExhausted; break; - } - - /* Do this check whether lenient or strict */ - if (!isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - if (flags == strictConversion) { - /* Abort conversion. */ - break; - } else { - /* - * Replace the maximal subpart of ill-formed sequence with - * replacement character. - */ - source += findMaximalSubpartOfIllFormedUTF8Sequence(source, - sourceEnd); - *target++ = UNI_REPLACEMENT_CHAR; - continue; - } - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, - const UTF8 *sourceEnd, - UTF32 **targetStart, - UTF32 *targetEnd, - ConversionFlags flags) { - return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, - flags, /*InputIsPartial=*/true); -} - -ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, - const UTF8 *sourceEnd, UTF32 **targetStart, - UTF32 *targetEnd, ConversionFlags flags) { - return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, - flags, /*InputIsPartial=*/false); -} - -/* --------------------------------------------------------------------- - - Note A. - The fall-through switches in UTF-8 reading code save a - temp variable, some decrements & conditionals. The switches - are equivalent to the following loop: - { - int tmpBytesToRead = extraBytesToRead+1; - do { - ch += *source++; - --tmpBytesToRead; - if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); - } - In UTF-8 writing code, the switches on "bytesToWrite" are - similarly unrolled loops. - - --------------------------------------------------------------------- */ Index: llvm/trunk/lib/Support/ConvertUTF.cpp =================================================================== --- llvm/trunk/lib/Support/ConvertUTF.cpp +++ llvm/trunk/lib/Support/ConvertUTF.cpp @@ -0,0 +1,710 @@ +/*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===------------------------------------------------------------------------=*/ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Source code file. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. + June 2002: Tim Dodd added detection and handling of incomplete + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. + July 2003: slight mods to back out aggressive FFFE detection. + Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + + See the header file "ConvertUTF.h" for complete documentation. + +------------------------------------------------------------------------ */ + + +#include "llvm/Support/ConvertUTF.h" +#ifdef CVTUTF_DEBUG +#include +#endif +#include + +namespace llvm { + +static const int halfShift = 10; /* used for shifting by 10 bits */ + +static const UTF32 halfBase = 0x0010000UL; +static const UTF32 halfMask = 0x3FFUL; + +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF + +/* --------------------------------------------------------------------- */ + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. + */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +/* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs + * for *legal* UTF-8 will be 4 or fewer bytes total. + */ +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +/* --------------------------------------------------------------------- */ + +/* The interface converts a whole buffer to avoid function-call overhead. + * Constants have been gathered. Loops & conditionals have been removed as + * much as possible for efficiency, in favor of drop-through switches. + * (See "Note A" at the bottom of the file for equivalent code.) + * If your compiler supports it, the "isLegalUTF8" call can be turned + * into an inline function. + */ + + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG +if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); +} +#endif + return result; +} +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns false. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ + +static Boolean isLegalUTF8(const UTF8 *source, int length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) return false; + return true; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (length > sourceEnd - source) { + return false; + } + return isLegalUTF8(source, length); +} + +/* --------------------------------------------------------------------- */ + +static unsigned +findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, + const UTF8 *sourceEnd) { + UTF8 b1, b2, b3; + + assert(!isLegalUTF8Sequence(source, sourceEnd)); + + /* + * Unicode 6.3.0, D93b: + * + * Maximal subpart of an ill-formed subsequence: The longest code unit + * subsequence starting at an unconvertible offset that is either: + * a. the initial subsequence of a well-formed code unit sequence, or + * b. a subsequence of length one. + */ + + if (source == sourceEnd) + return 0; + + /* + * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 + * Byte Sequences. + */ + + b1 = *source; + ++source; + if (b1 >= 0xC2 && b1 <= 0xDF) { + /* + * First byte is valid, but we know that this code unit sequence is + * invalid, so the maximal subpart has to end after the first byte. + */ + return 1; + } + + if (source == sourceEnd) + return 1; + + b2 = *source; + ++source; + + if (b1 == 0xE0) { + return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; + } + if (b1 >= 0xE1 && b1 <= 0xEC) { + return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; + } + if (b1 == 0xED) { + return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; + } + if (b1 >= 0xEE && b1 <= 0xEF) { + return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; + } + if (b1 == 0xF0) { + if (b2 >= 0x90 && b2 <= 0xBF) { + if (source == sourceEnd) + return 2; + + b3 = *source; + return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; + } + return 1; + } + if (b1 >= 0xF1 && b1 <= 0xF3) { + if (b2 >= 0x80 && b2 <= 0xBF) { + if (source == sourceEnd) + return 2; + + b3 = *source; + return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; + } + return 1; + } + if (b1 == 0xF4) { + if (b2 >= 0x80 && b2 <= 0x8F) { + if (source == sourceEnd) + return 2; + + b3 = *source; + return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; + } + return 1; + } + + assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); + /* + * There are no valid sequences that start with these bytes. Maximal subpart + * is defined to have length 1 in these cases. + */ + return 1; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return the total number of bytes in a codepoint + * represented in UTF-8, given the value of the first byte. + */ +unsigned getNumBytesForUTF8(UTF8 first) { + return trailingBytesForUTF8[first] + 1; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 string is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { + while (*source != sourceEnd) { + int length = trailingBytesForUTF8[**source] + 1; + if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) + return false; + *source += length; + } + return true; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (extraBytesToRead >= sourceEnd - source) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +static ConversionResult ConvertUTF8toUTF32Impl( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, + Boolean InputIsPartial) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (extraBytesToRead >= sourceEnd - source) { + if (flags == strictConversion || InputIsPartial) { + result = sourceExhausted; + break; + } else { + result = sourceIllegal; + + /* + * Replace the maximal subpart of ill-formed sequence with + * replacement character. + */ + source += findMaximalSubpartOfIllFormedUTF8Sequence(source, + sourceEnd); + *target++ = UNI_REPLACEMENT_CHAR; + continue; + } + } + if (target >= targetEnd) { + result = targetExhausted; break; + } + + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + if (flags == strictConversion) { + /* Abort conversion. */ + break; + } else { + /* + * Replace the maximal subpart of ill-formed sequence with + * replacement character. + */ + source += findMaximalSubpartOfIllFormedUTF8Sequence(source, + sourceEnd); + *target++ = UNI_REPLACEMENT_CHAR; + continue; + } + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, + const UTF8 *sourceEnd, + UTF32 **targetStart, + UTF32 *targetEnd, + ConversionFlags flags) { + return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, + flags, /*InputIsPartial=*/true); +} + +ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, + const UTF8 *sourceEnd, UTF32 **targetStart, + UTF32 *targetEnd, ConversionFlags flags) { + return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, + flags, /*InputIsPartial=*/false); +} + +/* --------------------------------------------------------------------- + + Note A. + The fall-through switches in UTF-8 reading code save a + temp variable, some decrements & conditionals. The switches + are equivalent to the following loop: + { + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); + } + In UTF-8 writing code, the switches on "bytesToWrite" are + similarly unrolled loops. + + --------------------------------------------------------------------- */ + +} // namespace llvm