Index: include/llvm/Support/ConvertUTF.h =================================================================== --- include/llvm/Support/ConvertUTF.h +++ include/llvm/Support/ConvertUTF.h @@ -1,88 +1,100 @@ -/*===--- ConvertUTF.h - Universal Character Names conversions ---------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *==------------------------------------------------------------------------==*/ /* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. + * Copyright © 1991-2015 Unicode, Inc. All rights reserved. + * Distributed under the Terms of Use in + * http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of the Unicode data files and any associated documentation + * (the "Data Files") or Unicode software and any associated documentation + * (the "Software") to deal in the Data Files or Software + * without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, and/or sell copies of + * the Data Files or Software, and to permit persons to whom the Data Files + * or Software are furnished to do so, provided that + * (a) this copyright and permission notice appear with all copies + * of the Data Files or Software, + * (b) this copyright and permission notice appear in associated + * documentation, and + * (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or + * Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF + * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS + * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL + * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder + * shall not be used in advertising or otherwise to promote the sale, + * use or other dealings in these Data Files or Software without prior + * written authorization of the copyright holder. */ +#ifndef COMMON_CONVERT_UTF_H_ +#define COMMON_CONVERT_UTF_H_ + /* --------------------------------------------------------------------- - Conversions between UTF32, UTF-16, and UTF-8. Header file. +Conversions between UTF32, UTF-16, and UTF-8. Header file. - Several funtions are included here, forming a complete set of - conversions between the three formats. UTF-7 is not included - here, but is handled in a separate source file. - - Each of these routines takes pointers to input buffers and output - buffers. The input buffers are const. - - Each routine converts the text between *sourceStart and sourceEnd, - putting the result into the buffer between *targetStart and - targetEnd. Note: the end pointers are *after* the last item: e.g. - *(sourceEnd - 1) is the last item. - - The return result indicates whether the conversion was successful, - and if not, whether the problem was in the source or target buffers. - (Only the first encountered problem is indicated.) - - After the conversion, *sourceStart and *targetStart are both - updated to point to the end of last text successfully converted in - the respective buffers. - - Input parameters: - sourceStart - pointer to a pointer to the source buffer. - The contents of this are modified on return so that - it points at the next thing to be converted. - targetStart - similarly, pointer to pointer to the target buffer. - sourceEnd, targetEnd - respectively pointers to the ends of the - two buffers, for overflow checking only. - - These conversion functions take a ConversionFlags argument. When this - flag is set to strict, both irregular sequences and isolated surrogates - will cause an error. When the flag is set to lenient, both irregular - sequences and isolated surrogates are converted. - - Whether the flag is strict or lenient, all illegal sequences will cause - an error return. This includes sequences such as: , , - or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code - must check for illegal sequences. - - When the flag is set to lenient, characters over 0x10FFFF are converted - to the replacement character; otherwise (when the flag is set to strict) - they constitute an error. - - Output parameters: - The value "sourceIllegal" is returned from some routines if the input - sequence is malformed. When "sourceIllegal" is returned, the source - value will point to the illegal value that caused the problem. E.g., - in UTF-8 when a sequence is malformed, it points to the start of the - malformed sequence. - - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Fixes & updates, Sept 2001. +Several funtions are included here, forming a complete set of +conversions between the three formats. UTF-7 is not included +here, but is handled in a separate source file. + +Each of these routines takes pointers to input buffers and output +buffers. The input buffers are const. + +Each routine converts the text between *sourceStart and sourceEnd, +putting the result into the buffer between *targetStart and +targetEnd. Note: the end pointers are *after* the last item: e.g. +*(sourceEnd - 1) is the last item. + +The return result indicates whether the conversion was successful, +and if not, whether the problem was in the source or target buffers. +(Only the first encountered problem is indicated.) + +After the conversion, *sourceStart and *targetStart are both +updated to point to the end of last text successfully converted in +the respective buffers. + +Input parameters: +sourceStart - pointer to a pointer to the source buffer. +The contents of this are modified on return so that +it points at the next thing to be converted. +targetStart - similarly, pointer to pointer to the target buffer. +sourceEnd, targetEnd - respectively pointers to the ends of the +two buffers, for overflow checking only. + +These conversion functions take a ConversionFlags argument. When this +flag is set to strict, both irregular sequences and isolated surrogates +will cause an error. When the flag is set to lenient, both irregular +sequences and isolated surrogates are converted. + +Whether the flag is strict or lenient, all illegal sequences will cause +an error return. This includes sequences such as: , , +or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code +must check for illegal sequences. + +When the flag is set to lenient, characters over 0x10FFFF are converted +to the replacement character; otherwise (when the flag is set to strict) +they constitute an error. + +Output parameters: +The value "sourceIllegal" is returned from some routines if the input +sequence is malformed. When "sourceIllegal" is returned, the source +value will point to the illegal value that caused the problem. E.g., +in UTF-8 when a sequence is malformed, it points to the start of the +malformed sequence. + +Author: Mark E. Davis, 1994. +Rev History: Rick McGowan, fixes & updates May 2001. +Fixes & updates, Sept 2001. ------------------------------------------------------------------------ */ @@ -99,17 +111,17 @@ namespace llvm { /* --------------------------------------------------------------------- - The following 4 definitions are compiler-specific. - The C standard does not guarantee that wchar_t has at least - 16 bits, so wchar_t is no less portable than unsigned short! - All should be unsigned values to avoid sign extension during - bit mask & shift operations. +The following 4 definitions are compiler-specific. +The C standard does not guarantee that wchar_t has at least +16 bits, so wchar_t is no less portable than unsigned short! +All should be unsigned values to avoid sign extension during +bit mask & shift operations. ------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +typedef unsigned long UTF32; /* at least 32 bits */ +typedef unsigned short UTF16; /* at least 16 bits */ +typedef unsigned char UTF8; /* typically 8 bits */ +typedef unsigned char Boolean; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -124,52 +136,40 @@ #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ } ConversionResult; typedef enum { - strictConversion = 0, - lenientConversion + strictConversion = 0, + lenientConversion } ConversionFlags; -ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); +/* This is for C++ and does no harm in C */ +#ifdef __cplusplus +extern "C" { +#endif -/** - * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an - * incomplete code unit sequence, returns \c sourceExhausted. - */ -ConversionResult ConvertUTF8toUTF32Partial( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); -/** - * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an - * incomplete code unit sequence, returns \c sourceIllegal. - */ -ConversionResult ConvertUTF8toUTF32( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); +ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); @@ -177,6 +177,12 @@ unsigned getNumBytesForUTF8(UTF8 firstByte); +#ifdef __cplusplus +} +#endif + +/* --------------------------------------------------------------------- */ + /*************************************************************************/ /* Below are LLVM-specific wrappers of the functions above. */ @@ -304,3 +310,5 @@ } /* end namespace llvm */ #endif + +#endif // COMMON_CONVERT_UTF_H_ Index: lib/Support/ConvertUTF.cpp =================================================================== --- lib/Support/ConvertUTF.cpp +++ lib/Support/ConvertUTF.cpp @@ -1,55 +1,65 @@ -/*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=*/ /* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer + * Copyright © 1991-2015 Unicode, Inc. All rights reserved. + * Distributed under the Terms of Use in + * http://www.unicode.org/copyright.html. * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of the Unicode data files and any associated documentation + * (the "Data Files") or Unicode software and any associated documentation + * (the "Software") to deal in the Data Files or Software + * without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, and/or sell copies of + * the Data Files or Software, and to permit persons to whom the Data Files + * or Software are furnished to do so, provided that + * (a) this copyright and permission notice appear with all copies + * of the Data Files or Software, + * (b) this copyright and permission notice appear in associated + * documentation, and + * (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or + * Software that the data or software has been modified. * - * Limitations on Rights to Redistribute This Code + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF + * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS + * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL + * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. + * Except as contained in this notice, the name of a copyright holder + * shall not be used in advertising or otherwise to promote the sale, + * use or other dealings in these Data Files or Software without prior + * written authorization of the copyright holder. */ /* --------------------------------------------------------------------- - Conversions between UTF32, UTF-16, and UTF-8. Source code file. - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Sept 2001: fixed const & error conditions per - mods suggested by S. Parent & A. Lillich. - June 2002: Tim Dodd added detection and handling of incomplete - source sequences, enhanced error detection, added casts - to eliminate compiler warnings. - July 2003: slight mods to back out aggressive FFFE detection. - Jan 2004: updated switches in from-UTF8 conversions. - Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. +Conversions between UTF32, UTF-16, and UTF-8. Source code file. +Author: Mark E. Davis, 1994. +Rev History: Rick McGowan, fixes & updates May 2001. +Sept 2001: fixed const & error conditions per +mods suggested by S. Parent & A. Lillich. +June 2002: Tim Dodd added detection and handling of incomplete +source sequences, enhanced error detection, added casts +to eliminate compiler warnings. +July 2003: slight mods to back out aggressive FFFE detection. +Jan 2004: updated switches in from-UTF8 conversions. +Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. - See the header file "ConvertUTF.h" for complete documentation. +See the header file "ConvertUTF.h" for complete documentation. ------------------------------------------------------------------------ */ + #include "llvm/Support/ConvertUTF.h" #ifdef CVTUTF_DEBUG #include #endif -#include + /* * This code extensively uses fall-through switches. @@ -81,6 +91,7 @@ namespace llvm { + static const int halfShift = 10; /* used for shifting by 10 bits */ static const UTF32 halfBase = 0x0010000UL; @@ -91,6 +102,117 @@ #define UNI_SUR_LOW_START (UTF32)0xDC00 #define UNI_SUR_LOW_END (UTF32)0xDFFF +#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } +*sourceStart = source; +*targetStart = target; +return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG + if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); + } +#endif + return result; +} + /* --------------------------------------------------------------------- */ /* @@ -101,14 +223,14 @@ * allowed in earlier algorithms. */ static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; /* @@ -117,7 +239,7 @@ * in a UTF-8 sequence. */ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; /* * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed @@ -131,238 +253,80 @@ /* --------------------------------------------------------------------- */ /* The interface converts a whole buffer to avoid function-call overhead. - * Constants have been gathered. Loops & conditionals have been removed as - * much as possible for efficiency, in favor of drop-through switches. - * (See "Note A" at the bottom of the file for equivalent code.) - * If your compiler supports it, the "isLegalUTF8" call can be turned - * into an inline function. - */ - - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_LEGAL_UTF32) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } +* Constants have been gathered. Loops & conditionals have been removed as +* much as possible for efficiency, in favor of drop-through switches. +* (See "Note A" at the bottom of the file for equivalent code.) +* If your compiler supports it, the "isLegalUTF8" call can be turned +* into an inline function. +*/ + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; - } - *sourceStart = source; - *targetStart = target; -#ifdef CVTUTF_DEBUG -if (result == sourceIllegal) { - fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); - fflush(stderr); -} -#endif - return result; -} -ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion ) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; + target += bytesToWrite; + } +*sourceStart = source; +*targetStart = target; +return result; } /* --------------------------------------------------------------------- */ @@ -379,135 +343,28 @@ */ static Boolean isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { default: return false; - /* Everything else falls through when "true"... */ + /* Everything else falls through when "true"... */ case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) return false; - return true; -} - -/* --------------------------------------------------------------------- */ - -/* - * Exported function to return whether a UTF-8 sequence is legal or not. - * This is not used here; it's just exported. - */ -Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (length > sourceEnd - source) { - return false; - } - return isLegalUTF8(source, length); -} - -/* --------------------------------------------------------------------- */ - -static unsigned -findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, - const UTF8 *sourceEnd) { - UTF8 b1, b2, b3; - - assert(!isLegalUTF8Sequence(source, sourceEnd)); - - /* - * Unicode 6.3.0, D93b: - * - * Maximal subpart of an ill-formed subsequence: The longest code unit - * subsequence starting at an unconvertible offset that is either: - * a. the initial subsequence of a well-formed code unit sequence, or - * b. a subsequence of length one. - */ - - if (source == sourceEnd) - return 0; - - /* - * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 - * Byte Sequences. - */ - - b1 = *source; - ++source; - if (b1 >= 0xC2 && b1 <= 0xDF) { - /* - * First byte is valid, but we know that this code unit sequence is - * invalid, so the maximal subpart has to end after the first byte. - */ - return 1; + case 1: if (*source >= 0x80 && *source < 0xC2) return false; } - - if (source == sourceEnd) - return 1; - - b2 = *source; - ++source; - - if (b1 == 0xE0) { - return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; - } - if (b1 >= 0xE1 && b1 <= 0xEC) { - return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; - } - if (b1 == 0xED) { - return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; - } - if (b1 >= 0xEE && b1 <= 0xEF) { - return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; - } - if (b1 == 0xF0) { - if (b2 >= 0x90 && b2 <= 0xBF) { - if (source == sourceEnd) - return 2; - - b3 = *source; - return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; - } - return 1; - } - if (b1 >= 0xF1 && b1 <= 0xF3) { - if (b2 >= 0x80 && b2 <= 0xBF) { - if (source == sourceEnd) - return 2; - - b3 = *source; - return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; - } - return 1; - } - if (b1 == 0xF4) { - if (b2 >= 0x80 && b2 <= 0x8F) { - if (source == sourceEnd) - return 2; - - b3 = *source; - return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; - } - return 1; - } - - assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); - /* - * There are no valid sequences that start with these bytes. Maximal subpart - * is defined to have length 1 in these cases. - */ - return 1; + if (*source > 0xF4) return false; + return true; } /* --------------------------------------------------------------------- */ @@ -520,8 +377,6 @@ return trailingBytesForUTF8[first] + 1; } -/* --------------------------------------------------------------------- */ - /* * Exported function to return whether a UTF-8 string is legal or not. * This is not used here; it's just exported. @@ -536,202 +391,221 @@ return true; } -/* --------------------------------------------------------------------- */ +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + return isLegalUTF8(source, length); +} -ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (extraBytesToRead >= sourceEnd - source) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (!isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; +/* --------------------------------------------------------------------- */ - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } +ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + *target++ = UNI_REPLACEMENT_CHAR; } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } - *sourceStart = source; - *targetStart = target; - return result; + } +*sourceStart = source; +*targetStart = target; +return result; } /* --------------------------------------------------------------------- */ -static ConversionResult ConvertUTF8toUTF32Impl( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, - Boolean InputIsPartial) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (extraBytesToRead >= sourceEnd - source) { - if (flags == strictConversion || InputIsPartial) { - result = sourceExhausted; - break; - } else { - result = sourceIllegal; - - /* - * Replace the maximal subpart of ill-formed sequence with - * replacement character. - */ - source += findMaximalSubpartOfIllFormedUTF8Sequence(source, - sourceEnd); - *target++ = UNI_REPLACEMENT_CHAR; - continue; - } - } - if (target >= targetEnd) { - result = targetExhausted; break; - } +ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } +*sourceStart = source; +*targetStart = target; +return result; +} - /* Do this check whether lenient or strict */ - if (!isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - if (flags == strictConversion) { - /* Abort conversion. */ - break; - } else { - /* - * Replace the maximal subpart of ill-formed sequence with - * replacement character. - */ - source += findMaximalSubpartOfIllFormedUTF8Sequence(source, - sourceEnd); - *target++ = UNI_REPLACEMENT_CHAR; - continue; - } - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; +/* --------------------------------------------------------------------- */ - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; +ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; } - *sourceStart = source; - *targetStart = target; - return result; -} - -ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, - const UTF8 *sourceEnd, - UTF32 **targetStart, - UTF32 *targetEnd, - ConversionFlags flags) { - return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, - flags, /*InputIsPartial=*/true); -} - -ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, - const UTF8 *sourceEnd, UTF32 **targetStart, - UTF32 *targetEnd, ConversionFlags flags) { - return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, - flags, /*InputIsPartial=*/false); + } + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- - Note A. - The fall-through switches in UTF-8 reading code save a - temp variable, some decrements & conditionals. The switches - are equivalent to the following loop: - { - int tmpBytesToRead = extraBytesToRead+1; - do { - ch += *source++; - --tmpBytesToRead; - if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); - } - In UTF-8 writing code, the switches on "bytesToWrite" are - similarly unrolled loops. +Note A. +The fall-through switches in UTF-8 reading code save a +temp variable, some decrements & conditionals. The switches +are equivalent to the following loop: +{ + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); +} +In UTF-8 writing code, the switches on "bytesToWrite" are +similarly unrolled loops. - --------------------------------------------------------------------- */ +--------------------------------------------------------------------- */ } // namespace llvm Index: lib/Support/ConvertUTFWrapper.cpp =================================================================== --- lib/Support/ConvertUTFWrapper.cpp +++ lib/Support/ConvertUTFWrapper.cpp @@ -62,7 +62,7 @@ return result == conversionOK; } -bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { +bool ConvertCodePointToUTF8(unsigned long Source, char *&ResultPtr) { const UTF32 *SourceStart = &Source; const UTF32 *SourceEnd = SourceStart + 1; UTF8 *TargetStart = reinterpret_cast(ResultPtr); Index: unittests/Support/ConvertUTFTest.cpp =================================================================== --- unittests/Support/ConvertUTFTest.cpp +++ unittests/Support/ConvertUTFTest.cpp @@ -115,7 +115,7 @@ struct ConvertUTFResultContainer { ConversionResult ErrorCode; - std::vector UnicodeScalars; + std::vector UnicodeScalars; ConvertUTFResultContainer(ConversionResult ErrorCode) : ErrorCode(ErrorCode) {} @@ -146,7 +146,7 @@ } }; -std::pair> +std::pair> ConvertUTF8ToUnicodeScalarsLenient(StringRef S) { const UTF8 *SourceStart = reinterpret_cast(S.data()); @@ -163,32 +163,32 @@ return std::make_pair(ErrorCode, Decoded); } -std::pair> -ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { - const UTF8 *SourceStart = reinterpret_cast(S.data()); - - const UTF8 *SourceNext = SourceStart; - std::vector Decoded(S.size(), 0); - UTF32 *TargetStart = Decoded.data(); +// std::pair> +// ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { +// const UTF8 *SourceStart = reinterpret_cast(S.data()); + +// const UTF8 *SourceNext = SourceStart; +// std::vector Decoded(S.size(), 0); +// UTF32 *TargetStart = Decoded.data(); + +// auto ErrorCode = ConvertUTF8toUTF32Partial( +// &SourceNext, SourceStart + S.size(), &TargetStart, +// Decoded.data() + Decoded.size(), lenientConversion); - auto ErrorCode = ConvertUTF8toUTF32Partial( - &SourceNext, SourceStart + S.size(), &TargetStart, - Decoded.data() + Decoded.size(), lenientConversion); +// Decoded.resize(TargetStart - Decoded.data()); - Decoded.resize(TargetStart - Decoded.data()); - - return std::make_pair(ErrorCode, Decoded); -} +// return std::make_pair(ErrorCode, Decoded); +// } ::testing::AssertionResult CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected, StringRef S, bool Partial = false) { ConversionResult ErrorCode; - std::vector Decoded; + std::vector Decoded; if (!Partial) std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S); - else - std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); +// else +// std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); if (Expected.ErrorCode != ErrorCode) return ::testing::AssertionFailure() << "Expected error code "