diff --git a/llvm/lib/Support/Unicode.cpp b/llvm/lib/Support/Unicode.cpp --- a/llvm/lib/Support/Unicode.cpp +++ b/llvm/lib/Support/Unicode.cpp @@ -300,8 +300,7 @@ /// * 0 for non-spacing and enclosing combining marks; /// * 2 for CJK characters excluding halfwidth forms; /// * 1 for all remaining characters. -static inline int charWidth(int UCS) -{ +static inline int charWidth(int UCS) { if (!isPrintable(UCS)) return ErrorNonPrintableCharacter; @@ -430,26 +429,45 @@ if (CombiningCharacters.contains(UCS)) return 0; + // A code point is treated as double-width if it has the property + // East_Asian_Width=F|W, or if it belongs to one of these blocks: + // Misc Symbols and Pictographs (U+1F300...U+1F5FF), + // Supplemental Symbols and Pictographs (U+1F900...U+1F9FF). static const UnicodeCharRange DoubleWidthCharacterRanges[] = { - // Hangul Jamo - { 0x1100, 0x11FF }, - // Deprecated fullwidth angle brackets - { 0x2329, 0x232A }, - // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi - // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE) - { 0x2E80, 0x303E }, { 0x3040, 0xA4CF }, - // Hangul - { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB }, - // CJK Unified Ideographs - { 0xF900, 0xFAFF }, - // Vertical forms - { 0xFE10, 0xFE19 }, - // CJK Compatibility Forms + Small Form Variants - { 0xFE30, 0xFE6F }, - // Fullwidth forms - { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 }, - // CJK Unified Ideographs - { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F } + {0x1100, 0x115F}, {0x231A, 0x231B}, {0x2329, 0x232A}, + {0x23E9, 0x23EC}, {0x23F0, 0x23F0}, {0x23F3, 0x23F3}, + {0x25FD, 0x25FE}, {0x2614, 0x2615}, {0x2648, 0x2653}, + {0x267F, 0x267F}, {0x2693, 0x2693}, {0x26A1, 0x26A1}, + {0x26AA, 0x26AB}, {0x26BD, 0x26BE}, {0x26C4, 0x26C5}, + {0x26CE, 0x26CE}, {0x26D4, 0x26D4}, {0x26EA, 0x26EA}, + {0x26F2, 0x26F3}, {0x26F5, 0x26F5}, {0x26FA, 0x26FA}, + {0x26FD, 0x26FD}, {0x2705, 0x2705}, {0x270A, 0x270B}, + {0x2728, 0x2728}, {0x274C, 0x274C}, {0x274E, 0x274E}, + {0x2753, 0x2755}, {0x2757, 0x2757}, 
{0x2795, 0x2797}, + {0x27B0, 0x27B0}, {0x27BF, 0x27BF}, {0x2B1B, 0x2B1C}, + {0x2B50, 0x2B50}, {0x2B55, 0x2B55}, {0x2E80, 0x2E99}, + {0x2E9B, 0x2EF3}, {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, + {0x3000, 0x303E}, {0x3041, 0x3096}, {0x3099, 0x30FF}, + {0x3105, 0x312F}, {0x3131, 0x318E}, {0x3190, 0x31E3}, + {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0xA48C}, + {0xA490, 0xA4C6}, {0xA960, 0xA97C}, {0xAC00, 0xD7A3}, + {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, + {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, + {0xFFE0, 0xFFE6}, {0x16FE0, 0x16FE4}, {0x16FF0, 0x16FF1}, + {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08}, + {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE}, + {0x1B000, 0x1B122}, {0x1B132, 0x1B132}, {0x1B150, 0x1B152}, + {0x1B155, 0x1B155}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, + {0x1F004, 0x1F004}, {0x1F0CF, 0x1F0CF}, {0x1F18E, 0x1F18E}, + {0x1F191, 0x1F19A}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23B}, + {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, + {0x1F300, 0x1F64F}, {0x1F680, 0x1F6C5}, {0x1F6CC, 0x1F6CC}, + {0x1F6D0, 0x1F6D2}, {0x1F6D5, 0x1F6D7}, {0x1F6DC, 0x1F6DF}, + {0x1F6EB, 0x1F6EC}, {0x1F6F4, 0x1F6FC}, {0x1F7E0, 0x1F7EB}, + {0x1F7F0, 0x1F7F0}, {0x1F900, 0x1F9FF}, {0x1FA70, 0x1FA7C}, + {0x1FA80, 0x1FA88}, {0x1FA90, 0x1FABD}, {0x1FABF, 0x1FAC5}, + {0x1FACE, 0x1FADB}, {0x1FAE0, 0x1FAE8}, {0x1FAF0, 0x1FAF8}, + {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD} }; static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges); @@ -493,4 +511,3 @@ } // namespace unicode } // namespace sys } // namespace llvm - diff --git a/llvm/unittests/Support/UnicodeTest.cpp b/llvm/unittests/Support/UnicodeTest.cpp --- a/llvm/unittests/Support/UnicodeTest.cpp +++ b/llvm/unittests/Support/UnicodeTest.cpp @@ -45,6 +45,11 @@ EXPECT_EQ(3, columnWidthUTF8("q\344\270\200")); EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200")); + EXPECT_EQ(2, columnWidthUTF8("\u231A")); // WATCH (emoji) + EXPECT_EQ(2, 
columnWidthUTF8("\U0001FADB")); // PEA POD (Unicode 15 emoji) + EXPECT_EQ(2, columnWidthUTF8("\U0001B132")); // HIRAGANA LETTER SMALL KO + EXPECT_EQ(2, columnWidthUTF8("\U00017042")); // TANGUT IDEOGRAPH + // Invalid UTF-8 strings, columnWidthUTF8 should error out. EXPECT_EQ(-2, columnWidthUTF8("\344")); EXPECT_EQ(-2, columnWidthUTF8("\344\270"));