Index: include/llvm/Support/DJB.h =================================================================== --- include/llvm/Support/DJB.h +++ include/llvm/Support/DJB.h @@ -20,6 +20,10 @@ /// The Bernstein hash function used by the DWARF accelerator tables. uint32_t djbHash(StringRef Buffer, uint32_t H = 5381); + +/// Computes the Bernstein hash after folding the input according to the Dwarf 5 +/// standard case folding rules. +uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381); } // namespace llvm #endif // LLVM_SUPPORT_DJB_H Index: include/llvm/Support/Unicode.h =================================================================== --- include/llvm/Support/Unicode.h +++ include/llvm/Support/Unicode.h @@ -60,6 +60,10 @@ /// * 1 for each of the remaining characters. int columnWidthUTF8(StringRef Text); +/// Fold input unicode character according the the Simple unicode case folding +/// rules. +int foldCharSimple(int C); + } // namespace unicode } // namespace sys } // namespace llvm Index: lib/Support/CMakeLists.txt =================================================================== --- lib/Support/CMakeLists.txt +++ lib/Support/CMakeLists.txt @@ -113,6 +113,7 @@ Triple.cpp Twine.cpp Unicode.cpp + UnicodeCaseFold.cpp YAMLParser.cpp YAMLTraits.cpp raw_os_ostream.cpp Index: lib/Support/DJB.cpp =================================================================== --- lib/Support/DJB.cpp +++ lib/Support/DJB.cpp @@ -12,9 +12,85 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DJB.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Unicode.h" + +using namespace llvm; + +static inline uint32_t djbHashChar(char C, uint32_t H) { + return (H << 5) + H + C; +} uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) { for (char C : Buffer.bytes()) - H = ((H << 5) + H) + C; + H = djbHashChar(C, H); + return H; +} + +static UTF32 chopOneUTF32(StringRef &Buffer) { + UTF32 C; + const UTF8 *const Begin8Const = + reinterpret_cast(Buffer.begin()); + const UTF8 *Begin8 = Begin8Const; + UTF32 *Begin32 = &C; + + // In lenient mode we will always end up with a "reasonable" value in C for + // non-empty input. + assert(!Buffer.empty()); + ConvertUTF8toUTF32(&Begin8, reinterpret_cast(Buffer.end()), + &Begin32, &C + 1, lenientConversion); + Buffer = Buffer.drop_front(Begin8 - Begin8Const); + return C; +} + +static StringRef toUTF8(UTF32 C, MutableArrayRef Storage) { + const UTF32 *Begin32 = &C; + UTF8 *Begin8 = Storage.begin(); + + // The case-folded output should always be a valid unicode character, so use + // strict mode here. + ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8, + Storage.end(), strictConversion); + assert(CR == conversionOK && "Case folding produced invalid char?"); + (void)CR; + return StringRef(reinterpret_cast(Storage.begin()), + Begin8 - Storage.begin()); +} + +static UTF32 foldCharDwarf(UTF32 C) { + // DWARF v5 addition to the unicode folding rules. + // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot + // Above" into "i". + if (C == 0x130 || C == 0x131) + return 'i'; + return sys::unicode::foldCharSimple(C); +} + +static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) { + UTF32 C = chopOneUTF32(Buffer); + + C = foldCharDwarf(C); + + std::array Storage; + StringRef Folded = toUTF8(C, Storage); + return djbHash(Folded, H); +} + +uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) { + while (!Buffer.empty()) { + unsigned char C = Buffer.front(); + if (LLVM_LIKELY(C <= 0x7f)) { + // US-ASCII, encoded as one character in utf-8. + // This is by far the most common case, so handle this specially. + if (C >= 'A' && C <= 'Z') + C = 'a' + (C - 'A'); // fold uppercase into lowercase + H = djbHashChar(C, H); + Buffer = Buffer.drop_front(); + continue; + } + H = caseFoldingDjbHashCharSlow(Buffer, H); + } return H; } Index: lib/Support/UnicodeCaseFold.cpp =================================================================== --- /dev/null +++ lib/Support/UnicodeCaseFold.cpp @@ -0,0 +1,500 @@ +//===---------- Support/UnicodeCaseFold.cpp -------------------------------===// +// +// This file was generated by utils/unicode-case-fold.py from the Unicode +// case folding database at +// http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt +// +// To regenerate this file, run: +// utils/unicode-case-fold.py \ +// "http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt" \ +// > lib/Support/UnicodeCaseFold.cpp +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Unicode.h" + +int llvm::sys::unicode::foldCharSimple(int C) { + // 26 characters + if (C >= 0x0041 && C <= 0x005a && C % 1 == 0) + return C + 32; + // 23 characters + if (C >= 0x00c0 && C <= 0x00d6 && C % 1 == 0) + return C + 32; + // 7 characters + if (C >= 0x00d8 && C <= 0x00de && C % 1 == 0) + return C + 32; + // 24 characters + if (C >= 0x0100 && C <= 0x012e) + return C | 1; + // 3 characters + if (C >= 0x0132 && C <= 0x0136) + return C | 1; + // 8 characters + if (C >= 0x0139 && C <= 0x0147 && C % 2 == 1) + return C + 1; + // 23 characters + if (C >= 0x014a && C <= 0x0176) + return C | 1; + // 3 characters + if (C >= 0x0179 && C <= 0x017d && C % 2 == 1) + return C + 1; + // 3 characters + if (C >= 0x01a0 && C <= 0x01a4) + return C | 1; + // 9 characters + if (C >= 0x01cb && C <= 0x01db && C % 2 == 1) + return C + 1; + // 9 characters + if (C >= 0x01de && C <= 0x01ee) + return C | 1; + // 20 characters + if (C >= 0x01f8 && C <= 0x021e) + return C | 1; + // 9 characters + if (C >= 0x0222 && C <= 0x0232) + return C | 1; + // 5 characters + if (C >= 0x0246 && C <= 0x024e) + return C | 1; + // 3 characters + if (C >= 0x0388 && C <= 0x038a && C % 1 == 0) + return C + 37; + // 17 characters + if (C >= 0x0391 && C <= 0x03a1 && C % 1 == 0) + return C + 32; + // 9 characters + if (C >= 0x03a3 && C <= 0x03ab && C % 1 == 0) + return C + 32; + // 12 characters + if (C >= 0x03d8 && C <= 0x03ee) + return C | 1; + // 3 characters + if (C >= 0x03fd && C <= 0x03ff && C % 1 == 0) + return C + -130; + // 16 characters + if (C >= 0x0400 && C <= 0x040f && C % 1 == 0) + return C + 80; + // 32 characters + if (C >= 0x0410 && C <= 0x042f && C % 1 == 0) + return C + 32; + // 17 characters + if (C >= 0x0460 && C <= 0x0480) + return C | 1; + // 27 characters + if (C >= 0x048a && C <= 0x04be) + return C | 1; + // 7 characters + if (C >= 0x04c1 && C <= 0x04cd && C % 2 == 1) + return C + 1; + // 48 characters + if (C >= 0x04d0 && C <= 0x052e) + return C | 1; + // 38 characters + if (C >= 0x0531 && C <= 0x0556 && C % 1 == 0) + return C + 48; + // 38 characters + if (C >= 0x10a0 && C <= 0x10c5 && C % 1 == 0) + return C + 7264; + // 6 characters + if (C >= 0x13f8 && C <= 0x13fd && C % 1 == 0) + return C + -8; + // 75 characters + if (C >= 0x1e00 && C <= 0x1e94) + return C | 1; + // 48 characters + if (C >= 0x1ea0 && C <= 0x1efe) + return C | 1; + // 8 characters + if (C >= 0x1f08 && C <= 0x1f0f && C % 1 == 0) + return C + -8; + // 6 characters + if (C >= 0x1f18 && C <= 0x1f1d && C % 1 == 0) + return C + -8; + // 8 characters + if (C >= 0x1f28 && C <= 0x1f2f && C % 1 == 0) + return C + -8; + // 8 characters + if (C >= 0x1f38 && C <= 0x1f3f && C % 1 == 0) + return C + -8; + // 6 characters + if (C >= 0x1f48 && C <= 0x1f4d && C % 1 == 0) + return C + -8; + // 4 characters + if (C >= 0x1f59 && C <= 0x1f5f && C % 2 == 1) + return C + -8; + // 8 characters + if (C >= 0x1f68 && C <= 0x1f6f && C % 1 == 0) + return C + -8; + // 8 characters + if (C >= 0x1f88 && C <= 0x1f8f && C % 1 == 0) + return C + -8; + // 8 characters + if (C >= 0x1f98 && C <= 0x1f9f && C % 1 == 0) + return C + -8; + // 8 characters + if (C >= 0x1fa8 && C <= 0x1faf && C % 1 == 0) + return C + -8; + // 4 characters + if (C >= 0x1fc8 && C <= 0x1fcb && C % 1 == 0) + return C + -86; + // 16 characters + if (C >= 0x2160 && C <= 0x216f && C % 1 == 0) + return C + 16; + // 26 characters + if (C >= 0x24b6 && C <= 0x24cf && C % 1 == 0) + return C + 26; + // 47 characters + if (C >= 0x2c00 && C <= 0x2c2e && C % 1 == 0) + return C + 48; + // 3 characters + if (C >= 0x2c67 && C <= 0x2c6b && C % 2 == 1) + return C + 1; + // 50 characters + if (C >= 0x2c80 && C <= 0x2ce2) + return C | 1; + // 22 characters + if (C >= 0xa642 && C <= 0xa66c) + return C | 1; + // 14 characters + if (C >= 0xa680 && C <= 0xa69a) + return C | 1; + // 7 characters + if (C >= 0xa722 && C <= 0xa72e) + return C | 1; + // 31 characters + if (C >= 0xa732 && C <= 0xa76e) + return C | 1; + // 5 characters + if (C >= 0xa77e && C <= 0xa786) + return C | 1; + // 10 characters + if (C >= 0xa796 && C <= 0xa7a8) + return C | 1; + // 80 characters + if (C >= 0xab70 && C <= 0xabbf && C % 1 == 0) + return C + -38864; + // 26 characters + if (C >= 0xff21 && C <= 0xff3a && C % 1 == 0) + return C + 32; + // 40 characters + if (C >= 0x10400 && C <= 0x10427 && C % 1 == 0) + return C + 40; + // 36 characters + if (C >= 0x104b0 && C <= 0x104d3 && C % 1 == 0) + return C + 40; + // 51 characters + if (C >= 0x10c80 && C <= 0x10cb2 && C % 1 == 0) + return C + 64; + // 32 characters + if (C >= 0x118a0 && C <= 0x118bf && C % 1 == 0) + return C + 32; + // 34 characters + if (C >= 0x1e900 && C <= 0x1e921 && C % 1 == 0) + return C + 34; + + switch (C) { + case 0x00b5: // MICRO SIGN + return 0x03bc; + case 0x0178: // LATIN CAPITAL LETTER Y WITH DIAERESIS + return 0x00ff; + case 0x017f: // LATIN SMALL LETTER LONG S + return 0x0073; + case 0x0181: // LATIN CAPITAL LETTER B WITH HOOK + return 0x0253; + case 0x0182: // LATIN CAPITAL LETTER B WITH TOPBAR + return 0x0183; + case 0x0184: // LATIN CAPITAL LETTER TONE SIX + return 0x0185; + case 0x0186: // LATIN CAPITAL LETTER OPEN O + return 0x0254; + case 0x0187: // LATIN CAPITAL LETTER C WITH HOOK + return 0x0188; + case 0x0189: // LATIN CAPITAL LETTER AFRICAN D + return 0x0256; + case 0x018a: // LATIN CAPITAL LETTER D WITH HOOK + return 0x0257; + case 0x018b: // LATIN CAPITAL LETTER D WITH TOPBAR + return 0x018c; + case 0x018e: // LATIN CAPITAL LETTER REVERSED E + return 0x01dd; + case 0x018f: // LATIN CAPITAL LETTER SCHWA + return 0x0259; + case 0x0190: // LATIN CAPITAL LETTER OPEN E + return 0x025b; + case 0x0191: // LATIN CAPITAL LETTER F WITH HOOK + return 0x0192; + case 0x0193: // LATIN CAPITAL LETTER G WITH HOOK + return 0x0260; + case 0x0194: // LATIN CAPITAL LETTER GAMMA + return 0x0263; + case 0x0196: // LATIN CAPITAL LETTER IOTA + return 0x0269; + case 0x0197: // LATIN CAPITAL LETTER I WITH STROKE + return 0x0268; + case 0x0198: // LATIN CAPITAL LETTER K WITH HOOK + return 0x0199; + case 0x019c: // LATIN CAPITAL LETTER TURNED M + return 0x026f; + case 0x019d: // LATIN CAPITAL LETTER N WITH LEFT HOOK + return 0x0272; + case 0x019f: // LATIN CAPITAL LETTER O WITH MIDDLE TILDE + return 0x0275; + case 0x01a6: // LATIN LETTER YR + return 0x0280; + case 0x01a7: // LATIN CAPITAL LETTER TONE TWO + return 0x01a8; + case 0x01a9: // LATIN CAPITAL LETTER ESH + return 0x0283; + case 0x01ac: // LATIN CAPITAL LETTER T WITH HOOK + return 0x01ad; + case 0x01ae: // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK + return 0x0288; + case 0x01af: // LATIN CAPITAL LETTER U WITH HORN + return 0x01b0; + case 0x01b1: // LATIN CAPITAL LETTER UPSILON + return 0x028a; + case 0x01b2: // LATIN CAPITAL LETTER V WITH HOOK + return 0x028b; + case 0x01b3: // LATIN CAPITAL LETTER Y WITH HOOK + return 0x01b4; + case 0x01b5: // LATIN CAPITAL LETTER Z WITH STROKE + return 0x01b6; + case 0x01b7: // LATIN CAPITAL LETTER EZH + return 0x0292; + case 0x01b8: // LATIN CAPITAL LETTER EZH REVERSED + return 0x01b9; + case 0x01bc: // LATIN CAPITAL LETTER TONE FIVE + return 0x01bd; + case 0x01c4: // LATIN CAPITAL LETTER DZ WITH CARON + return 0x01c6; + case 0x01c5: // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON + return 0x01c6; + case 0x01c7: // LATIN CAPITAL LETTER LJ + return 0x01c9; + case 0x01c8: // LATIN CAPITAL LETTER L WITH SMALL LETTER J + return 0x01c9; + case 0x01ca: // LATIN CAPITAL LETTER NJ + return 0x01cc; + case 0x01f1: // LATIN CAPITAL LETTER DZ + return 0x01f3; + case 0x01f2: // LATIN CAPITAL LETTER D WITH SMALL LETTER Z + return 0x01f3; + case 0x01f4: // LATIN CAPITAL LETTER G WITH ACUTE + return 0x01f5; + case 0x01f6: // LATIN CAPITAL LETTER HWAIR + return 0x0195; + case 0x01f7: // LATIN CAPITAL LETTER WYNN + return 0x01bf; + case 0x0220: // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG + return 0x019e; + case 0x023a: // LATIN CAPITAL LETTER A WITH STROKE + return 0x2c65; + case 0x023b: // LATIN CAPITAL LETTER C WITH STROKE + return 0x023c; + case 0x023d: // LATIN CAPITAL LETTER L WITH BAR + return 0x019a; + case 0x023e: // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE + return 0x2c66; + case 0x0241: // LATIN CAPITAL LETTER GLOTTAL STOP + return 0x0242; + case 0x0243: // LATIN CAPITAL LETTER B WITH STROKE + return 0x0180; + case 0x0244: // LATIN CAPITAL LETTER U BAR + return 0x0289; + case 0x0245: // LATIN CAPITAL LETTER TURNED V + return 0x028c; + case 0x0345: // COMBINING GREEK YPOGEGRAMMENI + return 0x03b9; + case 0x0370: // GREEK CAPITAL LETTER HETA + return 0x0371; + case 0x0372: // GREEK CAPITAL LETTER ARCHAIC SAMPI + return 0x0373; + case 0x0376: // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA + return 0x0377; + case 0x037f: // GREEK CAPITAL LETTER YOT + return 0x03f3; + case 0x0386: // GREEK CAPITAL LETTER ALPHA WITH TONOS + return 0x03ac; + case 0x038c: // GREEK CAPITAL LETTER OMICRON WITH TONOS + return 0x03cc; + case 0x038e: // GREEK CAPITAL LETTER UPSILON WITH TONOS + return 0x03cd; + case 0x038f: // GREEK CAPITAL LETTER OMEGA WITH TONOS + return 0x03ce; + case 0x03c2: // GREEK SMALL LETTER FINAL SIGMA + return 0x03c3; + case 0x03cf: // GREEK CAPITAL KAI SYMBOL + return 0x03d7; + case 0x03d0: // GREEK BETA SYMBOL + return 0x03b2; + case 0x03d1: // GREEK THETA SYMBOL + return 0x03b8; + case 0x03d5: // GREEK PHI SYMBOL + return 0x03c6; + case 0x03d6: // GREEK PI SYMBOL + return 0x03c0; + case 0x03f0: // GREEK KAPPA SYMBOL + return 0x03ba; + case 0x03f1: // GREEK RHO SYMBOL + return 0x03c1; + case 0x03f4: // GREEK CAPITAL THETA SYMBOL + return 0x03b8; + case 0x03f5: // GREEK LUNATE EPSILON SYMBOL + return 0x03b5; + case 0x03f7: // GREEK CAPITAL LETTER SHO + return 0x03f8; + case 0x03f9: // GREEK CAPITAL LUNATE SIGMA SYMBOL + return 0x03f2; + case 0x03fa: // GREEK CAPITAL LETTER SAN + return 0x03fb; + case 0x04c0: // CYRILLIC LETTER PALOCHKA + return 0x04cf; + case 0x10c7: // GEORGIAN CAPITAL LETTER YN + return 0x2d27; + case 0x10cd: // GEORGIAN CAPITAL LETTER AEN + return 0x2d2d; + case 0x1c80: // CYRILLIC SMALL LETTER ROUNDED VE + return 0x0432; + case 0x1c81: // CYRILLIC SMALL LETTER LONG-LEGGED DE + return 0x0434; + case 0x1c82: // CYRILLIC SMALL LETTER NARROW O + return 0x043e; + case 0x1c83: // CYRILLIC SMALL LETTER WIDE ES + return 0x0441; + case 0x1c84: // CYRILLIC SMALL LETTER TALL TE + return 0x0442; + case 0x1c85: // CYRILLIC SMALL LETTER THREE-LEGGED TE + return 0x0442; + case 0x1c86: // CYRILLIC SMALL LETTER TALL HARD SIGN + return 0x044a; + case 0x1c87: // CYRILLIC SMALL LETTER TALL YAT + return 0x0463; + case 0x1c88: // CYRILLIC SMALL LETTER UNBLENDED UK + return 0xa64b; + case 0x1e9b: // LATIN SMALL LETTER LONG S WITH DOT ABOVE + return 0x1e61; + case 0x1e9e: // LATIN CAPITAL LETTER SHARP S + return 0x00df; + case 0x1fb8: // GREEK CAPITAL LETTER ALPHA WITH VRACHY + return 0x1fb0; + case 0x1fb9: // GREEK CAPITAL LETTER ALPHA WITH MACRON + return 0x1fb1; + case 0x1fba: // GREEK CAPITAL LETTER ALPHA WITH VARIA + return 0x1f70; + case 0x1fbb: // GREEK CAPITAL LETTER ALPHA WITH OXIA + return 0x1f71; + case 0x1fbc: // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + return 0x1fb3; + case 0x1fbe: // GREEK PROSGEGRAMMENI + return 0x03b9; + case 0x1fcc: // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + return 0x1fc3; + case 0x1fd8: // GREEK CAPITAL LETTER IOTA WITH VRACHY + return 0x1fd0; + case 0x1fd9: // GREEK CAPITAL LETTER IOTA WITH MACRON + return 0x1fd1; + case 0x1fda: // GREEK CAPITAL LETTER IOTA WITH VARIA + return 0x1f76; + case 0x1fdb: // GREEK CAPITAL LETTER IOTA WITH OXIA + return 0x1f77; + case 0x1fe8: // GREEK CAPITAL LETTER UPSILON WITH VRACHY + return 0x1fe0; + case 0x1fe9: // GREEK CAPITAL LETTER UPSILON WITH MACRON + return 0x1fe1; + case 0x1fea: // GREEK CAPITAL LETTER UPSILON WITH VARIA + return 0x1f7a; + case 0x1feb: // GREEK CAPITAL LETTER UPSILON WITH OXIA + return 0x1f7b; + case 0x1fec: // GREEK CAPITAL LETTER RHO WITH DASIA + return 0x1fe5; + case 0x1ff8: // GREEK CAPITAL LETTER OMICRON WITH VARIA + return 0x1f78; + case 0x1ff9: // GREEK CAPITAL LETTER OMICRON WITH OXIA + return 0x1f79; + case 0x1ffa: // GREEK CAPITAL LETTER OMEGA WITH VARIA + return 0x1f7c; + case 0x1ffb: // GREEK CAPITAL LETTER OMEGA WITH OXIA + return 0x1f7d; + case 0x1ffc: // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + return 0x1ff3; + case 0x2126: // OHM SIGN + return 0x03c9; + case 0x212a: // KELVIN SIGN + return 0x006b; + case 0x212b: // ANGSTROM SIGN + return 0x00e5; + case 0x2132: // TURNED CAPITAL F + return 0x214e; + case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED + return 0x2184; + case 0x2c60: // LATIN CAPITAL LETTER L WITH DOUBLE BAR + return 0x2c61; + case 0x2c62: // LATIN CAPITAL LETTER L WITH MIDDLE TILDE + return 0x026b; + case 0x2c63: // LATIN CAPITAL LETTER P WITH STROKE + return 0x1d7d; + case 0x2c64: // LATIN CAPITAL LETTER R WITH TAIL + return 0x027d; + case 0x2c6d: // LATIN CAPITAL LETTER ALPHA + return 0x0251; + case 0x2c6e: // LATIN CAPITAL LETTER M WITH HOOK + return 0x0271; + case 0x2c6f: // LATIN CAPITAL LETTER TURNED A + return 0x0250; + case 0x2c70: // LATIN CAPITAL LETTER TURNED ALPHA + return 0x0252; + case 0x2c72: // LATIN CAPITAL LETTER W WITH HOOK + return 0x2c73; + case 0x2c75: // LATIN CAPITAL LETTER HALF H + return 0x2c76; + case 0x2c7e: // LATIN CAPITAL LETTER S WITH SWASH TAIL + return 0x023f; + case 0x2c7f: // LATIN CAPITAL LETTER Z WITH SWASH TAIL + return 0x0240; + case 0x2ceb: // COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI + return 0x2cec; + case 0x2ced: // COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA + return 0x2cee; + case 0x2cf2: // COPTIC CAPITAL LETTER BOHAIRIC KHEI + return 0x2cf3; + case 0xa640: // CYRILLIC CAPITAL LETTER ZEMLYA + return 0xa641; + case 0xa779: // LATIN CAPITAL LETTER INSULAR D + return 0xa77a; + case 0xa77b: // LATIN CAPITAL LETTER INSULAR F + return 0xa77c; + case 0xa77d: // LATIN CAPITAL LETTER INSULAR G + return 0x1d79; + case 0xa78b: // LATIN CAPITAL LETTER SALTILLO + return 0xa78c; + case 0xa78d: // LATIN CAPITAL LETTER TURNED H + return 0x0265; + case 0xa790: // LATIN CAPITAL LETTER N WITH DESCENDER + return 0xa791; + case 0xa792: // LATIN CAPITAL LETTER C WITH BAR + return 0xa793; + case 0xa7aa: // LATIN CAPITAL LETTER H WITH HOOK + return 0x0266; + case 0xa7ab: // LATIN CAPITAL LETTER REVERSED OPEN E + return 0x025c; + case 0xa7ac: // LATIN CAPITAL LETTER SCRIPT G + return 0x0261; + case 0xa7ad: // LATIN CAPITAL LETTER L WITH BELT + return 0x026c; + case 0xa7ae: // LATIN CAPITAL LETTER SMALL CAPITAL I + return 0x026a; + case 0xa7b0: // LATIN CAPITAL LETTER TURNED K + return 0x029e; + case 0xa7b1: // LATIN CAPITAL LETTER TURNED T + return 0x0287; + case 0xa7b2: // LATIN CAPITAL LETTER J WITH CROSSED-TAIL + return 0x029d; + case 0xa7b3: // LATIN CAPITAL LETTER CHI + return 0xab53; + case 0xa7b4: // LATIN CAPITAL LETTER BETA + return 0xa7b5; + case 0xa7b6: // LATIN CAPITAL LETTER OMEGA + return 0xa7b7; + } + return C; +} Index: unittests/Support/CMakeLists.txt =================================================================== --- unittests/Support/CMakeLists.txt +++ unittests/Support/CMakeLists.txt @@ -19,6 +19,7 @@ ConvertUTFTest.cpp DataExtractorTest.cpp DebugTest.cpp + DJBTest.cpp EndianStreamTest.cpp EndianTest.cpp ErrnoTest.cpp Index: unittests/Support/DJBTest.cpp =================================================================== --- /dev/null +++ unittests/Support/DJBTest.cpp @@ -0,0 +1,89 @@ +//===---------- llvm/unittest/Support/DJBTest.cpp -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/DJB.h" +#include "llvm/ADT/Twine.h" +#include "gtest/gtest.h" + +using namespace llvm; + +TEST(DJBTest, caseFolding) { + struct TestCase { + StringLiteral One; + StringLiteral Two; + }; + + static constexpr TestCase Tests[] = { + {"ASDF", "asdf"}, + {"qWeR", "QwEr"}, + {"qqqqqqqqqqqqqqqqqqqq", "QQQQQQQQQQQQQQQQQQQQ"}, + + {"I", "i"}, + // Latin Small Letter Dotless I + {u8"\u0130", "i"}, + // Latin Capital Letter I With Dot Above + {u8"\u0131", "i"}, + + // Latin Capital Letter A With Grave + {u8"\u00c0", u8"\u00e0"}, + // Latin Capital Letter A With Macron + {u8"\u0100", u8"\u0101"}, + // Latin Capital Letter L With Acute + {u8"\u0139", u8"\u013a"}, + // Cyrillic Capital Letter Ie + {u8"\u0415", u8"\u0435"}, + // Latin Capital Letter A With Circumflex And Grave + {u8"\u1ea6", u8"\u1ea7"}, + // Kelvin Sign + {u8"\u212a", u8"\u006b"}, + // Glagolitic Capital Letter Chrivi + {u8"\u2c1d", u8"\u2c4d"}, + // Fullwidth Latin Capital Letter M + {u8"\uff2d", u8"\uff4d"}, + // Old Hungarian Capital Letter Ej + {u8"\U00010c92", u8"\U00010cd2"}, + }; + + for (const TestCase &T : Tests) { + SCOPED_TRACE("Comparing '" + T.One + "' and '" + T.Two + "'"); + EXPECT_EQ(caseFoldingDjbHash(T.One), caseFoldingDjbHash(T.Two)); + } +} + +TEST(DJBTest, knownValuesLowerCase) { + struct TestCase { + StringLiteral Text; + uint32_t Hash; + }; + static constexpr TestCase Tests[] = { + {"", 5381u}, + {"f", 177675u}, + {"fo", 5863386u}, + {"foo", 193491849u}, + {"foob", 2090263819u}, + {"fooba", 259229388u}, + {"foobar", 4259602622u}, + {"pneumonoultramicroscopicsilicovolcanoconiosis", 3999417781u}, + }; + + for (const TestCase &T: Tests) { + SCOPED_TRACE("Text: '" + T.Text + "'"); + EXPECT_EQ(T.Hash, djbHash(T.Text)); + EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text)); + EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text.upper())); + } +} + +TEST(DJBTest, knownValuesUnicode) { + EXPECT_EQ( + 2326183139u, + caseFoldingDjbHash( + u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6" + u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2")); +} Index: utils/unicode-case-fold.py =================================================================== --- /dev/null +++ utils/unicode-case-fold.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +""" +Unicode case folding database conversion utility + +Parses the database and generates a C++ function which implements the case +folding algorithm. The database entries are of the form: + + ; ; ; # + + can be one of four characters: + C - Common mappings + S - mappings for Simple case folding + F - mappings for Full case folding + T - special case for Turkish I characters + +Right now this generates a function which implements simple case folding (C+S +entries). +""" + +import sys +import re +import urllib2 + +# This variable will contain the code to handle special-cased mappings. +special_cases = "" + +# This variable will containg the code to handle blocks of mappings. +blocks = "" + +# Reads file line-by-line, extracts Common and Simple case fold mappings and +# returns a (from_char, to_char, from_name) tuple. +def mappings(f): + expr = re.compile(r'^(.*); [CS]; (.*); # (.*)') + for line in f: + m = expr.match(line) + if not m: continue + from_char = int(m.group(1), 16) + to_char = int(m.group(2), 16) + from_name = m.group(3) + yield from_char, to_char, from_name + +# Computes the shift (to_char - from_char) in a mapping. +def shift(mapping): + return mapping[1] - mapping[0] + +# Computes the stride (from_char2 - from_char1) of two mappings. +def stride2(mapping1, mapping2): + return mapping2[0] - mapping1[0] + +# Computes the stride of a list of mappings. The list should have at least two +# mappings. All mappings in the list are assumed to have the same stride. +def stride(block): + return stride2(block[0], block[1]) + +# Generates a special-case code for the mapping. +def special_case(m): + global special_cases + special_cases += " case {0:#06x}: // {2}\n return {1:#06x};\n".format(*m) + + +# b is a list of mappings. All the mappings are assumed to have the same +# shift and the stride between adjecant mappings (if any) is constant. +def dump_block(b): + global blocks + if len(b) < 3: + # don't bother trying to condense short blocks + for m in b: + special_case(m) + return + + blocks += " // {0} characters\n".format(len(b)) + first = b[0][0] + last = first + stride(b) * (len(b)-1) + modulo = first % stride(b) + if stride(b) == 2 and shift(b[0]) == 1 and b[0][0]%2 == 0: + # Generate slightly more efficient code here. + # We can elide the modulo-check because the expression "C|1" will map + # the intervening characters to themselves. + pattern = " if (C >= {0:#06x} && C <= {1:#06x})\n return C | 1;\n" + blocks += pattern.format(first, last) + return + + pattern = " if (C >= {0:#06x} && C <= {1:#06x} && C % {2} == {3})\n return C + {4};\n" + blocks += pattern.format(first, last, stride(b), modulo, shift(b[0])) + +current_block = [] +f = urllib2.urlopen(sys.argv[1]) +for m in mappings(f): + if len(current_block) == 0: + current_block.append(m) + continue + + if shift(current_block[0]) != shift(m): + # Incompatible shift, start a new block. + dump_block(current_block) + current_block = [m] + continue + + if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m): + current_block.append(m) + continue + + # Incompatible stride, start a new block. + dump_block(current_block) + current_block = [m] +f.close() + +dump_block(current_block) + +print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//' +print '//' +print '// This file was generated by utils/unicode-case-fold.py from the Unicode' +print '// case folding database at' +print '// ', sys.argv[1] +print '//' +print '// To regenerate this file, run:' +print '// utils/unicode-case-fold.py \\' +print '// "{}" \\'.format(sys.argv[1]) +print '// > lib/Support/UnicodeCaseFold.cpp' +print '//' +print '//===----------------------------------------------------------------------===//' +print '' +print '#include "llvm/Support/Unicode.h"' +print '' +print "int llvm::sys::unicode::foldCharSimple(int C) {" +print blocks +print " switch (C) {" +print special_cases.rstrip() +print " }" +print " return C;" +print "}"