Index: llvm/trunk/include/llvm/Support/DJB.h =================================================================== --- llvm/trunk/include/llvm/Support/DJB.h +++ llvm/trunk/include/llvm/Support/DJB.h @@ -20,6 +20,10 @@ /// The Bernstein hash function used by the DWARF accelerator tables. uint32_t djbHash(StringRef Buffer, uint32_t H = 5381); + +/// Computes the Bernstein hash after folding the input according to the Dwarf 5 +/// standard case folding rules. +uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381); } // namespace llvm #endif // LLVM_SUPPORT_DJB_H Index: llvm/trunk/include/llvm/Support/Unicode.h =================================================================== --- llvm/trunk/include/llvm/Support/Unicode.h +++ llvm/trunk/include/llvm/Support/Unicode.h @@ -60,6 +60,10 @@ /// * 1 for each of the remaining characters. int columnWidthUTF8(StringRef Text); +/// Fold input unicode character according the the Simple unicode case folding +/// rules. +int foldCharSimple(int C); + } // namespace unicode } // namespace sys } // namespace llvm Index: llvm/trunk/lib/Support/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Support/CMakeLists.txt +++ llvm/trunk/lib/Support/CMakeLists.txt @@ -113,6 +113,7 @@ Triple.cpp Twine.cpp Unicode.cpp + UnicodeCaseFold.cpp YAMLParser.cpp YAMLTraits.cpp raw_os_ostream.cpp Index: llvm/trunk/lib/Support/DJB.cpp =================================================================== --- llvm/trunk/lib/Support/DJB.cpp +++ llvm/trunk/lib/Support/DJB.cpp @@ -12,9 +12,85 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DJB.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Unicode.h" + +using namespace llvm; + +static inline uint32_t djbHashChar(unsigned char C, uint32_t H) { + return (H << 5) + H + C; +} uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) { - for (char C : Buffer.bytes()) - H = ((H << 5) + H) + C; + for (unsigned char C : Buffer.bytes()) + H = djbHashChar(C, H); + return H; +} + +static UTF32 chopOneUTF32(StringRef &Buffer) { + UTF32 C; + const UTF8 *const Begin8Const = + reinterpret_cast(Buffer.begin()); + const UTF8 *Begin8 = Begin8Const; + UTF32 *Begin32 = &C; + + // In lenient mode we will always end up with a "reasonable" value in C for + // non-empty input. + assert(!Buffer.empty()); + ConvertUTF8toUTF32(&Begin8, reinterpret_cast(Buffer.end()), + &Begin32, &C + 1, lenientConversion); + Buffer = Buffer.drop_front(Begin8 - Begin8Const); + return C; +} + +static StringRef toUTF8(UTF32 C, MutableArrayRef Storage) { + const UTF32 *Begin32 = &C; + UTF8 *Begin8 = Storage.begin(); + + // The case-folded output should always be a valid unicode character, so use + // strict mode here. + ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8, + Storage.end(), strictConversion); + assert(CR == conversionOK && "Case folding produced invalid char?"); + (void)CR; + return StringRef(reinterpret_cast(Storage.begin()), + Begin8 - Storage.begin()); +} + +static UTF32 foldCharDwarf(UTF32 C) { + // DWARF v5 addition to the unicode folding rules. + // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot + // Above" into "i". + if (C == 0x130 || C == 0x131) + return 'i'; + return sys::unicode::foldCharSimple(C); +} + +static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) { + UTF32 C = chopOneUTF32(Buffer); + + C = foldCharDwarf(C); + + std::array Storage; + StringRef Folded = toUTF8(C, Storage); + return djbHash(Folded, H); +} + +uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) { + while (!Buffer.empty()) { + unsigned char C = Buffer.front(); + if (LLVM_LIKELY(C <= 0x7f)) { + // US-ASCII, encoded as one character in utf-8. + // This is by far the most common case, so handle this specially. + if (C >= 'A' && C <= 'Z') + C = 'a' + (C - 'A'); // fold uppercase into lowercase + H = djbHashChar(C, H); + Buffer = Buffer.drop_front(); + continue; + } + H = caseFoldingDjbHashCharSlow(Buffer, H); + } return H; } Index: llvm/trunk/lib/Support/UnicodeCaseFold.cpp =================================================================== --- llvm/trunk/lib/Support/UnicodeCaseFold.cpp +++ llvm/trunk/lib/Support/UnicodeCaseFold.cpp @@ -0,0 +1,742 @@ +//===---------- Support/UnicodeCaseFold.cpp -------------------------------===// +// +// This file was generated by utils/unicode-case-fold.py from the Unicode +// case folding database at +// http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt +// +// To regenerate this file, run: +// utils/unicode-case-fold.py \ +// "http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt" \ +// > lib/Support/UnicodeCaseFold.cpp +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Unicode.h" + +int llvm::sys::unicode::foldCharSimple(int C) { + if (C < 0x0041) + return C; + // 26 characters + if (C <= 0x005a) + return C + 32; + // MICRO SIGN + if (C == 0x00b5) + return 0x03bc; + if (C < 0x00c0) + return C; + // 23 characters + if (C <= 0x00d6) + return C + 32; + if (C < 0x00d8) + return C; + // 7 characters + if (C <= 0x00de) + return C + 32; + if (C < 0x0100) + return C; + // 24 characters + if (C <= 0x012e) + return C | 1; + if (C < 0x0132) + return C; + // 3 characters + if (C <= 0x0136) + return C | 1; + if (C < 0x0139) + return C; + // 8 characters + if (C <= 0x0147 && C % 2 == 1) + return C + 1; + if (C < 0x014a) + return C; + // 23 characters + if (C <= 0x0176) + return C | 1; + // LATIN CAPITAL LETTER Y WITH DIAERESIS + if (C == 0x0178) + return 0x00ff; + if (C < 0x0179) + return C; + // 3 characters + if (C <= 0x017d && C % 2 == 1) + return C + 1; + // LATIN SMALL LETTER LONG S + if (C == 0x017f) + return 0x0073; + // LATIN CAPITAL LETTER B WITH HOOK + if (C == 0x0181) + return 0x0253; + if (C < 0x0182) + return C; + // 2 characters + if (C <= 0x0184) + return C | 1; + // LATIN CAPITAL LETTER OPEN O + if (C == 0x0186) + return 0x0254; + // LATIN CAPITAL LETTER C WITH HOOK + if (C == 0x0187) + return 0x0188; + if (C < 0x0189) + return C; + // 2 characters + if (C <= 0x018a) + return C + 205; + // LATIN CAPITAL LETTER D WITH TOPBAR + if (C == 0x018b) + return 0x018c; + // LATIN CAPITAL LETTER REVERSED E + if (C == 0x018e) + return 0x01dd; + // LATIN CAPITAL LETTER SCHWA + if (C == 0x018f) + return 0x0259; + // LATIN CAPITAL LETTER OPEN E + if (C == 0x0190) + return 0x025b; + // LATIN CAPITAL LETTER F WITH HOOK + if (C == 0x0191) + return 0x0192; + // LATIN CAPITAL LETTER G WITH HOOK + if (C == 0x0193) + return 0x0260; + // LATIN CAPITAL LETTER GAMMA + if (C == 0x0194) + return 0x0263; + // LATIN CAPITAL LETTER IOTA + if (C == 0x0196) + return 0x0269; + // LATIN CAPITAL LETTER I WITH STROKE + if (C == 0x0197) + return 0x0268; + // LATIN CAPITAL LETTER K WITH HOOK + if (C == 0x0198) + return 0x0199; + // LATIN CAPITAL LETTER TURNED M + if (C == 0x019c) + return 0x026f; + // LATIN CAPITAL LETTER N WITH LEFT HOOK + if (C == 0x019d) + return 0x0272; + // LATIN CAPITAL LETTER O WITH MIDDLE TILDE + if (C == 0x019f) + return 0x0275; + if (C < 0x01a0) + return C; + // 3 characters + if (C <= 0x01a4) + return C | 1; + // LATIN LETTER YR + if (C == 0x01a6) + return 0x0280; + // LATIN CAPITAL LETTER TONE TWO + if (C == 0x01a7) + return 0x01a8; + // LATIN CAPITAL LETTER ESH + if (C == 0x01a9) + return 0x0283; + // LATIN CAPITAL LETTER T WITH HOOK + if (C == 0x01ac) + return 0x01ad; + // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK + if (C == 0x01ae) + return 0x0288; + // LATIN CAPITAL LETTER U WITH HORN + if (C == 0x01af) + return 0x01b0; + if (C < 0x01b1) + return C; + // 2 characters + if (C <= 0x01b2) + return C + 217; + if (C < 0x01b3) + return C; + // 2 characters + if (C <= 0x01b5 && C % 2 == 1) + return C + 1; + // LATIN CAPITAL LETTER EZH + if (C == 0x01b7) + return 0x0292; + if (C < 0x01b8) + return C; + // 2 characters + if (C <= 0x01bc && C % 4 == 0) + return C + 1; + // LATIN CAPITAL LETTER DZ WITH CARON + if (C == 0x01c4) + return 0x01c6; + // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON + if (C == 0x01c5) + return 0x01c6; + // LATIN CAPITAL LETTER LJ + if (C == 0x01c7) + return 0x01c9; + // LATIN CAPITAL LETTER L WITH SMALL LETTER J + if (C == 0x01c8) + return 0x01c9; + // LATIN CAPITAL LETTER NJ + if (C == 0x01ca) + return 0x01cc; + if (C < 0x01cb) + return C; + // 9 characters + if (C <= 0x01db && C % 2 == 1) + return C + 1; + if (C < 0x01de) + return C; + // 9 characters + if (C <= 0x01ee) + return C | 1; + // LATIN CAPITAL LETTER DZ + if (C == 0x01f1) + return 0x01f3; + if (C < 0x01f2) + return C; + // 2 characters + if (C <= 0x01f4) + return C | 1; + // LATIN CAPITAL LETTER HWAIR + if (C == 0x01f6) + return 0x0195; + // LATIN CAPITAL LETTER WYNN + if (C == 0x01f7) + return 0x01bf; + if (C < 0x01f8) + return C; + // 20 characters + if (C <= 0x021e) + return C | 1; + // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG + if (C == 0x0220) + return 0x019e; + if (C < 0x0222) + return C; + // 9 characters + if (C <= 0x0232) + return C | 1; + // LATIN CAPITAL LETTER A WITH STROKE + if (C == 0x023a) + return 0x2c65; + // LATIN CAPITAL LETTER C WITH STROKE + if (C == 0x023b) + return 0x023c; + // LATIN CAPITAL LETTER L WITH BAR + if (C == 0x023d) + return 0x019a; + // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE + if (C == 0x023e) + return 0x2c66; + // LATIN CAPITAL LETTER GLOTTAL STOP + if (C == 0x0241) + return 0x0242; + // LATIN CAPITAL LETTER B WITH STROKE + if (C == 0x0243) + return 0x0180; + // LATIN CAPITAL LETTER U BAR + if (C == 0x0244) + return 0x0289; + // LATIN CAPITAL LETTER TURNED V + if (C == 0x0245) + return 0x028c; + if (C < 0x0246) + return C; + // 5 characters + if (C <= 0x024e) + return C | 1; + // COMBINING GREEK YPOGEGRAMMENI + if (C == 0x0345) + return 0x03b9; + if (C < 0x0370) + return C; + // 2 characters + if (C <= 0x0372) + return C | 1; + // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA + if (C == 0x0376) + return 0x0377; + // GREEK CAPITAL LETTER YOT + if (C == 0x037f) + return 0x03f3; + // GREEK CAPITAL LETTER ALPHA WITH TONOS + if (C == 0x0386) + return 0x03ac; + if (C < 0x0388) + return C; + // 3 characters + if (C <= 0x038a) + return C + 37; + // GREEK CAPITAL LETTER OMICRON WITH TONOS + if (C == 0x038c) + return 0x03cc; + if (C < 0x038e) + return C; + // 2 characters + if (C <= 0x038f) + return C + 63; + if (C < 0x0391) + return C; + // 17 characters + if (C <= 0x03a1) + return C + 32; + if (C < 0x03a3) + return C; + // 9 characters + if (C <= 0x03ab) + return C + 32; + // GREEK SMALL LETTER FINAL SIGMA + if (C == 0x03c2) + return 0x03c3; + // GREEK CAPITAL KAI SYMBOL + if (C == 0x03cf) + return 0x03d7; + // GREEK BETA SYMBOL + if (C == 0x03d0) + return 0x03b2; + // GREEK THETA SYMBOL + if (C == 0x03d1) + return 0x03b8; + // GREEK PHI SYMBOL + if (C == 0x03d5) + return 0x03c6; + // GREEK PI SYMBOL + if (C == 0x03d6) + return 0x03c0; + if (C < 0x03d8) + return C; + // 12 characters + if (C <= 0x03ee) + return C | 1; + // GREEK KAPPA SYMBOL + if (C == 0x03f0) + return 0x03ba; + // GREEK RHO SYMBOL + if (C == 0x03f1) + return 0x03c1; + // GREEK CAPITAL THETA SYMBOL + if (C == 0x03f4) + return 0x03b8; + // GREEK LUNATE EPSILON SYMBOL + if (C == 0x03f5) + return 0x03b5; + // GREEK CAPITAL LETTER SHO + if (C == 0x03f7) + return 0x03f8; + // GREEK CAPITAL LUNATE SIGMA SYMBOL + if (C == 0x03f9) + return 0x03f2; + // GREEK CAPITAL LETTER SAN + if (C == 0x03fa) + return 0x03fb; + if (C < 0x03fd) + return C; + // 3 characters + if (C <= 0x03ff) + return C + -130; + if (C < 0x0400) + return C; + // 16 characters + if (C <= 0x040f) + return C + 80; + if (C < 0x0410) + return C; + // 32 characters + if (C <= 0x042f) + return C + 32; + if (C < 0x0460) + return C; + // 17 characters + if (C <= 0x0480) + return C | 1; + if (C < 0x048a) + return C; + // 27 characters + if (C <= 0x04be) + return C | 1; + // CYRILLIC LETTER PALOCHKA + if (C == 0x04c0) + return 0x04cf; + if (C < 0x04c1) + return C; + // 7 characters + if (C <= 0x04cd && C % 2 == 1) + return C + 1; + if (C < 0x04d0) + return C; + // 48 characters + if (C <= 0x052e) + return C | 1; + if (C < 0x0531) + return C; + // 38 characters + if (C <= 0x0556) + return C + 48; + if (C < 0x10a0) + return C; + // 38 characters + if (C <= 0x10c5) + return C + 7264; + if (C < 0x10c7) + return C; + // 2 characters + if (C <= 0x10cd && C % 6 == 5) + return C + 7264; + if (C < 0x13f8) + return C; + // 6 characters + if (C <= 0x13fd) + return C + -8; + // CYRILLIC SMALL LETTER ROUNDED VE + if (C == 0x1c80) + return 0x0432; + // CYRILLIC SMALL LETTER LONG-LEGGED DE + if (C == 0x1c81) + return 0x0434; + // CYRILLIC SMALL LETTER NARROW O + if (C == 0x1c82) + return 0x043e; + if (C < 0x1c83) + return C; + // 2 characters + if (C <= 0x1c84) + return C + -6210; + // CYRILLIC SMALL LETTER THREE-LEGGED TE + if (C == 0x1c85) + return 0x0442; + // CYRILLIC SMALL LETTER TALL HARD SIGN + if (C == 0x1c86) + return 0x044a; + // CYRILLIC SMALL LETTER TALL YAT + if (C == 0x1c87) + return 0x0463; + // CYRILLIC SMALL LETTER UNBLENDED UK + if (C == 0x1c88) + return 0xa64b; + if (C < 0x1e00) + return C; + // 75 characters + if (C <= 0x1e94) + return C | 1; + // LATIN SMALL LETTER LONG S WITH DOT ABOVE + if (C == 0x1e9b) + return 0x1e61; + // LATIN CAPITAL LETTER SHARP S + if (C == 0x1e9e) + return 0x00df; + if (C < 0x1ea0) + return C; + // 48 characters + if (C <= 0x1efe) + return C | 1; + if (C < 0x1f08) + return C; + // 8 characters + if (C <= 0x1f0f) + return C + -8; + if (C < 0x1f18) + return C; + // 6 characters + if (C <= 0x1f1d) + return C + -8; + if (C < 0x1f28) + return C; + // 8 characters + if (C <= 0x1f2f) + return C + -8; + if (C < 0x1f38) + return C; + // 8 characters + if (C <= 0x1f3f) + return C + -8; + if (C < 0x1f48) + return C; + // 6 characters + if (C <= 0x1f4d) + return C + -8; + if (C < 0x1f59) + return C; + // 4 characters + if (C <= 0x1f5f && C % 2 == 1) + return C + -8; + if (C < 0x1f68) + return C; + // 8 characters + if (C <= 0x1f6f) + return C + -8; + if (C < 0x1f88) + return C; + // 8 characters + if (C <= 0x1f8f) + return C + -8; + if (C < 0x1f98) + return C; + // 8 characters + if (C <= 0x1f9f) + return C + -8; + if (C < 0x1fa8) + return C; + // 8 characters + if (C <= 0x1faf) + return C + -8; + if (C < 0x1fb8) + return C; + // 2 characters + if (C <= 0x1fb9) + return C + -8; + if (C < 0x1fba) + return C; + // 2 characters + if (C <= 0x1fbb) + return C + -74; + // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + if (C == 0x1fbc) + return 0x1fb3; + // GREEK PROSGEGRAMMENI + if (C == 0x1fbe) + return 0x03b9; + if (C < 0x1fc8) + return C; + // 4 characters + if (C <= 0x1fcb) + return C + -86; + // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + if (C == 0x1fcc) + return 0x1fc3; + if (C < 0x1fd8) + return C; + // 2 characters + if (C <= 0x1fd9) + return C + -8; + if (C < 0x1fda) + return C; + // 2 characters + if (C <= 0x1fdb) + return C + -100; + if (C < 0x1fe8) + return C; + // 2 characters + if (C <= 0x1fe9) + return C + -8; + if (C < 0x1fea) + return C; + // 2 characters + if (C <= 0x1feb) + return C + -112; + // GREEK CAPITAL LETTER RHO WITH DASIA + if (C == 0x1fec) + return 0x1fe5; + if (C < 0x1ff8) + return C; + // 2 characters + if (C <= 0x1ff9) + return C + -128; + if (C < 0x1ffa) + return C; + // 2 characters + if (C <= 0x1ffb) + return C + -126; + // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + if (C == 0x1ffc) + return 0x1ff3; + // OHM SIGN + if (C == 0x2126) + return 0x03c9; + // KELVIN SIGN + if (C == 0x212a) + return 0x006b; + // ANGSTROM SIGN + if (C == 0x212b) + return 0x00e5; + // TURNED CAPITAL F + if (C == 0x2132) + return 0x214e; + if (C < 0x2160) + return C; + // 16 characters + if (C <= 0x216f) + return C + 16; + // ROMAN NUMERAL REVERSED ONE HUNDRED + if (C == 0x2183) + return 0x2184; + if (C < 0x24b6) + return C; + // 26 characters + if (C <= 0x24cf) + return C + 26; + if (C < 0x2c00) + return C; + // 47 characters + if (C <= 0x2c2e) + return C + 48; + // LATIN CAPITAL LETTER L WITH DOUBLE BAR + if (C == 0x2c60) + return 0x2c61; + // LATIN CAPITAL LETTER L WITH MIDDLE TILDE + if (C == 0x2c62) + return 0x026b; + // LATIN CAPITAL LETTER P WITH STROKE + if (C == 0x2c63) + return 0x1d7d; + // LATIN CAPITAL LETTER R WITH TAIL + if (C == 0x2c64) + return 0x027d; + if (C < 0x2c67) + return C; + // 3 characters + if (C <= 0x2c6b && C % 2 == 1) + return C + 1; + // LATIN CAPITAL LETTER ALPHA + if (C == 0x2c6d) + return 0x0251; + // LATIN CAPITAL LETTER M WITH HOOK + if (C == 0x2c6e) + return 0x0271; + // LATIN CAPITAL LETTER TURNED A + if (C == 0x2c6f) + return 0x0250; + // LATIN CAPITAL LETTER TURNED ALPHA + if (C == 0x2c70) + return 0x0252; + if (C < 0x2c72) + return C; + // 2 characters + if (C <= 0x2c75 && C % 3 == 2) + return C + 1; + if (C < 0x2c7e) + return C; + // 2 characters + if (C <= 0x2c7f) + return C + -10815; + if (C < 0x2c80) + return C; + // 50 characters + if (C <= 0x2ce2) + return C | 1; + if (C < 0x2ceb) + return C; + // 2 characters + if (C <= 0x2ced && C % 2 == 1) + return C + 1; + if (C < 0x2cf2) + return C; + // 2 characters + if (C <= 0xa640 && C % 31054 == 11506) + return C + 1; + if (C < 0xa642) + return C; + // 22 characters + if (C <= 0xa66c) + return C | 1; + if (C < 0xa680) + return C; + // 14 characters + if (C <= 0xa69a) + return C | 1; + if (C < 0xa722) + return C; + // 7 characters + if (C <= 0xa72e) + return C | 1; + if (C < 0xa732) + return C; + // 31 characters + if (C <= 0xa76e) + return C | 1; + if (C < 0xa779) + return C; + // 2 characters + if (C <= 0xa77b && C % 2 == 1) + return C + 1; + // LATIN CAPITAL LETTER INSULAR G + if (C == 0xa77d) + return 0x1d79; + if (C < 0xa77e) + return C; + // 5 characters + if (C <= 0xa786) + return C | 1; + // LATIN CAPITAL LETTER SALTILLO + if (C == 0xa78b) + return 0xa78c; + // LATIN CAPITAL LETTER TURNED H + if (C == 0xa78d) + return 0x0265; + if (C < 0xa790) + return C; + // 2 characters + if (C <= 0xa792) + return C | 1; + if (C < 0xa796) + return C; + // 10 characters + if (C <= 0xa7a8) + return C | 1; + // LATIN CAPITAL LETTER H WITH HOOK + if (C == 0xa7aa) + return 0x0266; + // LATIN CAPITAL LETTER REVERSED OPEN E + if (C == 0xa7ab) + return 0x025c; + // LATIN CAPITAL LETTER SCRIPT G + if (C == 0xa7ac) + return 0x0261; + // LATIN CAPITAL LETTER L WITH BELT + if (C == 0xa7ad) + return 0x026c; + // LATIN CAPITAL LETTER SMALL CAPITAL I + if (C == 0xa7ae) + return 0x026a; + // LATIN CAPITAL LETTER TURNED K + if (C == 0xa7b0) + return 0x029e; + // LATIN CAPITAL LETTER TURNED T + if (C == 0xa7b1) + return 0x0287; + // LATIN CAPITAL LETTER J WITH CROSSED-TAIL + if (C == 0xa7b2) + return 0x029d; + // LATIN CAPITAL LETTER CHI + if (C == 0xa7b3) + return 0xab53; + if (C < 0xa7b4) + return C; + // 2 characters + if (C <= 0xa7b6) + return C | 1; + if (C < 0xab70) + return C; + // 80 characters + if (C <= 0xabbf) + return C + -38864; + if (C < 0xff21) + return C; + // 26 characters + if (C <= 0xff3a) + return C + 32; + if (C < 0x10400) + return C; + // 40 characters + if (C <= 0x10427) + return C + 40; + if (C < 0x104b0) + return C; + // 36 characters + if (C <= 0x104d3) + return C + 40; + if (C < 0x10c80) + return C; + // 51 characters + if (C <= 0x10cb2) + return C + 64; + if (C < 0x118a0) + return C; + // 32 characters + if (C <= 0x118bf) + return C + 32; + if (C < 0x1e900) + return C; + // 34 characters + if (C <= 0x1e921) + return C + 34; + + return C; +} Index: llvm/trunk/unittests/Support/CMakeLists.txt =================================================================== --- llvm/trunk/unittests/Support/CMakeLists.txt +++ llvm/trunk/unittests/Support/CMakeLists.txt @@ -19,6 +19,7 @@ ConvertUTFTest.cpp DataExtractorTest.cpp DebugTest.cpp + DJBTest.cpp EndianStreamTest.cpp EndianTest.cpp ErrnoTest.cpp Index: llvm/trunk/unittests/Support/DJBTest.cpp =================================================================== --- llvm/trunk/unittests/Support/DJBTest.cpp +++ llvm/trunk/unittests/Support/DJBTest.cpp @@ -0,0 +1,96 @@ +//===---------- llvm/unittest/Support/DJBTest.cpp -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/DJB.h" +#include "llvm/ADT/Twine.h" +#include "gtest/gtest.h" + +using namespace llvm; + +TEST(DJBTest, caseFolding) { + struct TestCase { + StringLiteral One; + StringLiteral Two; + }; + + static constexpr TestCase Tests[] = { + {{"ASDF"}, {"asdf"}}, + {{"qWeR"}, {"QwEr"}}, + {{"qqqqqqqqqqqqqqqqqqqq"}, {"QQQQQQQQQQQQQQQQQQQQ"}}, + + {{"I"}, {"i"}}, + // Latin Small Letter Dotless I + {{u8"\u0130"}, {"i"}}, + // Latin Capital Letter I With Dot Above + {{u8"\u0131"}, {"i"}}, + + // Latin Capital Letter A With Grave + {{u8"\u00c0"}, {u8"\u00e0"}}, + // Latin Capital Letter A With Macron + {{u8"\u0100"}, {u8"\u0101"}}, + // Latin Capital Letter L With Acute + {{u8"\u0139"}, {u8"\u013a"}}, + // Cyrillic Capital Letter Ie + {{u8"\u0415"}, {u8"\u0435"}}, + // Latin Capital Letter A With Circumflex And Grave + {{u8"\u1ea6"}, {u8"\u1ea7"}}, + // Kelvin Sign + {{u8"\u212a"}, {u8"\u006b"}}, + // Glagolitic Capital Letter Chrivi + {{u8"\u2c1d"}, {u8"\u2c4d"}}, + // Fullwidth Latin Capital Letter M + {{u8"\uff2d"}, {u8"\uff4d"}}, + // Old Hungarian Capital Letter Ej + {{u8"\U00010c92"}, {u8"\U00010cd2"}}, + }; + + for (const TestCase &T : Tests) { + SCOPED_TRACE("Comparing '" + T.One + "' and '" + T.Two + "'"); + EXPECT_EQ(caseFoldingDjbHash(T.One), caseFoldingDjbHash(T.Two)); + } +} + +TEST(DJBTest, knownValuesLowerCase) { + struct TestCase { + StringLiteral Text; + uint32_t Hash; + }; + static constexpr TestCase Tests[] = { + {{""}, 5381u}, + {{"f"}, 177675u}, + {{"fo"}, 5863386u}, + {{"foo"}, 193491849u}, + {{"foob"}, 2090263819u}, + {{"fooba"}, 259229388u}, + {{"foobar"}, 4259602622u}, + {{"pneumonoultramicroscopicsilicovolcanoconiosis"}, 3999417781u}, + }; + + for (const TestCase &T : Tests) { + SCOPED_TRACE("Text: '" + T.Text + "'"); + EXPECT_EQ(T.Hash, djbHash(T.Text)); + EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text)); + EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text.upper())); + } +} + +TEST(DJBTest, knownValuesUnicode) { + EXPECT_EQ(5866553u, djbHash(u8"\u0130")); + EXPECT_EQ(177678u, caseFoldingDjbHash(u8"\u0130")); + EXPECT_EQ( + 1302161417u, + djbHash( + u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6" + u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2")); + EXPECT_EQ( + 1145571043u, + caseFoldingDjbHash( + u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6" + u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2")); +} Index: llvm/trunk/utils/unicode-case-fold.py =================================================================== --- llvm/trunk/utils/unicode-case-fold.py +++ llvm/trunk/utils/unicode-case-fold.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +""" +Unicode case folding database conversion utility + +Parses the database and generates a C++ function which implements the case +folding algorithm. The database entries are of the form: + + ; ; ; # + + can be one of four characters: + C - Common mappings + S - mappings for Simple case folding + F - mappings for Full case folding + T - special case for Turkish I characters + +Right now this generates a function which implements simple case folding (C+S +entries). +""" + +import sys +import re +import urllib2 + +# This variable will body of the mappings function +body = "" + +# Reads file line-by-line, extracts Common and Simple case fold mappings and +# returns a (from_char, to_char, from_name) tuple. +def mappings(f): + previous_from = -1 + expr = re.compile(r'^(.*); [CS]; (.*); # (.*)') + for line in f: + m = expr.match(line) + if not m: continue + from_char = int(m.group(1), 16) + to_char = int(m.group(2), 16) + from_name = m.group(3) + + if from_char <= previous_from: + raise Exception("Duplicate or unsorted characters in input") + yield from_char, to_char, from_name + previous_from = from_char + +# Computes the shift (to_char - from_char) in a mapping. +def shift(mapping): + return mapping[1] - mapping[0] + +# Computes the stride (from_char2 - from_char1) of two mappings. +def stride2(mapping1, mapping2): + return mapping2[0] - mapping1[0] + +# Computes the stride of a list of mappings. The list should have at least two +# mappings. All mappings in the list are assumed to have the same stride. +def stride(block): + return stride2(block[0], block[1]) + + +# b is a list of mappings. All the mappings are assumed to have the same +# shift and the stride between adjecant mappings (if any) is constant. +def dump_block(b): + global body + + if len(b) == 1: + # Special case for handling blocks of length 1. We don't even need to + # emit the "if (C < X) return C" check below as all characters in this + # range will be caught by the "C < X" check emitted by the first + # non-trivial block. + body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0]) + return + + first = b[0][0] + last = first + stride(b) * (len(b)-1) + modulo = first % stride(b) + + # All characters before this block map to themselves. + body += " if (C < {0:#06x})\n return C;\n".format(first) + body += " // {0} characters\n".format(len(b)) + + # Generic pattern: check upper bound (lower bound is checked by the "if" + # above) and modulo of C, return C+shift. + pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n" + + if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0: + # Special case: + # We can elide the modulo-check because the expression "C|1" will map + # the intervening characters to themselves. + pattern = " if (C <= {0:#06x})\n return C | 1;\n" + elif stride(b) == 1: + # Another special case: X % 1 is always zero, so don't emit the + # modulo-check. + pattern = " if (C <= {0:#06x})\n return C + {3};\n" + + body += pattern.format(last, stride(b), modulo, shift(b[0])) + +current_block = [] +f = urllib2.urlopen(sys.argv[1]) +for m in mappings(f): + if len(current_block) == 0: + current_block.append(m) + continue + + if shift(current_block[0]) != shift(m): + # Incompatible shift, start a new block. + dump_block(current_block) + current_block = [m] + continue + + if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m): + current_block.append(m) + continue + + # Incompatible stride, start a new block. + dump_block(current_block) + current_block = [m] +f.close() + +dump_block(current_block) + +print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//' +print '//' +print '// This file was generated by utils/unicode-case-fold.py from the Unicode' +print '// case folding database at' +print '// ', sys.argv[1] +print '//' +print '// To regenerate this file, run:' +print '// utils/unicode-case-fold.py \\' +print '// "{}" \\'.format(sys.argv[1]) +print '// > lib/Support/UnicodeCaseFold.cpp' +print '//' +print '//===----------------------------------------------------------------------===//' +print '' +print '#include "llvm/Support/Unicode.h"' +print '' +print "int llvm::sys::unicode::foldCharSimple(int C) {" +print body +print " return C;" +print "}"