Index: llvm/test/tools/llvm-rc/Inputs/utf8.rc =================================================================== --- /dev/null +++ llvm/test/tools/llvm-rc/Inputs/utf8.rc @@ -0,0 +1,22 @@ +STRINGTABLE + +BEGIN + + 1 "리소스 컴파일러" + + 2 L"리소스 컴파일러" + +END + + +128 450 { + L"리소스 컴파일러" + +} + +1 MENU { + MENUITEM "리소스 컴파일러" +, 500 + MENUITEM L"리소스 컴파일러" +, 501 +} Index: llvm/test/tools/llvm-rc/utf-support.test =================================================================== --- /dev/null +++ llvm/test/tools/llvm-rc/utf-support.test @@ -0,0 +1,51 @@ +; Both inputs have the same contents; they're only encoded differently. + +; RUN: llvm-rc /FO %t %p/Inputs/utf8.rc +; RUN: llvm-readobj %t | FileCheck %s + +; RUN: llvm-rc /FO %t2 %p/Inputs/utf16le.rc +; RUN: llvm-readobj %t2 | FileCheck %s + + +; CHECK: Resource type (int): 450 +; CHECK-NEXT: Resource name (int): 128 +; CHECK-NEXT: Data version: 0 +; CHECK-NEXT: Memory flags: 0x30 +; CHECK-NEXT: Language ID: 1033 +; CHECK-NEXT: Version (major): 0 +; CHECK-NEXT: Version (minor): 0 +; CHECK-NEXT: Characteristics: 0 +; CHECK-NEXT: Data size: 16 +; CHECK-NEXT: Data:: (AC B9 8C C1 A4 C2 20 00 F4 CE 0C D3 7C C7 EC B7) + +; CHECK-DAG: Resource type (int): 4 +; CHECK-NEXT: Resource name (int): 1 +; CHECK-NEXT: Data version: 0 +; CHECK-NEXT: Memory flags: 0x1030 +; CHECK-NEXT: Language ID: 1033 +; CHECK-NEXT: Version (major): 0 +; CHECK-NEXT: Version (minor): 0 +; CHECK-NEXT: Characteristics: 0 +; CHECK-NEXT: Data size: 48 +; CHECK-NEXT: Data: ( +; CHECK-NEXT: 0000: 00000000 0000F401 ACB98CC1 A4C22000 |.............. .| +; CHECK-NEXT: 0010: F4CE0CD3 7CC7ECB7 00008000 F501ACB9 |....|...........| +; CHECK-NEXT: 0020: 8CC1A4C2 2000F4CE 0CD37CC7 ECB70000 |.... 
.....|.....| +; CHECK-NEXT: ) + +; CHECK-DAG: Resource type (int): 6 +; CHECK-NEXT: Resource name (int): 1 +; CHECK-NEXT: Data version: 0 +; CHECK-NEXT: Memory flags: 0x1030 +; CHECK-NEXT: Language ID: 1033 +; CHECK-NEXT: Version (major): 0 +; CHECK-NEXT: Version (minor): 0 +; CHECK-NEXT: Characteristics: 0 +; CHECK-NEXT: Data size: 64 +; CHECK-NEXT: Data: ( +; CHECK-NEXT: 0000: 00000800 ACB98CC1 A4C22000 F4CE0CD3 |.......... .....| +; CHECK-NEXT: 0010: 7CC7ECB7 0800ACB9 8CC1A4C2 2000F4CE ||........... ...| +; CHECK-NEXT: 0020: 0CD37CC7 ECB70000 00000000 00000000 |..|.............| +; CHECK-NEXT: 0030: 00000000 00000000 00000000 00000000 |................| +; CHECK-NEXT: ) + Index: llvm/tools/llvm-rc/llvm-rc.cpp =================================================================== --- llvm/tools/llvm-rc/llvm-rc.cpp +++ llvm/tools/llvm-rc/llvm-rc.cpp @@ -19,6 +19,7 @@ #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" @@ -98,6 +99,41 @@ return Dirs; } +// We need to use a heuristic to detect the encoding. We guess that a string +// is UTF-16LE if either: +// * its second byte is equal to 0 (that's a required condition), +// * or a UTF-16LE BOM (\xff\xfe) is given. +bool isUTF16(StringRef Str) { + if (Str.size() < 2) + return false; + return Str.startswith("\xff\xfe") || Str[1] == '\0'; +} + +// Convert the file from UTF-8 or UTF-16LE into a UTF-8 string +// (we guess that a correct UTF-8 script won't contain any null bytes). 
+std::string decodeFile(StringRef Filename) { + ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename); + if (!File) + fatalError("Error opening file '" + Filename + + "': " + File.getError().message()); + std::unique_ptr<MemoryBuffer> FileContents = std::move(*File); + + auto *FileFrom = FileContents->getBufferStart(); + auto *FileTo = FileContents->getBufferEnd(); + if (isUTF16(FileContents->getBuffer())) { + std::string ContentsFromUTF16; + if (convertUTF16ToUTF8String(ArrayRef<char>(FileFrom, FileTo), + ContentsFromUTF16)) + return ContentsFromUTF16; + } else { + if (isLegalUTF8String(reinterpret_cast<const UTF8 **>(&FileFrom), + reinterpret_cast<const UTF8 *>(FileTo))) + return FileContents->getBuffer(); + } + + fatalError("Input file '" + Filename + "' is neither UTF-8 nor UTF-16"); +} + } // anonymous namespace int main(int argc_, const char *argv_[]) { @@ -133,14 +169,7 @@ // Read and tokenize the input file. StringRef Filename = InArgsInfo[0]; - ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename); - if (!File) { - fatalError("Error opening file '" + Filename + - "': " + File.getError().message()); - } - - std::unique_ptr<MemoryBuffer> FileContents = std::move(*File); - StringRef Contents = FileContents->getBuffer(); + std::string Contents = decodeFile(Filename); std::vector<RCToken> Tokens = ExitOnErr(tokenizeRC(Contents));