Index: llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc =================================================================== --- llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc +++ llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc @@ -0,0 +1,4 @@ +STRINGTABLE { + 1 "åäö © ƒ \xe5\xe4\366 \251 \x83" + 2 L"åäö © ƒ \xe5\xe4\366 \251 \x0192" +} Index: llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc =================================================================== --- llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc +++ llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc @@ -0,0 +1,5 @@ +STRINGTABLE { + // One can't pass UTF-8 sequences via multiple escaped chars - in narrow + // strings in UTF-8 mode, only ASCII chars can be entered via escapes. + 1 "åäö \xc3\xa5" +} Index: llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc =================================================================== --- llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc +++ llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc @@ -0,0 +1,6 @@ +STRINGTABLE { + // One can't pass UTF-8 sequences via multiple escaped chars - in narrow + // strings in UTF-8 mode, only ASCII chars can be entered via escapes. + 1 "åäö © \x61" + 2 L"åäö © \xe5\xe4\366 \251" +} Index: llvm/trunk/test/tools/llvm-rc/codepage.test =================================================================== --- llvm/trunk/test/tools/llvm-rc/codepage.test +++ llvm/trunk/test/tools/llvm-rc/codepage.test @@ -0,0 +1,44 @@ +; RUN: llvm-rc /C 65001 /FO %t.utf8.res %p/Inputs/utf8.rc +; RUN: llvm-readobj %t.utf8.res | FileCheck %s --check-prefix=UTF8 + +; UTF8: Resource type (int): 6 +; UTF8-NEXT: Resource name (int): 1 +; UTF8-NEXT: Data version: 0 +; UTF8-NEXT: Memory flags: 0x1030 +; UTF8-NEXT: Language ID: 1033 +; UTF8-NEXT: Version (major): 0 +; UTF8-NEXT: Version (minor): 0 +; UTF8-NEXT: Characteristics: 0 +; UTF8-NEXT: Data size: 68 +; UTF8-NEXT: Data: ( +; UTF8-NEXT: 0000: 00000700 E500E400 F6002000 A9002000 |.......... ... .| +; UTF8-NEXT: 0010: 61000B00 E500E400 F6002000 A9002000 |a......... ... .| +; UTF8-NEXT: 0020: E500E400 F6002000 A9000000 00000000 |...... .........| +; UTF8-NEXT: 0030: 00000000 00000000 00000000 00000000 |................| +; UTF8-NEXT: 0040: 00000000 |....| +; UTF8-NEXT: ) + +; RUN: not llvm-rc /C 65001 /FO %t.utf8-escape-narrow.res %p/Inputs/utf8-escape-narrow.rc 2>&1 | FileCheck %s --check-prefix UTF8_ESCAPE +; UTF8_ESCAPE: llvm-rc: Error in STRINGTABLE statement (ID 1): +; UTF8_ESCAPE-NEXT: Unable to interpret single byte (195) as UTF-8 + +; RUN: llvm-rc /C 1252 /FO %t.cp1252.res %p/Inputs/cp1252.rc +; RUN: llvm-readobj %t.cp1252.res | FileCheck %s --check-prefix=CP1252 + +; CP1252: Resource type (int): 6 +; CP1252-NEXT: Resource name (int): 1 +; CP1252-NEXT: Data version: 0 +; CP1252-NEXT: Memory flags: 0x1030 +; CP1252-NEXT: Language ID: 1033 +; CP1252-NEXT: Version (major): 0 +; CP1252-NEXT: Version (minor): 0 +; CP1252-NEXT: Characteristics: 0 +; CP1252-NEXT: Data size: 92 +; CP1252-NEXT: Data: ( +; CP1252-NEXT: 0000: 00000F00 E500E400 F6002000 A9002000 |.......... ... .| +; CP1252-NEXT: 0010: 92012000 E500E400 F6002000 A9002000 |.. ....... ... .| +; CP1252-NEXT: 0020: 92010F00 E500E400 F6002000 A9002000 |.......... ... .| +; CP1252-NEXT: 0030: 92012000 E500E400 F6002000 A9002000 |.. ....... ... .| +; CP1252-NEXT: 0040: 92010000 00000000 00000000 00000000 |................| +; CP1252-NEXT: 0050: 00000000 00000000 00000000 |............| +; CP1252-NEXT: ) Index: llvm/trunk/test/tools/llvm-rc/helpmsg.test =================================================================== --- llvm/trunk/test/tools/llvm-rc/helpmsg.test +++ llvm/trunk/test/tools/llvm-rc/helpmsg.test @@ -7,6 +7,7 @@ ; CHECK-DAG: USAGE: rc [options] ; CHECK-DAG: OPTIONS: ; CHECK-NEXT: /? Display this help and exit. +; CHECK-NEXT: /C Set the codepage used for input strings. ; CHECK-NEXT: /dry-run Don't compile the input; only try to parse it. ; CHECK-NEXT: /D Define a symbol for the C preprocessor. ; CHECK-NEXT: /FO Change the output file location. Index: llvm/trunk/tools/llvm-rc/Opts.td =================================================================== --- llvm/trunk/tools/llvm-rc/Opts.td +++ llvm/trunk/tools/llvm-rc/Opts.td @@ -35,6 +35,9 @@ def DRY_RUN : Flag<[ "/", "-" ], "dry-run">, HelpText<"Don't compile the input; only try to parse it.">; +def CODEPAGE : JoinedOrSeparate<[ "/", "-" ], "C">, + HelpText<"Set the codepage used for input strings.">; + // Unused switches (at least for now). These will stay unimplemented // in an early stage of development and can be ignored. However, we need to // parse them in order to preserve the compatibility with the original tool. @@ -44,7 +47,6 @@ def SL : Flag<[ "/", "-" ], "SL">; // (Codepages support.) -def C : Flag<[ "/", "-" ], "C">; def W : Flag<[ "/", "-" ], "W">; // (Support of MUI and similar.) Index: llvm/trunk/tools/llvm-rc/ResourceFileWriter.h =================================================================== --- llvm/trunk/tools/llvm-rc/ResourceFileWriter.h +++ llvm/trunk/tools/llvm-rc/ResourceFileWriter.h @@ -25,15 +25,25 @@ namespace rc { -struct SearchParams { +enum CodePage { + CpAcp = 0, // The current used codepage. Since there's no such + // notion in LLVM what codepage it actually means, + // this only allows ASCII. + CpWin1252 = 1252, // A codepage where most 8 bit values correspond to + // unicode code points with the same value. + CpUtf8 = 65001, // UTF-8. +}; + +struct WriterParams { std::vector Include; // Additional folders to search for files. std::vector NoInclude; // Folders to exclude from file search. StringRef InputFilePath; // The full path of the input file. + int CodePage = CpAcp; // The codepage for interpreting characters. }; class ResourceFileWriter : public Visitor { public: - ResourceFileWriter(const SearchParams &Params, + ResourceFileWriter(const WriterParams &Params, std::unique_ptr Stream) : Params(Params), FS(std::move(Stream)), IconCursorID(1) { assert(FS && "Output stream needs to be provided to the serializator"); @@ -146,7 +156,7 @@ Error writeVersionInfoBlock(const VersionInfoBlock &); Error writeVersionInfoValue(const VersionInfoValue &); - const SearchParams &Params; + const WriterParams &Params; // Output stream handling. std::unique_ptr FS; Index: llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp =================================================================== --- llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp +++ llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp @@ -110,6 +110,18 @@ return true; } +static UTF16 cp1252ToUnicode(unsigned char C) { + static const UTF16 Map80[] = { + 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, + 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f, + 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, + 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178, + }; + if (C >= 0x80 && C <= 0x9F) + return Map80[C - 0x80]; + return C; +} + // Describes a way to handle '\0' characters when processing the string. // rc.exe tool sometimes behaves in a weird way in postprocessing. // If the string to be output is equivalent to a C-string (e.g. in MENU @@ -132,10 +144,26 @@ // * Replace the escape sequences with their processed version. // For identifiers, this is no-op. static Error processString(StringRef Str, NullHandlingMethod NullHandler, - bool &IsLongString, SmallVectorImpl &Result) { + bool &IsLongString, SmallVectorImpl &Result, + int CodePage) { bool IsString = stripQuotes(Str, IsLongString); SmallVector Chars; - convertUTF8ToUTF16String(Str, Chars); + + // Convert the input bytes according to the chosen codepage. + if (CodePage == CpUtf8) { + convertUTF8ToUTF16String(Str, Chars); + } else if (CodePage == CpWin1252) { + for (char C : Str) + Chars.push_back(cp1252ToUnicode((unsigned char)C)); + } else { + // For other, unknown codepages, only allow plain ASCII input. + for (char C : Str) { + if ((unsigned char)C > 0x7F) + return createError("Non-ASCII 8-bit codepoint (" + Twine(C) + + ") can't be interpreted in the current codepage"); + Chars.push_back((unsigned char)C); + } + } if (!IsString) { // It's an identifier if it's not a string. Make all characters uppercase. @@ -157,21 +185,35 @@ if (Char > 0xFF) return createError("Non-8-bit codepoint (" + Twine(Char) + ") can't occur in a user-defined narrow string"); + } + } + Result.push_back(Char); + return Error::success(); + }; + auto AddEscapedChar = [AddRes, IsLongString, CodePage](UTF16 Char) -> Error { + if (!IsLongString) { + // Escaped chars in narrow strings have to be interpreted according to + // the chosen code page. + if (Char > 0xFF) + return createError("Non-8-bit escaped char (" + Twine(Char) + + ") can't occur in narrow string"); + if (CodePage == CpUtf8) { + if (Char >= 0x80) + return createError("Unable to interpret single byte (" + Twine(Char) + + ") as UTF-8"); + } else if (CodePage == CpWin1252) { + Char = cp1252ToUnicode(Char); } else { - // In case of narrow non-user strings, Windows RC converts - // [0x80, 0xFF] chars according to the current codepage. - // There is no 'codepage' concept settled in every supported platform, - // so we should reject such inputs. - if (Char > 0x7F && Char <= 0xFF) + // Unknown/unsupported codepage, only allow ASCII input. + if (Char > 0x7F) return createError("Non-ASCII 8-bit codepoint (" + Twine(Char) + ") can't " "occur in a non-Unicode string"); } } - Result.push_back(Char); - return Error::success(); + return AddRes(Char); }; while (Pos < Chars.size()) { @@ -223,7 +265,7 @@ --RemainingChars; } - RETURN_IF_ERROR(AddRes(ReadInt)); + RETURN_IF_ERROR(AddEscapedChar(ReadInt)); continue; } @@ -240,7 +282,7 @@ ++Pos; } - RETURN_IF_ERROR(AddRes(ReadInt)); + RETURN_IF_ERROR(AddEscapedChar(ReadInt)); continue; } @@ -328,7 +370,8 @@ SmallVector ProcessedString; bool IsLongString; RETURN_IF_ERROR(processString(Str, NullHandlingMethod::CutAtNull, - IsLongString, ProcessedString)); + IsLongString, ProcessedString, + Params.CodePage)); for (auto Ch : ProcessedString) writeInt(Ch); if (WriteTerminator) @@ -1142,6 +1185,7 @@ static bool classof(const RCResource *Res) { return Res->getKind() == RkStringTableBundle; } + Twine getResourceTypeName() const override { return "STRINGTABLE"; } }; Error ResourceFileWriter::visitStringTableBundle(const RCResource *Res) { @@ -1168,7 +1212,7 @@ SmallVector Data; RETURN_IF_ERROR(processString(Res->Bundle.Data[ID].getValueOr(StringRef()), NullHandlingMethod::CutAtDoubleNull, - IsLongString, Data)); + IsLongString, Data, Params.CodePage)); if (AppendNull && Res->Bundle.Data[ID]) Data.push_back('\0'); RETURN_IF_ERROR( @@ -1215,9 +1259,9 @@ SmallVector ProcessedString; bool IsLongString; - RETURN_IF_ERROR(processString(Elem.getString(), - NullHandlingMethod::UserResource, - IsLongString, ProcessedString)); + RETURN_IF_ERROR( + processString(Elem.getString(), NullHandlingMethod::UserResource, + IsLongString, ProcessedString, Params.CodePage)); for (auto Ch : ProcessedString) { if (IsLongString) { Index: llvm/trunk/tools/llvm-rc/llvm-rc.cpp =================================================================== --- llvm/trunk/tools/llvm-rc/llvm-rc.cpp +++ llvm/trunk/tools/llvm-rc/llvm-rc.cpp @@ -129,13 +129,29 @@ } } - SearchParams Params; + WriterParams Params; SmallString<128> InputFile(InArgsInfo[0]); llvm::sys::fs::make_absolute(InputFile); Params.InputFilePath = InputFile; Params.Include = InputArgs.getAllArgValues(OPT_INCLUDE); Params.NoInclude = InputArgs.getAllArgValues(OPT_NOINCLUDE); + if (InputArgs.hasArg(OPT_CODEPAGE)) { + if (InputArgs.getLastArgValue(OPT_CODEPAGE) + .getAsInteger(10, Params.CodePage)) + fatalError("Invalid code page: " + + InputArgs.getLastArgValue(OPT_CODEPAGE)); + switch (Params.CodePage) { + case CpAcp: + case CpWin1252: + case CpUtf8: + break; + default: + fatalError( + "Unsupported code page, only 0, 1252 and 65001 are supported!"); + } + } + std::unique_ptr Visitor; bool IsDryRun = InputArgs.hasArg(OPT_DRY_RUN);