This is an archive of the discontinued LLVM Phabricator instance.

[llvm-rc] Support UTF inputs.
Needs ReviewPublic

Authored by mnbvmar on Sep 29 2017, 3:10 PM.

Download Raw Diff

Details

Reviewers

rnk
zturner

Summary

This enables llvm-rc to consume UTF-8 and UTF-16 little-endian inputs. The latter is a widely used Microsoft standard and accounts for a large share of the inputs. However, the original rc tool also accepts UTF-8. Therefore, we need a way to determine the file encoding and convert the input script to UTF-8.

We settle on the following algorithm: if the file starts with a UTF-16 byte order mark, or its second byte is equal to 0 (i.e., the first character in the file is in the range [0x00, 0xFF]), we guess the file is UTF-16LE. Otherwise, we guess it is UTF-8.
This method should be enough for all feasible .rc inputs.

Diff Detail

Event Timeline

mnbvmar created this revision.Sep 29 2017, 3:10 PM

mnbvmar added a parent revision: D38429: [llvm-rc] Add directory lookup support (serialization, pt 11)..

Revision Contents

Path

Size

llvm/

test/

tools/

llvm-rc/

Inputs/

utf16le.rc

utf8.rc

22 lines

utf-support.test

51 lines

tools/

llvm-rc/

llvm-rc.cpp

45 lines

Diff 117231

llvm/test/tools/llvm-rc/Inputs/utf16le.rc

This binary file was added.

llvm/test/tools/llvm-rc/Inputs/utf8.rc

This file was added.

				STRINGTABLE

				BEGIN

				1 "리소스 컴파일러"

				2 L"리소스 컴파일러"

				END


				128 450 {
				L"리소스 컴파일러"

				}

				1 MENU {
				MENUITEM "리소스 컴파일러"
				, 500
				MENUITEM L"리소스 컴파일러"
				, 501
				}

llvm/test/tools/llvm-rc/utf-support.test

This file was added.

				; Both inputs have the same contents; they're only encoded differently.

				; RUN: llvm-rc /FO %t %p/Inputs/utf8.rc
				; RUN: llvm-readobj %t \| FileCheck %s

				; RUN: llvm-rc /FO %t2 %p/Inputs/utf16le.rc
				; RUN: llvm-readobj %t2 \| FileCheck %s


				; CHECK: Resource type (int): 450
				; CHECK-NEXT: Resource name (int): 128
				; CHECK-NEXT: Data version: 0
				; CHECK-NEXT: Memory flags: 0x30
				; CHECK-NEXT: Language ID: 1033
				; CHECK-NEXT: Version (major): 0
				; CHECK-NEXT: Version (minor): 0
				; CHECK-NEXT: Characteristics: 0
				; CHECK-NEXT: Data size: 16
				; CHECK-NEXT: Data:: (AC B9 8C C1 A4 C2 20 00 F4 CE 0C D3 7C C7 EC B7)

				; CHECK-DAG: Resource type (int): 4
				; CHECK-NEXT: Resource name (int): 1
				; CHECK-NEXT: Data version: 0
				; CHECK-NEXT: Memory flags: 0x1030
				; CHECK-NEXT: Language ID: 1033
				; CHECK-NEXT: Version (major): 0
				; CHECK-NEXT: Version (minor): 0
				; CHECK-NEXT: Characteristics: 0
				; CHECK-NEXT: Data size: 48
				; CHECK-NEXT: Data: (
				; CHECK-NEXT: 0000: 00000000 0000F401 ACB98CC1 A4C22000 \|.............. .\|
				; CHECK-NEXT: 0010: F4CE0CD3 7CC7ECB7 00008000 F501ACB9 \|....\|...........\|
				; CHECK-NEXT: 0020: 8CC1A4C2 2000F4CE 0CD37CC7 ECB70000 \|.... .....\|.....\|
				; CHECK-NEXT: )

				; CHECK-DAG: Resource type (int): 6
				; CHECK-NEXT: Resource name (int): 1
				; CHECK-NEXT: Data version: 0
				; CHECK-NEXT: Memory flags: 0x1030
				; CHECK-NEXT: Language ID: 1033
				; CHECK-NEXT: Version (major): 0
				; CHECK-NEXT: Version (minor): 0
				; CHECK-NEXT: Characteristics: 0
				; CHECK-NEXT: Data size: 64
				; CHECK-NEXT: Data: (
				; CHECK-NEXT: 0000: 00000800 ACB98CC1 A4C22000 F4CE0CD3 \|.......... .....\|
				; CHECK-NEXT: 0010: 7CC7ECB7 0800ACB9 8CC1A4C2 2000F4CE \|\|........... ...\|
				; CHECK-NEXT: 0020: 0CD37CC7 ECB70000 00000000 00000000 \|..\|.............\|
				; CHECK-NEXT: 0030: 00000000 00000000 00000000 00000000 \|................\|
				; CHECK-NEXT: )

llvm/tools/llvm-rc/llvm-rc.cpp

Show All 13 Lines

#include "ResourceFileWriter.h"		#include "ResourceFileWriter.h"
#include "ResourceScriptParser.h"		#include "ResourceScriptParser.h"
#include "ResourceScriptStmt.h"		#include "ResourceScriptStmt.h"
#include "ResourceScriptToken.h"		#include "ResourceScriptToken.h"

#include "llvm/Option/Arg.h"		#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"		#include "llvm/Option/ArgList.h"
		#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Error.h"		#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"		#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ManagedStatic.h"		#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Path.h"		#include "llvm/Support/Path.h"
#include "llvm/Support/PrettyStackTrace.h"		#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Process.h"		#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"		#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines	fetchIncludes(StringRef InputName, const opt::InputArgList &InputArgs) {

std::vector<PathType> Dirs{pathToNative(sys::path::parent_path(InputName)),		std::vector<PathType> Dirs{pathToNative(sys::path::parent_path(InputName)),
pathToNative(StringRef())};		pathToNative(StringRef())};
for (auto Include : InputArgs.getAllArgValues(OPT_INCLUDE))		for (auto Include : InputArgs.getAllArgValues(OPT_INCLUDE))
Dirs.push_back(pathToNative(Include));		Dirs.push_back(pathToNative(Include));
return Dirs;		return Dirs;
}		}

		// Heuristically decide whether Str holds UTF-16LE text. Inputs shorter than
		// two bytes are never classified as UTF-16. Otherwise the guess is "yes" when:
		// * the second byte is 0, or
		// * the input begins with the UTF-16LE byte order mark (\xff\xfe).
		bool isUTF16(StringRef Str) {
		  // Too short to hold even a single UTF-16 code unit.
		  if (Str.size() < 2)
		    return false;
		  // A zero second byte is taken as a sign of UTF-16LE.
		  if (Str[1] == '\0')
		    return true;
		  return Str.startswith("\xff\xfe");
		}

		// Read the file named Filename and return its contents as a UTF-8 string.
		// The input may be UTF-8 or UTF-16LE; isUTF16() guesses which one it is (we
		// guess that a correct UTF-8 script won't contain any null bytes).
		// Calls fatalError() if the file cannot be opened, or if its contents are
		// not valid in the guessed encoding.
		std::string decodeFile(StringRef Filename) {
		  ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename);
		  if (!File)
		    fatalError("Error opening file '" + Filename +
		               "': " + File.getError().message());
		  std::unique_ptr<MemoryBuffer> FileContents = std::move(*File);

		  if (isUTF16(FileContents->getBuffer())) {
		    std::string ContentsFromUTF16;
		    if (convertUTF16ToUTF8String(
		            ArrayRef<char>(FileContents->getBufferStart(),
		                           FileContents->getBufferEnd()),
		            ContentsFromUTF16))
		      return ContentsFromUTF16;
		  } else {
		    // isLegalUTF8String() takes its start pointer by address, so give it a
		    // genuine `const UTF8 *` object instead of casting the address of a
		    // `const char *` (which would access that object through an unrelated
		    // pointer type).
		    const UTF8 *Utf8Cursor =
		        reinterpret_cast<const UTF8 *>(FileContents->getBufferStart());
		    const UTF8 *Utf8End =
		        reinterpret_cast<const UTF8 *>(FileContents->getBufferEnd());
		    if (isLegalUTF8String(&Utf8Cursor, Utf8End))
		      // Copy explicitly: the buffer is released when FileContents goes out of
		      // scope, so the caller must receive an owning string.
		      return FileContents->getBuffer().str();
		  }

		  fatalError("Input file '" + Filename + "' is neither UTF-8 nor UTF-16");
		}

} // anonymous namespace		} // anonymous namespace

int main(int argc_, const char *argv_[]) {		int main(int argc_, const char *argv_[]) {
sys::PrintStackTraceOnErrorSignal(argv_[0]);		sys::PrintStackTraceOnErrorSignal(argv_[0]);
PrettyStackTraceProgram X(argc_, argv_);		PrettyStackTraceProgram X(argc_, argv_);

ExitOnErr.setBanner("llvm-rc: ");		ExitOnErr.setBanner("llvm-rc: ");

Show All 19 Lines	int main(int argc_, const char *argv_[]) {

std::vector<std::string> InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT);		std::vector<std::string> InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT);
if (InArgsInfo.size() != 1) {		if (InArgsInfo.size() != 1) {
fatalError("Exactly one input file should be provided.");		fatalError("Exactly one input file should be provided.");
}		}

// Read and tokenize the input file.		// Read and tokenize the input file.
StringRef Filename = InArgsInfo[0];		StringRef Filename = InArgsInfo[0];
ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename);		std::string Contents = decodeFile(Filename);
if (!File) {
fatalError("Error opening file '" + Filename +
"': " + File.getError().message());
}

std::unique_ptr<MemoryBuffer> FileContents = std::move(*File);
StringRef Contents = FileContents->getBuffer();

std::vector<RCToken> Tokens = ExitOnErr(tokenizeRC(Contents));		std::vector<RCToken> Tokens = ExitOnErr(tokenizeRC(Contents));

if (BeVerbose) {		if (BeVerbose) {
const Twine TokenNames[] = {		const Twine TokenNames[] = {
#define TOKEN(Name) #Name,		#define TOKEN(Name) #Name,
#define SHORT_TOKEN(Name, Ch) #Name,		#define SHORT_TOKEN(Name, Ch) #Name,
#include "ResourceScriptTokenList.h"		#include "ResourceScriptTokenList.h"
▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines