This is an archive of the discontinued LLVM Phabricator instance.

Differential D123711

[flang] Always encode multi-byte output in UTF-8
ClosedPublic

Authored by klausler on Apr 13 2022, 12:35 PM.

Download Raw Diff

Details

Reviewers

jeanPerier

Commits

rG664c111c958c: [flang] Always encode multi-byte output in UTF-8

Summary

A recent change that implemented UTF-8 encoding should have
made the encoding conditional only for CHARACTER(KIND=1),
to enable UTF-8 output vs. Latin-1 or whatever. UTF-8 output
of wider CHARACTER kinds should not be conditional (until we choose
to support UTF-16, maybe). As a result, wider CHARACTER kinds are
being emitted with extra zero bytes; this patch fixes them.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

klausler created this revision. · Apr 13 2022, 12:35 PM

Herald added a project: Restricted Project. · View Herald Transcript · Apr 13 2022, 12:35 PM

Herald added a subscriber: jdoerfert. · View Herald Transcript

klausler requested review of this revision. · Apr 13 2022, 12:35 PM

Harbormaster completed remote builds in B159509: Diff 422601. · Apr 13 2022, 1:00 PM

Thanks

This revision is now accepted and ready to land. · Apr 14 2022, 2:26 AM

Closed by commit rG664c111c958c: [flang] Always encode multi-byte output in UTF-8 (authored by klausler). · Explain Why · Apr 14 2022, 11:14 AM

This revision was automatically updated to reflect the committed changes.

klausler added a commit: rG664c111c958c: [flang] Always encode multi-byte output in UTF-8.

Revision Contents

Path

Size

flang/

runtime/

connection.h

7 lines

edit-output.cpp

2 lines

io-stmt.cpp

2 lines

Diff 422924

flang/runtime/connection.h

Show All 28 Lines	struct ConnectionAttributes {
std::optional<bool> isUnformatted; // FORM='UNFORMATTED' if true		std::optional<bool> isUnformatted; // FORM='UNFORMATTED' if true
bool isUTF8{false}; // ENCODING='UTF-8'		bool isUTF8{false}; // ENCODING='UTF-8'
std::optional<std::int64_t> openRecl; // RECL= on OPEN		std::optional<std::int64_t> openRecl; // RECL= on OPEN

bool IsRecordFile() const {		bool IsRecordFile() const {
// Formatted stream files are viewed as having records, at least on input		// Formatted stream files are viewed as having records, at least on input
return access != Access::Stream || !isUnformatted.value_or(true);		return access != Access::Stream || !isUnformatted.value_or(true);
}		}

		template <typename CHAR = char> constexpr bool useUTF8() const {
		// For wide CHARACTER kinds, always use UTF-8 for formatted I/O.
		// For single-byte CHARACTER, encode characters >= 0x80 with
		// UTF-8 iff the mode is set.
		return sizeof(CHAR) > 1 || isUTF8;
		}
};		};

struct ConnectionState : public ConnectionAttributes {		struct ConnectionState : public ConnectionAttributes {
bool IsAtEOF() const; // true when read has hit EOF or endfile record		bool IsAtEOF() const; // true when read has hit EOF or endfile record
bool IsAfterEndfile() const; // true after ENDFILE until repositioned		bool IsAfterEndfile() const; // true after ENDFILE until repositioned
std::size_t RemainingSpaceInRecord() const;		std::size_t RemainingSpaceInRecord() const;
bool NeedAdvance(std::size_t) const;		bool NeedAdvance(std::size_t) const;
void HandleAbsolutePosition(std::int64_t);		void HandleAbsolutePosition(std::int64_t);
▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines

flang/runtime/edit-output.cpp

Show First 20 Lines • Show All 500 Lines • ▼ Show 20 Lines	for (std::size_t j{0}; j < length; ++j) {
}		}
EmitOne(x[j]);		EmitOne(x[j]);
}		}
EmitOne(modes.delim);		EmitOne(modes.delim);
} else {		} else {
// Undelimited list-directed output		// Undelimited list-directed output
ok = ok && list.EmitLeadingSpaceOrAdvance(io, length > 0 ? 1 : 0, true);		ok = ok && list.EmitLeadingSpaceOrAdvance(io, length > 0 ? 1 : 0, true);
std::size_t put{0};		std::size_t put{0};
std::size_t oneIfUTF8{connection.isUTF8 ? 1 : length};		std::size_t oneIfUTF8{connection.useUTF8<CHAR>() ? 1 : length};
while (ok && put < length) {		while (ok && put < length) {
if (std::size_t chunk{std::min<std::size_t>(		if (std::size_t chunk{std::min<std::size_t>(
std::min<std::size_t>(length - put, oneIfUTF8),		std::min<std::size_t>(length - put, oneIfUTF8),
connection.RemainingSpaceInRecord())}) {		connection.RemainingSpaceInRecord())}) {
ok = io.EmitEncoded(x + put, chunk);		ok = io.EmitEncoded(x + put, chunk);
put += chunk;		put += chunk;
} else {		} else {
ok = io.AdvanceRecord() && io.Emit(" ", 1);		ok = io.AdvanceRecord() && io.Emit(" ", 1);
▲ Show 20 Lines • Show All 63 Lines • Show Last 20 Lines

flang/runtime/io-stmt.cpp

Show First 20 Lines • Show All 471 Lines • ▼ Show 20 Lines	bool IoStatementState::Emit(const char32_t *data, std::size_t chars) {
return std::visit([=](auto &x) { return x.get().Emit(data, chars); }, u_);		return std::visit([=](auto &x) { return x.get().Emit(data, chars); }, u_);
}		}

template <typename CHAR>		template <typename CHAR>
bool IoStatementState::EmitEncoded(const CHAR *data0, std::size_t chars) {		bool IoStatementState::EmitEncoded(const CHAR *data0, std::size_t chars) {
// Don't allow sign extension		// Don't allow sign extension
using UnsignedChar = std::make_unsigned_t<CHAR>;		using UnsignedChar = std::make_unsigned_t<CHAR>;
const UnsignedChar *data{reinterpret_cast<const UnsignedChar *>(data0)};		const UnsignedChar *data{reinterpret_cast<const UnsignedChar *>(data0)};
if (GetConnectionState().isUTF8) {		if (GetConnectionState().useUTF8<CHAR>()) {
char buffer[256];		char buffer[256];
std::size_t at{0};		std::size_t at{0};
while (chars-- > 0) {		while (chars-- > 0) {
auto len{EncodeUTF8(buffer + at, *data++)};		auto len{EncodeUTF8(buffer + at, *data++)};
at += len;		at += len;
if (at + maxUTF8Bytes > sizeof buffer) {		if (at + maxUTF8Bytes > sizeof buffer) {
if (!Emit(buffer, at)) {		if (!Emit(buffer, at)) {
return false;		return false;
▲ Show 20 Lines • Show All 1,019 Lines • Show Last 20 Lines