This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
include/clang/Basic/
-
clang/
-
Basic/
-
SourceManager.h
-
lib/Basic/
-
Basic/
1
SourceManager.cpp
-
test/Misc/
-
Misc/
-
diag-utf8.cpp

Differential D33765

Show correct column nr. when multi-byte utf8 chars are used.
Needs ReviewPublic

Authored by erikjv on Jun 1 2017, 3:14 AM.

Download Raw Diff

Details

Reviewers

bkramer
klimek

Summary

Previously, the column number in a diagnostic would be the byte position
in the line. This results in incorrect column numbers when a multi-byte
UTF-8 character would be present in the input. This change corrects for
those multi-byte characters and for zero-length diacritic marks.

This fixes PR21144.

Diff Detail

Event Timeline

erikjv created this revision.Jun 1 2017, 3:14 AM

Correctly counting columns is a bit more complicated than that... for example, consider what happens if you replace ideëen with idez̈en. See https://stackoverflow.com/questions/3634627/how-to-know-the-preferred-display-width-in-columns-of-unicode-characters .

erikjv updated this revision to Diff 117660.Oct 4 2017, 5:28 AM

erikjv edited the summary of this revision. (Show Details)

yvvan added a subscriber: yvvan.Oct 25 2017, 11:38 PM

I didn't really search for it before, but it looks like LLVM already has a routine for computing column widths? See llvm::sys::unicode::columnWidthUTF8.

There are some tools which parse clang diagnostic output; we might need a flag to control this. Not sure who would know about that?

lib/Basic/SourceManager.cpp
1501	Instead of adding a parameter to getColumnNumber, it would probably make sense to just make this caller correct the column number afterwards.

I moved all code to the TextDiagnostics, so all other interfaces still get byte offsets.

Still worried about the effect on tools which parse clang diagnostics... please send a message to cfe-dev. Hopefully we'll get responses there.

Godin added a subscriber: Godin.May 22 2018, 8:29 AM

lelf added a subscriber: lelf.May 19 2019, 7:07 PM

Revision Contents

Path

Size

include/

clang/

Basic/

SourceManager.h

3 lines

lib/

Basic/

SourceManager.cpp

52 lines

test/

Misc/

diag-utf8.cpp

10 lines

Diff 117660

include/clang/Basic/SourceManager.h

Show First 20 Lines • Show All 1,294 Lines • ▼ Show 20 Lines	public:

/// \brief Return the column # for the specified file position.		/// \brief Return the column # for the specified file position.
///		///
/// This is significantly cheaper to compute than the line number. This		/// This is significantly cheaper to compute than the line number. This
/// returns zero if the column number isn't known. This may only be called		/// returns zero if the column number isn't known. This may only be called
/// on a file sloc, so you must choose a spelling or expansion location		/// on a file sloc, so you must choose a spelling or expansion location
/// before calling this method.		/// before calling this method.
unsigned getColumnNumber(FileID FID, unsigned FilePos,		unsigned getColumnNumber(FileID FID, unsigned FilePos,
bool *Invalid = nullptr) const;		bool *Invalid = nullptr,
		bool BytePosition = true) const;
unsigned getSpellingColumnNumber(SourceLocation Loc,		unsigned getSpellingColumnNumber(SourceLocation Loc,
bool *Invalid = nullptr) const;		bool *Invalid = nullptr) const;
unsigned getExpansionColumnNumber(SourceLocation Loc,		unsigned getExpansionColumnNumber(SourceLocation Loc,
bool *Invalid = nullptr) const;		bool *Invalid = nullptr) const;
unsigned getPresumedColumnNumber(SourceLocation Loc,		unsigned getPresumedColumnNumber(SourceLocation Loc,
bool *Invalid = nullptr) const;		bool *Invalid = nullptr) const;

/// \brief Given a SourceLocation, return the spelling line number		/// \brief Given a SourceLocation, return the spelling line number
▲ Show 20 Lines • Show All 457 Lines • Show Last 20 Lines

lib/Basic/SourceManager.cpp

Show First 20 Lines • Show All 1,078 Lines • ▼ Show 20 Lines	const char *SourceManager::getCharacterData(SourceLocation SL,
}		}
llvm::MemoryBuffer *Buffer = Entry.getFile().getContentCache()->getBuffer(		llvm::MemoryBuffer *Buffer = Entry.getFile().getContentCache()->getBuffer(
Diag, *this, SourceLocation(), &CharDataInvalid);		Diag, *this, SourceLocation(), &CharDataInvalid);
if (Invalid)		if (Invalid)
*Invalid = CharDataInvalid;		*Invalid = CharDataInvalid;
return Buffer->getBufferStart() + (CharDataInvalid? 0 : LocInfo.second);		return Buffer->getBufferStart() + (CharDataInvalid? 0 : LocInfo.second);
}		}

		/// Convert a 1-based byte column into a 1-based character column.
		///
		/// \param Buf        Start of the buffer holding the (presumably UTF-8) text.
		/// \param LineStart  Byte offset in \p Buf of the first byte of the line.
		/// \param Column     1-based byte column of the position on that line, i.e.
		///                   the position is the byte at Buf[LineStart + Column - 1].
		/// \returns The 1-based column obtained by counting every multi-byte
		///          sequence before the position as a single column and every
		///          combining diacritical mark (U+0300..U+036F) as zero columns.
		///
		/// Only the bytes strictly before the position are examined: the character
		/// located at the position must not influence its own column. If the
		/// position falls inside a multi-byte sequence, the column of the character
		/// containing the position is returned. The result is never less than 1 for
		/// Column >= 1; a Column of 0 is returned unchanged.
		static unsigned correctForMultiByteChars(const char *Buf, unsigned LineStart,
		                                         unsigned Column) {
		  if (Column <= 1)
		    return Column;

		  // Number of bytes that precede the position on this line.
		  const unsigned NumBytes = Column - 1;

		  const auto ByteAt = [Buf, LineStart](unsigned I) -> unsigned char {
		    return static_cast<unsigned char>(Buf[LineStart + I]);
		  };

		  // The Combining Diacritical Marks block U+0300..U+036F is encoded as
		  // 0xCC 0x80..0xBF (U+0300..U+033F) and 0xCD 0x80..0xAF (U+0340..U+036F).
		  const auto IsCombiningMark = [](unsigned char Lead,
		                                  unsigned char Trail) -> bool {
		    if (Lead == 0xcc)
		      return Trail >= 0x80 && Trail <= 0xbf;
		    if (Lead == 0xcd)
		      return Trail >= 0x80 && Trail <= 0xaf;
		    return false;
		  };

		  unsigned CorrectedColumn = Column;
		  unsigned I = 0;
		  while (I < NumBytes) {
		    const unsigned char Lead = ByteAt(I);
		    // ASCII and stray continuation bytes each take one column.
		    if (Lead < 0xc0) {
		      ++I;
		      continue;
		    }

		    // Sequence length announced by the lead byte.
		    const unsigned SeqLen = Lead < 0xe0 ? 2 : (Lead < 0xf0 ? 3 : 4);
		    const unsigned Remaining = NumBytes - I;

		    if (Remaining < SeqLen) {
		      // The position lies inside this character: none of its bytes before
		      // the position contribute a column of their own.
		      CorrectedColumn -= Remaining;
		      break;
		    }

		    if (SeqLen == 2 && IsCombiningMark(Lead, ByteAt(I + 1)))
		      CorrectedColumn -= 2; // Zero-width: both bytes vanish.
		    else
		      CorrectedColumn -= SeqLen - 1; // One column for the whole sequence.
		    I += SeqLen;
		  }
		  return CorrectedColumn;
		}

/// getColumnNumber - Return the column # for the specified file position.		/// getColumnNumber - Return the column # for the specified file position.
/// this is significantly cheaper to compute than the line number.		/// this is significantly cheaper to compute than the line number.
unsigned SourceManager::getColumnNumber(FileID FID, unsigned FilePos,		unsigned SourceManager::getColumnNumber(FileID FID, unsigned FilePos,
bool *Invalid) const {		bool *Invalid,
		bool BytePosition) const {
bool MyInvalid = false;		bool MyInvalid = false;
llvm::MemoryBuffer *MemBuf = getBuffer(FID, &MyInvalid);		llvm::MemoryBuffer *MemBuf = getBuffer(FID, &MyInvalid);
if (Invalid)		if (Invalid)
*Invalid = MyInvalid;		*Invalid = MyInvalid;

if (MyInvalid)		if (MyInvalid)
return 1;		return 1;

Show All 17 Lines	if (FilePos >= LineStart && FilePos < LineEnd) {
// LineEnd is the LineStart of the next line.		// LineEnd is the LineStart of the next line.
// A line ends with separator LF or CR+LF on Windows.		// A line ends with separator LF or CR+LF on Windows.
// FilePos might point to the last separator,		// FilePos might point to the last separator,
// but we need a column number at most 1 + the last column.		// but we need a column number at most 1 + the last column.
if (FilePos + 1 == LineEnd && FilePos > LineStart) {		if (FilePos + 1 == LineEnd && FilePos > LineStart) {
if (Buf[FilePos - 1] == '\r' || Buf[FilePos - 1] == '\n')		if (Buf[FilePos - 1] == '\r' || Buf[FilePos - 1] == '\n')
--FilePos;		--FilePos;
}		}
return FilePos - LineStart + 1;		unsigned Column = FilePos - LineStart + 1;
		return BytePosition ? Column
		: correctForMultiByteChars(Buf, LineStart, Column);
}		}
}		}

unsigned LineStart = FilePos;		unsigned LineStart = FilePos;
while (LineStart && Buf[LineStart-1] != '\n' && Buf[LineStart-1] != '\r')		while (LineStart && Buf[LineStart-1] != '\n' && Buf[LineStart-1] != '\r')
--LineStart;		--LineStart;
return FilePos-LineStart+1;		unsigned Column = FilePos - LineStart + 1;
		return BytePosition ? Column
		: correctForMultiByteChars(Buf, LineStart, Column);
}		}

// isInvalid - Return the result of calling loc.isInvalid(), and		// isInvalid - Return the result of calling loc.isInvalid(), and
// if Invalid is not null, set its value to same.		// if Invalid is not null, set its value to same.
template<typename LocType>		template<typename LocType>
static bool isInvalid(LocType Loc, bool *Invalid) {		static bool isInvalid(LocType Loc, bool *Invalid) {
bool MyInvalid = Loc.isInvalid();		bool MyInvalid = Loc.isInvalid();
if (Invalid)		if (Invalid)
▲ Show 20 Lines • Show All 308 Lines • ▼ Show 20 Lines	PresumedLoc SourceManager::getPresumedLoc(SourceLocation Loc,
if (C->OrigEntry)		if (C->OrigEntry)
Filename = C->OrigEntry->getName();		Filename = C->OrigEntry->getName();
else		else
Filename = C->getBuffer(Diag, *this)->getBufferIdentifier();		Filename = C->getBuffer(Diag, *this)->getBufferIdentifier();

unsigned LineNo = getLineNumber(LocInfo.first, LocInfo.second, &Invalid);		unsigned LineNo = getLineNumber(LocInfo.first, LocInfo.second, &Invalid);
if (Invalid)		if (Invalid)
return PresumedLoc();		return PresumedLoc();
unsigned ColNo = getColumnNumber(LocInfo.first, LocInfo.second, &Invalid);		unsigned ColNo = getColumnNumber(LocInfo.first, LocInfo.second, &Invalid,
		/*BytePosition=*/false);
		efriedmaUnsubmitted Not Done Reply Inline Actions Instead of adding a parameter to getColumnNumber, it would probably make sense to just make this caller correct the column number afterwards. efriedma: Instead of adding a parameter to getColumnNumber, it would probably make sense to just make…
if (Invalid)		if (Invalid)
return PresumedLoc();		return PresumedLoc();

SourceLocation IncludeLoc = FI.getIncludeLoc();		SourceLocation IncludeLoc = FI.getIncludeLoc();

// If we have #line directives in this file, update and overwrite the physical		// If we have #line directives in this file, update and overwrite the physical
// location info if appropriate.		// location info if appropriate.
if (UseLineDirectives && FI.hasLineDirectives()) {		if (UseLineDirectives && FI.hasLineDirectives()) {
▲ Show 20 Lines • Show All 768 Lines • Show Last 20 Lines

test/Misc/diag-utf8.cpp

This file was added.

				// RUN: not %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck %s

				struct Foo { int member; };

				void f(Foo foo)
				{
				"ideeen" << foo; // CHECK: {{.*[/\\]}}diag-utf8.cpp:7:14: error: invalid operands to binary expression ('const char *' and 'Foo')
				"ideëen" << foo; // CHECK: {{.*[/\\]}}diag-utf8.cpp:8:14: error: invalid operands to binary expression ('const char *' and 'Foo')
				"idez̈en" << foo; // CHECK: {{.*[/\\]}}diag-utf8.cpp:9:14: error: invalid operands to binary expression ('const char *' and 'Foo')
				}