Index: include/clang/Basic/SourceManager.h
===================================================================
--- include/clang/Basic/SourceManager.h
+++ include/clang/Basic/SourceManager.h
@@ -1300,7 +1300,12 @@
   /// on a file sloc, so you must choose a spelling or expansion location
   /// before calling this method.
+  ///
+  /// If \p BytePosition is true (the default) the result is the 1-based byte
+  /// column within the line. Otherwise every UTF-8 multi-byte sequence in
+  /// front of the position counts as a single column and combining
+  /// diacritical marks (U+0300..U+036F) count as none.
   unsigned getColumnNumber(FileID FID, unsigned FilePos,
-                           bool *Invalid = nullptr) const;
+                           bool *Invalid = nullptr,
+                           bool BytePosition = true) const;
   unsigned getSpellingColumnNumber(SourceLocation Loc,
                                    bool *Invalid = nullptr) const;
   unsigned getExpansionColumnNumber(SourceLocation Loc,
Index: lib/Basic/SourceManager.cpp
===================================================================
--- lib/Basic/SourceManager.cpp
+++ lib/Basic/SourceManager.cpp
@@ -1084,11 +1084,56 @@
   return Buffer->getBufferStart() + (CharDataInvalid? 0 : LocInfo.second);
 }
 
+/// Convert a 1-based byte column into a 1-based character column.
+///
+/// \p Buf is the file buffer, \p LineStart the offset of the first byte of the
+/// line and \p Column the 1-based byte column of the location. Only the
+/// Column - 1 bytes in front of the location are inspected: every UTF-8
+/// sequence counts as one column, except combining diacritical marks
+/// (U+0300..U+036F, encoded as 0xCC 0x80 .. 0xCD 0xAF), which count as none.
+static unsigned correctForMultiByteChars(const char *Buf, unsigned LineStart,
+                                         unsigned Column) {
+  // Nothing precedes the first column, so there is nothing to correct.
+  if (Column <= 1)
+    return Column;
+
+  const unsigned NumPrefixBytes = Column - 1;
+  auto byteAt = [Buf, LineStart](unsigned I) -> unsigned char {
+    return static_cast<unsigned char>(Buf[LineStart + I]);
+  };
+
+  // A combining mark is two bytes long; both must lie inside the prefix.
+  auto isDiacriticMark = [&byteAt, NumPrefixBytes](unsigned I) -> bool {
+    if (I + 1 >= NumPrefixBytes)
+      return false;
+    const unsigned char FirstByte = byteAt(I);
+    const unsigned char SecondByte = byteAt(I + 1);
+    if (FirstByte == 0xcc)
+      return SecondByte >= 0x80 && SecondByte <= 0xbf;
+    if (FirstByte == 0xcd)
+      return SecondByte >= 0x80 && SecondByte <= 0xaf;
+    return false;
+  };
+
+  unsigned CorrectedColumn = 1;
+  for (unsigned I = 0; I < NumPrefixBytes;) {
+    const unsigned char FirstByte = byteAt(I);
+    // The lead byte determines the sequence length; stray continuation bytes
+    // (0x80..0xBF) are treated like single-byte characters.
+    const unsigned Length =
+        FirstByte < 0xc0 ? 1 : FirstByte < 0xe0 ? 2 : FirstByte < 0xf0 ? 3 : 4;
+    if (!isDiacriticMark(I))
+      ++CorrectedColumn;
+    I += Length;
+  }
+  return CorrectedColumn;
+}
 
 /// getColumnNumber - Return the column # for the specified file position.
 /// this is significantly cheaper to compute than the line number.
 unsigned SourceManager::getColumnNumber(FileID FID, unsigned FilePos,
-                                        bool *Invalid) const {
+                                        bool *Invalid,
+                                        bool BytePosition) const {
   bool MyInvalid = false;
   llvm::MemoryBuffer *MemBuf = getBuffer(FID, &MyInvalid);
   if (Invalid)
@@ -1122,14 +1167,18 @@
         if (Buf[FilePos - 1] == '\r' || Buf[FilePos - 1] == '\n')
           --FilePos;
       }
-      return FilePos - LineStart + 1;
+      unsigned Column = FilePos - LineStart + 1;
+      return BytePosition ? Column
+                          : correctForMultiByteChars(Buf, LineStart, Column);
     }
   }
 
   unsigned LineStart = FilePos;
   while (LineStart && Buf[LineStart-1] != '\n' && Buf[LineStart-1] != '\r')
     --LineStart;
-  return FilePos-LineStart+1;
+  unsigned Column = FilePos - LineStart + 1;
+  return BytePosition ? Column
+                      : correctForMultiByteChars(Buf, LineStart, Column);
 }
 
 // isInvalid - Return the result of calling loc.isInvalid(), and
@@ -1454,7 +1503,8 @@
   unsigned LineNo = getLineNumber(LocInfo.first, LocInfo.second, &Invalid);
   if (Invalid)
     return PresumedLoc();
-  unsigned ColNo = getColumnNumber(LocInfo.first, LocInfo.second, &Invalid);
+  unsigned ColNo = getColumnNumber(LocInfo.first, LocInfo.second, &Invalid,
+                                   /*BytePosition=*/false);
   if (Invalid)
     return PresumedLoc();
 
Index: test/Misc/diag-utf8.cpp
===================================================================
--- /dev/null
+++ test/Misc/diag-utf8.cpp
@@ -0,0 +1,10 @@
+// RUN: not %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck %s
+
+struct Foo { int member; };
+
+void f(Foo foo)
+{
+    "ideeen" << foo; // CHECK: {{.*[/\\]}}diag-utf8.cpp:7:14: error: invalid operands to binary expression ('const char *' and 'Foo')
+    "ideëen" << foo; // CHECK: {{.*[/\\]}}diag-utf8.cpp:8:14: error: invalid operands to binary expression ('const char *' and 'Foo')
+    "idez̈en" << foo; // CHECK: {{.*[/\\]}}diag-utf8.cpp:9:14: error: invalid operands to binary expression ('const char *' and 'Foo')
+}