diff --git a/clang/test/Analysis/checker-plugins.c b/clang/test/Analysis/checker-plugins.c
--- a/clang/test/Analysis/checker-plugins.c
+++ b/clang/test/Analysis/checker-plugins.c
@@ -116,4 +116,5 @@
 // RUN: 2>&1 | FileCheck %s -check-prefix=CHECK-CHECKER-OPTION-HELP
 
 // CHECK-CHECKER-OPTION-HELP: example.MyChecker:ExampleOption (bool) This is an
-// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default: false)
+// CHECK-CHECKER-OPTION-HELP-SAME: example checker opt. (default:
+// CHECK-CHECKER-OPTION-HELP-NEXT: false)
diff --git a/llvm/include/llvm/Support/FormattedStream.h b/llvm/include/llvm/Support/FormattedStream.h
--- a/llvm/include/llvm/Support/FormattedStream.h
+++ b/llvm/include/llvm/Support/FormattedStream.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_FORMATTEDSTREAM_H
 #define LLVM_SUPPORT_FORMATTEDSTREAM_H
 
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include <utility>
 
@@ -40,6 +41,14 @@
   ///
   const char *Scanned;
 
+  /// PartialUTF8Char - Either empty or a prefix of a UTF-8 code unit sequence
+  /// for a Unicode scalar value which should be prepended to the buffer for the
+  /// next call to ComputePosition. This is needed when the buffer is flushed
+  /// when it ends part-way through the UTF-8 encoding of a Unicode scalar
+  /// value, so that we can compute the display width of the character once we
+  /// have the rest of it.
+  SmallString<4> PartialUTF8Char;
+
   void write_impl(const char *Ptr, size_t Size) override;
 
   /// current_pos - Return the current position within the stream,
@@ -52,10 +61,16 @@
   }
 
   /// ComputePosition - Examine the given output buffer and figure out the new
-  /// position after output.
-  ///
+  /// position after output. This is safe to call multiple times on the same
+  /// buffer, as it records the most recently scanned character and resumes from
+  /// there when the buffer has not been flushed.
   void ComputePosition(const char *Ptr, size_t size);
 
+  /// UpdatePosition - scan the characters in [Ptr, Ptr+Size), and update the
+  /// line and column numbers. Unlike ComputePosition, this must be called
+  /// exactly once on each region of the buffer.
+  void UpdatePosition(const char *Ptr, size_t Size);
+
   void setStream(raw_ostream &Stream) {
     releaseStream();
 
@@ -105,11 +120,19 @@
   /// \param NewCol - The column to move to.
   formatted_raw_ostream &PadToColumn(unsigned NewCol);
 
   /// getColumn - Return the column number
-  unsigned getColumn() { return Position.first; }
+  unsigned getColumn() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.first;
+  }
 
   /// getLine - Return the line number
-  unsigned getLine() { return Position.second; }
+  unsigned getLine() {
+    // Calculate current position, taking buffer contents into account.
+    ComputePosition(getBufferStart(), GetNumBytesInBuffer());
+    return Position.second;
+  }
 
   raw_ostream &resetColor() override {
     TheStream->resetColor();
diff --git a/llvm/lib/Support/FormattedStream.cpp b/llvm/lib/Support/FormattedStream.cpp
--- a/llvm/lib/Support/FormattedStream.cpp
+++ b/llvm/lib/Support/FormattedStream.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Locale.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -19,16 +21,24 @@
 
 /// UpdatePosition - Examine the given char sequence and figure out which
 /// column we end up in after output, and how many line breaks are contained.
-///
-static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+/// This assumes that the input string is well-formed UTF-8, and takes into
+/// account Unicode characters which render as multiple columns wide.
+void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) {
   unsigned &Column = Position.first;
   unsigned &Line = Position.second;
 
-  // Keep track of the current column and line by scanning the string for
-  // special characters
-  for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
-    ++Column;
-    switch (*Ptr) {
+  auto ProcessCodePoint = [&Line, &Column](StringRef CP) {
+    int Width = sys::locale::columnWidth(CP);
+    // columnWidth returns -1 for non-printing characters.
+    if (Width != -1)
+      Column += Width;
+
+    // A multi-byte sequence never encodes one of the ASCII control characters
+    // handled below, so there is nothing more to do for it.
+    if (CP.size() > 1)
+      return;
+
+    switch (CP[0]) {
     case '\n':
       Line += 1;
       LLVM_FALLTHROUGH;
@@ -40,6 +50,46 @@
       Column += (8 - (Column & 0x7)) & 0x7;
       break;
     }
+  };
+
+  // If we have a partial UTF-8 sequence from the previous buffer, check that
+  // first.
+  if (!PartialUTF8Char.empty()) {
+    size_t NumBytes = getNumBytesForUTF8(PartialUTF8Char[0]);
+    if (NumBytes > (PartialUTF8Char.size() + Size)) {
+      // If we still don't have enough bytes for a complete code point, just
+      // append what we have.
+      PartialUTF8Char.append(StringRef(Ptr, Size));
+      return;
+    } else {
+      // The first few bytes from the buffer will complete the code point.
+      // Concatenate them and process their effect on the line and column
+      // numbers.
+      size_t BytesFromBuffer = NumBytes - PartialUTF8Char.size();
+      PartialUTF8Char.append(StringRef(Ptr, BytesFromBuffer));
+      ProcessCodePoint(PartialUTF8Char);
+      PartialUTF8Char.clear();
+      Ptr += BytesFromBuffer;
+      Size -= BytesFromBuffer;
+    }
+  }
+
+  // Now scan the rest of the buffer.
+  unsigned NumBytes;
+  for (const char *End = Ptr + Size; Ptr < End; Ptr += NumBytes) {
+    NumBytes = getNumBytesForUTF8(*Ptr);
+
+    // The buffer might end part way through a UTF-8 code unit sequence for a
+    // Unicode scalar value if it got flushed. If this happens, we can't know
+    // the display width until we see the rest of the code point. Stash the
+    // bytes we do have, so that we can reconstruct the whole code point later,
+    // even if the buffer is being flushed.
+    if (NumBytes > static_cast<size_t>(End - Ptr)) {
+      PartialUTF8Char = StringRef(Ptr, End - Ptr);
+      return;
+    }
+
+    ProcessCodePoint(StringRef(Ptr, NumBytes));
   }
 }
 
@@ -52,9 +102,9 @@
   if (Ptr <= Scanned && Scanned <= Ptr + Size)
     // Scan all characters added since our last scan to determine the new
     // column.
-    UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+    UpdatePosition(Scanned, Size - (Scanned - Ptr));
   else
-    UpdatePosition(Position, Ptr, Size);
+    UpdatePosition(Ptr, Size);
 
   // Update the scanning pointer.
   Scanned = Ptr + Size;
diff --git a/llvm/test/MC/ARM/lsl-zero.s b/llvm/test/MC/ARM/lsl-zero.s
--- a/llvm/test/MC/ARM/lsl-zero.s
+++ b/llvm/test/MC/ARM/lsl-zero.s
@@ -1,6 +1,6 @@
-// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
-// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
-// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
+// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
+// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
+// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>/dev/null | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
 
 // lsl #0 is actually mov, so here we check that it behaves the same as
 // mov with regards to the permitted registers and how it behaves in an
diff --git a/llvm/unittests/Support/formatted_raw_ostream_test.cpp
b/llvm/unittests/Support/formatted_raw_ostream_test.cpp
--- a/llvm/unittests/Support/formatted_raw_ostream_test.cpp
+++ b/llvm/unittests/Support/formatted_raw_ostream_test.cpp
@@ -29,4 +29,143 @@
   }
 }
 
+TEST(formatted_raw_ostreamTest, Test_LineColumn) {
+  // Test tracking of line and column numbers in a stream.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  formatted_raw_ostream C(B);
+
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  C << "a";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+
+  C << "bcdef";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(6U, C.getColumn());
+
+  // '\n' increments line number, sets column to zero.
+  C << "\n";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\r' sets column to zero without changing line number.
+  C << "foo\r";
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(0U, C.getColumn());
+
+  // '\t' advances column to the next multiple of 8.
+  // FIXME: A tab written at a column that is already a multiple of 8 does not
+  // advance (the old byte-counting code moved to the next stop); is this right?
+  C << "1\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "\t";
+  EXPECT_EQ(8U, C.getColumn());
+  C << "1234567\t";
+  EXPECT_EQ(16U, C.getColumn());
+  EXPECT_EQ(1U, C.getLine());
+}
+
+TEST(formatted_raw_ostreamTest, Test_Flush) {
+  // Flushing the buffer causes the characters in the buffer to be scanned
+  // before the buffer is emptied, so line and column numbers will still be
+  // tracked properly.
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  C << "\nabc";
+  EXPECT_EQ(4U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(1U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(32);
+  formatted_raw_ostream C(B);
+
+  // U+00A0 Non-breaking space: encoded as two bytes, but only one column wide.
+  C << u8"\u00a0";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+
+  // U+2468 CIRCLED DIGIT NINE: encoded as three bytes, but only one column
+  // wide.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(2U, C.getColumn());
+  EXPECT_EQ(5U, C.GetNumBytesInBuffer());
+
+  // U+00010000 LINEAR B SYLLABLE B008 A: encoded as four bytes, but only one
+  // column wide.
+  C << u8"\U00010000";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(3U, C.getColumn());
+  EXPECT_EQ(9U, C.GetNumBytesInBuffer());
+
+  // U+55B5, CJK character, encodes as three bytes, takes up two columns.
+  C << u8"\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(12U, C.GetNumBytesInBuffer());
+
+  // U+200B, zero-width space, encoded as three bytes but has no effect on the
+  // column or line number.
+  C << u8"\u200b";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(5U, C.getColumn());
+  EXPECT_EQ(15U, C.GetNumBytesInBuffer());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8Buffered) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(4);
+  formatted_raw_ostream C(B);
+
+  // This character encodes as three bytes, so will cause the buffer to be
+  // flushed after the first byte (4 byte buffer, 3 bytes already written). We
+  // need to save the first part of the UTF-8 encoding until after the buffer is
+  // cleared and the remaining two bytes are written, at which point we can
+  // check the display width. In this case the display width is 1, so we end at
+  // column 4, with 6 bytes written in total, 2 of which are in the buffer.
+  C << u8"123\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(4U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(6U, A.size());
+
+  // Same as above, but with a CJK character which displays as two columns.
+  C << u8"123\u55b5";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(9U, C.getColumn());
+  EXPECT_EQ(2U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(12U, A.size());
+}
+
+TEST(formatted_raw_ostreamTest, Test_UTF8TinyBuffer) {
+  SmallString<128> A;
+  raw_svector_ostream B(A);
+  B.SetBufferSize(1);
+  formatted_raw_ostream C(B);
+
+  // The stream has a one-byte buffer, so it gets flushed multiple times while
+  // printing a single Unicode character.
+  C << u8"\u2468";
+  EXPECT_EQ(0U, C.getLine());
+  EXPECT_EQ(1U, C.getColumn());
+  EXPECT_EQ(0U, C.GetNumBytesInBuffer());
+  C.flush();
+  EXPECT_EQ(3U, A.size());
+}
 }