diff --git a/clang/include/clang/Basic/CharSet.h b/clang/include/clang/Basic/CharSet.h --- a/clang/include/clang/Basic/CharSet.h +++ b/clang/include/clang/Basic/CharSet.h @@ -51,7 +51,10 @@ /// In case of an error, the result string contains the successfully converted /// part of the input string. /// - + /// If the Source parameter has a zero length, then no conversion is + /// performed. Instead, the internal conversion state of iconv is reset to + /// the initial state if iconv is used for the conversion. Otherwise it is a + /// no-op. virtual std::error_code convert(StringRef Source, SmallVectorImpl &Result, bool ShouldAutoFlush) const = 0; @@ -81,6 +84,8 @@ /// Utility class to convert between different character set encodings. /// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8. +/// If the iconv library is available, then arbitrary conversions are supported. +/// TODO Add Windows support. class CharSetConverter { // details::CharSetConverterImplBase *Converter; std::unique_ptr Converter; diff --git a/clang/include/clang/Config/config.h.cmake b/clang/include/clang/Config/config.h.cmake --- a/clang/include/clang/Config/config.h.cmake +++ b/clang/include/clang/Config/config.h.cmake @@ -57,6 +57,9 @@ /* Define if we have sys/resource.h (rlimits) */ #cmakedefine CLANG_HAVE_RLIMITS ${CLANG_HAVE_RLIMITS} +/* Define if iconv library is available */ +#cmakedefine HAVE_ICONV ${HAVE_ICONV} + /* Linker version detected at compile time. */ #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}" diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt --- a/clang/lib/Basic/CMakeLists.txt +++ b/clang/lib/Basic/CMakeLists.txt @@ -51,6 +51,17 @@ PROPERTIES COMPILE_DEFINITIONS "CLANG_VENDOR=\"${CLANG_VENDOR} \"") endif() +# Link iconv library if it is an external library. 
+find_package(Iconv) +if(Iconv_FOUND) + set(HAVE_ICONV 1) +else() + set(HAVE_ICONV 0) +endif() +if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN) + set(system_libs ${system_libs} ${Iconv_LIBRARIES}) +endif() + add_clang_library(clangBasic Attributes.cpp Builtins.cpp diff --git a/clang/lib/Basic/CharSet.cpp b/clang/lib/Basic/CharSet.cpp --- a/clang/lib/Basic/CharSet.cpp +++ b/clang/lib/Basic/CharSet.cpp @@ -22,6 +22,10 @@ #include #include +#ifdef HAVE_ICONV +#include +#endif + using namespace llvm; // Normalize the charset name with the charset alias matching algorithm proposed @@ -97,6 +101,132 @@ return std::error_code(); } +#ifdef HAVE_ICONV +class CharSetConverterIconv : public details::CharSetConverterImplBase { + iconv_t ConvDesc; // NOTE(review): no iconv_close() call is visible in this patch. + +public: + CharSetConverterIconv(iconv_t ConvDesc) : ConvDesc(ConvDesc) {} + + std::error_code convert(StringRef Source, SmallVectorImpl &Result, + bool ShouldAutoFlush) const override; + std::error_code flush() const override; + std::error_code flush(SmallVectorImpl &Result) const override; +}; + +std::error_code CharSetConverterIconv::convert(StringRef Source, + SmallVectorImpl &Result, + bool ShouldAutoFlush) const { + // Set up the input. Use nullptr to reset iconv state if input length is zero. + size_t InputLength = Source.size(); + char *Input = InputLength ? const_cast(Source.data()) : nullptr; + // Set up the output. We directly write into the SmallVector. + size_t Capacity = Result.capacity(); + Result.resize_for_overwrite(Capacity); + char *Output = InputLength ? static_cast(Result.data()) : nullptr; + size_t OutputLength = Capacity; + + size_t Ret; + + // Handle errors returned from iconv(). + auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) { + if (Ret == static_cast(-1)) { + // An error occurred. Check if we can gracefully handle it. + if (errno == E2BIG && Capacity < std::numeric_limits::max()) { + // No space left in output buffer. 
Double the size of the underlying + // memory in the SmallVectorImpl, adjust pointer and length and continue + // the conversion. NOTE(review): a zero Capacity would never grow here. + const size_t Used = Capacity - OutputLength; + Capacity = (Capacity < std::numeric_limits::max() / 2) + ? 2 * Capacity + : std::numeric_limits::max(); + Result.resize_for_overwrite(Capacity); + Output = static_cast(Result.data()) + Used; + OutputLength = Capacity - Used; + return std::error_code(); + } else { + // Some other error occurred. + return std::error_code(errno, std::generic_category()); + } + } else { + // A positive return value indicates that some characters were converted + // in a nonreversible way, that is, replaced with a SUB symbol. Returning + // an error in this case makes sure that both conversion routines behave + // in the same way. + return std::make_error_code(std::errc::illegal_byte_sequence); + } + }; + + // Convert the string. + while ((Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength))) + if (auto EC = HandleError(Ret)) + return EC; + if (ShouldAutoFlush) { + while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength))) + if (auto EC = HandleError(Ret)) + return EC; + } + + // Re-adjust size to actual size. NOTE(review): skipped on error returns. + Result.resize(Capacity - OutputLength); + return std::error_code(); +} + +std::error_code CharSetConverterIconv::flush() const { + size_t Ret = iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr); + if (Ret == static_cast(-1)) { + return std::error_code(errno, std::generic_category()); + } + return std::error_code(); +} + +std::error_code +CharSetConverterIconv::flush(SmallVectorImpl &Result) const { + char *Output = Result.data(); + size_t OutputLength = Result.capacity(); + size_t Capacity = Result.capacity(); + Result.resize_for_overwrite(Capacity); + + // Handle errors returned from iconv(). 
+ auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) { + if (Ret == static_cast(-1)) { + // An error occurred. Check if we can gracefully handle it. 
+ if (errno == E2BIG && Capacity < std::numeric_limits::max()) { + // No space left in output buffer. Increase the size of the underlying + // memory in the SmallVectorImpl by 2 bytes, adjust pointer and length + // and continue the conversion. + const size_t Used = Capacity - OutputLength; + Capacity = (Capacity < std::numeric_limits::max() - 2) + ? 2 + Capacity + : std::numeric_limits::max(); + Result.resize_for_overwrite(Capacity); + Output = static_cast(Result.data()) + Used; + OutputLength = Capacity - Used; + return std::error_code(); + } else { + // Some other error occurred. + return std::error_code(errno, std::generic_category()); + } + } else { + // A positive return value indicates that some characters were converted + // in a nonreversible way, that is, replaced with a SUB symbol. Returning + // an error in this case makes sure that both conversion routines behave + // in the same way. + return std::make_error_code(std::errc::illegal_byte_sequence); + } + }; + + size_t Ret; + while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength))) + if (auto EC = HandleError(Ret)) + return EC; + + // Re-adjust size to actual size. NOTE(review): skipped on error returns. 
+ Result.resize(Capacity - OutputLength); + return std::error_code(); +} + +#endif // HAVE_ICONV } // namespace CharSetConverter CharSetConverter::create(text_encoding::id CPFrom, @@ -120,5 +250,13 @@ std::optional To = getKnownCharSet(CSTo); if (From && To) return create(*From, *To); +#if HAVE_ICONV + iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str()); + if (ConvDesc == (iconv_t)-1) + return std::error_code(errno, std::generic_category()); + std::unique_ptr Converter = + std::make_unique(ConvDesc); + return CharSetConverter(std::move(Converter)); +#endif return std::make_error_code(std::errc::invalid_argument); } diff --git a/clang/unittests/Basic/CharSetTest.cpp b/clang/unittests/Basic/CharSetTest.cpp --- a/clang/unittests/Basic/CharSetTest.cpp +++ b/clang/unittests/Basic/CharSetTest.cpp @@ -40,6 +40,29 @@ // String with Cyrillic character ya. static const char CyrillicUTF[] = "\xd0\xaf"; +// String "Earth地球". +// ISO-2022-JP: Sequence ESC $ B (\x1B\x24\x42) switches to JIS X 0208-1983, and +// sequence ESC ( B (\x1B\x28\x42) switches back to ASCII. +// IBM-939: Byte 0x0E shifts from single byte to double byte, and 0x0F shifts +// back. +static const char EarthUTF[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90\x83"; +// Identical to above, except the final character (球) has its last byte taken +// away from it. +static const char EarthUTFBroken[] = "\x45\x61\x72\x74\x68\xe5\x9c\xb0\xe7\x90"; +static const char EarthISO2022[] = + "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65\x1B\x28\x42"; +static const char EarthISO2022ShiftBack[] = + "\x45\x61\x72\x74\x68\x1B\x24\x42\x43\x4F\x35\x65"; +static const char EarthIBM939[] = + "\xc5\x81\x99\xa3\x88\x0e\x45\xc2\x48\xdb\x0f"; +static const char ShiftBackOnly[] = "\x1B\x28\x42"; + +// String "地球". 
+static const char EarthKanjiOnlyUTF[] = "\xe5\x9c\xb0\xe7\x90\x83"; +static const char EarthKanjiOnlyISO2022[] = + "\x1B\x24\x42\x43\x4F\x35\x65\x1b\x28\x42"; +static const char EarthKanjiOnlyIBM939[] = "\x0e\x45\xc2\x48\xdb\x0f"; + TEST(CharSet, FromUTF8) { // Hello string. StringRef Src(HelloA); @@ -98,4 +121,154 @@ EXPECT_STREQ(AccentUTF, static_cast(Dst).c_str()); } +TEST(CharSet, RoundTrip) { + ErrorOr ConvToUTF16 = + CharSetConverter::create("IBM-1047", "UTF-16"); + // Stop test if conversion is not supported (no underlying iconv support). + if (!ConvToUTF16) { + ASSERT_EQ(ConvToUTF16.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + ErrorOr ConvToUTF32 = + CharSetConverter::create("UTF-16", "UTF-32"); + // Stop test if conversion is not supported (no underlying iconv support). + if (!ConvToUTF32) { + ASSERT_EQ(ConvToUTF32.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + ErrorOr ConvToEBCDIC = + CharSetConverter::create("UTF-32", "IBM-1047"); + // Stop test if conversion is not supported (no underlying iconv support). + if (!ConvToEBCDIC) { + ASSERT_EQ(ConvToEBCDIC.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Set up the source string: bytes 1..255 followed by a terminating 0. + char SrcStr[256]; + for (size_t I = 0; I < 256; ++I) + SrcStr[I] = (I + 1) % 256; + + SmallString<99> Dst1Str, Dst2Str, Dst3Str; + + std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str, true); + EXPECT_TRUE(!EC); + EC = ConvToUTF32->convert(Dst1Str, Dst2Str, true); + EXPECT_TRUE(!EC); + EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(SrcStr, static_cast(Dst3Str).c_str()); +} + +TEST(CharSet, ShiftState2022) { + // Earth string. + StringRef Src(EarthUTF); + SmallString<64> Dst; + + ErrorOr ConvTo2022 = + CharSetConverter::create("UTF-8", "ISO-2022-JP"); + // Stop test if conversion is not supported (no underlying iconv support). 
+ if (!ConvTo2022) { + ASSERT_EQ(ConvTo2022.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Check that the string is properly converted. + std::error_code EC = ConvTo2022->convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(EarthISO2022, static_cast(Dst).c_str()); +} + +TEST(CharSet, ShiftState2022Flush) { + StringRef Src0(EarthUTFBroken); + StringRef Src1(EarthKanjiOnlyUTF); + SmallString<64> Dst0; + SmallString<64> Dst1; + ErrorOr ConvTo2022Flush = + CharSetConverter::create("UTF-8", "ISO-2022-JP"); + if (!ConvTo2022Flush) { + ASSERT_EQ(ConvTo2022Flush.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // This should emit an error; there is a malformed multibyte character in the + // input string. + std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, true); + EXPECT_TRUE(EC0); + std::error_code EC1 = ConvTo2022Flush->flush(); + EXPECT_TRUE(!EC1); + std::error_code EC2 = ConvTo2022Flush->convert(Src1, Dst1, true); + EXPECT_TRUE(!EC2); + EXPECT_STREQ(EarthKanjiOnlyISO2022, static_cast(Dst1).c_str()); +} + +TEST(CharSet, ShiftStateIBM939) { + // Earth string. + StringRef Src(EarthUTF); + SmallString<64> Dst; + + ErrorOr ConvToIBM939 = + CharSetConverter::create("UTF-8", "IBM-939"); + // Stop test if conversion is not supported (no underlying iconv support). + if (!ConvToIBM939) { + ASSERT_EQ(ConvToIBM939.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Check that the string is properly converted. 
+ std::error_code EC = ConvToIBM939->convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(EarthIBM939, static_cast(Dst).c_str()); +} + +TEST(CharSet, ShiftStateIBM939Flush) { + StringRef Src0(EarthUTFBroken); + StringRef Src1(EarthKanjiOnlyUTF); + SmallString<64> Dst0; + SmallString<64> Dst1; + ErrorOr ConvTo939Flush = + CharSetConverter::create("UTF-8", "IBM-939"); + if (!ConvTo939Flush) { + ASSERT_EQ(ConvTo939Flush.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // This should emit an error; there is a malformed multibyte character in the + // input string. + std::error_code EC0 = ConvTo939Flush->convert(Src0, Dst0, true); + EXPECT_TRUE(EC0); + std::error_code EC1 = ConvTo939Flush->flush(); + EXPECT_TRUE(!EC1); + std::error_code EC2 = ConvTo939Flush->convert(Src1, Dst1, true); + EXPECT_TRUE(!EC2); + EXPECT_STREQ(EarthKanjiOnlyIBM939, static_cast(Dst1).c_str()); +} + +TEST(CharSet, ShiftState2022Flush1) { + StringRef Src0(EarthUTF); + SmallString<64> Dst0; + SmallString<64> Dst1; + ErrorOr ConvTo2022Flush = + CharSetConverter::create("UTF-8", "ISO-2022-JP"); + if (!ConvTo2022Flush) { + ASSERT_EQ(ConvTo2022Flush.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + std::error_code EC0 = ConvTo2022Flush->convert(Src0, Dst0, false); + EXPECT_TRUE(!EC0); + EXPECT_STREQ(EarthISO2022ShiftBack, static_cast(Dst0).c_str()); + std::error_code EC1 = ConvTo2022Flush->flush(Dst1); + EXPECT_TRUE(!EC1); + EXPECT_STREQ(ShiftBackOnly, static_cast(Dst1).c_str()); +} + } // namespace