diff --git a/clang/include/clang/Basic/CharSet.h b/clang/include/clang/Basic/CharSet.h new file mode 100644 --- /dev/null +++ b/clang/include/clang/Basic/CharSet.h @@ -0,0 +1,160 @@ +//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides a utility class to convert between different character +/// set encodings. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_CHARSET_H +#define LLVM_SUPPORT_CHARSET_H + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Config/config.h" +#include "llvm/Support/ErrorOr.h" + +#include +#include +#include + +namespace llvm { + +template class SmallVectorImpl; + +namespace details { +class CharSetConverterImplBase { +public: + virtual ~CharSetConverterImplBase() = default; + + /// Converts a string. + /// \param[in] Source source string + /// \param[in,out] Result container for converted string + /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion + /// for multi-byte encodings iff true. + /// \return error code in case something went wrong + /// + /// The following error codes can occur, among others: + /// - std::errc::argument_list_too_long: The result requires more than + /// std::numeric_limits::max() bytes. + /// - std::errc::illegal_byte_sequence: The input contains an invalid + /// multibyte sequence. + /// - std::errc::invalid_argument: The input contains an incomplete + /// multibyte sequence. + /// + /// In case of an error, the result string contains the successfully converted + /// part of the input string. + /// + + virtual std::error_code convert(StringRef Source, + SmallVectorImpl &Result, + bool ShouldAutoFlush) const = 0; + + /// Restore the conversion to the original state. + /// \return error code in case something went wrong + /// + /// If the original character set or the destination character set + /// are multi-byte character sets, set the shift state to the initial + /// state. Otherwise this is a no-op. + virtual std::error_code flush() const = 0; + + virtual std::error_code flush(SmallVectorImpl &Result) const = 0; +}; +} // namespace details + +// Names inspired by https://wg21.link/p1885. +namespace text_encoding { +enum class id { + /// UTF-8 character set encoding. + UTF8, + + /// IBM EBCDIC 1047 character set encoding. + IBM1047 +}; +} // end namespace text_encoding + +/// Utility class to convert between different character set encodings. +/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8. +class CharSetConverter { + // details::CharSetConverterImplBase *Converter; + std::unique_ptr Converter; + + CharSetConverter(std::unique_ptr Converter) + : Converter(std::move(Converter)) {} + +public: + /// Creates a CharSetConverter instance. + /// \param[in] CSFrom name of the source character encoding + /// \param[in] CSTo name of the target character encoding + /// \return a CharSetConverter instance + static CharSetConverter create(text_encoding::id CSFrom, + text_encoding::id CSTo); + + /// Creates a CharSetConverter instance. + /// Returns std::errc::invalid_argument in case the requested conversion is + /// not supported. + /// \param[in] CPFrom name of the source character encoding + /// \param[in] CPTo name of the target character encoding + /// \return a CharSetConverter instance or an error code + static ErrorOr create(StringRef CPFrom, StringRef CPTo); + + CharSetConverter(const CharSetConverter &) = delete; + CharSetConverter &operator=(const CharSetConverter &) = delete; + + CharSetConverter(CharSetConverter &&Other) { + Converter = std::move(Other.Converter); + } + + CharSetConverter &operator=(CharSetConverter &&Other) { + if (this != &Other) + Converter = std::move(Other.Converter); + return *this; + } + + ~CharSetConverter() = default; + + /// Converts a string. + /// \param[in] Source source string + /// \param[in,out] Result container for converted string + /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion + /// for multi-byte encodings. + /// \return error code in case something went wrong + std::error_code convert(StringRef Source, SmallVectorImpl &Result, + bool ShouldAutoFlush = true) const { + return Converter->convert(Source, Result, ShouldAutoFlush); + } + + char convert(char SingleChar) const { + SmallString<1> Result; + Converter->convert(StringRef(&SingleChar, 1), Result, false); + return Result[0]; + } + + /// Converts a string. + /// \param[in] Source source string + /// \param[in,out] Result container for converted string + /// \param[in] ShouldAutoFlush Append shift-back sequence after conversion + /// for multi-byte encodings iff true. + /// \return error code in case something went wrong + std::error_code convert(const std::string &Source, + SmallVectorImpl &Result, + bool ShouldAutoFlush = true) const { + return convert(StringRef(Source), Result, ShouldAutoFlush); + } + + std::error_code flush() const { return Converter->flush(); } + + std::error_code flush(SmallVectorImpl &Result) const { + return Converter->flush(Result); + } +}; + +} // namespace llvm + +#endif diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt --- a/clang/lib/Basic/CMakeLists.txt +++ b/clang/lib/Basic/CMakeLists.txt @@ -56,6 +56,7 @@ Builtins.cpp CLWarnings.cpp CharInfo.cpp + CharSet.cpp CodeGenOptions.cpp Cuda.cpp DarwinSDKInfo.cpp diff --git a/clang/lib/Basic/CharSet.cpp b/clang/lib/Basic/CharSet.cpp new file mode 100644 --- /dev/null +++ b/clang/lib/Basic/CharSet.cpp @@ -0,0 +1,124 @@ +//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides utility classes to convert between different character +/// set encoding. +/// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/CharSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/ConvertEBCDIC.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace llvm; + +// Normalize the charset name with the charset alias matching algorithm proposed +// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching. +void normalizeCharSetName(StringRef CSName, SmallVectorImpl &Normalized) { + bool PrevDigit = false; + for (auto Ch : CSName) { + if (isAlnum(Ch)) { + Ch = toLower(Ch); + if (Ch != '0' || PrevDigit) { + PrevDigit = isDigit(Ch); + Normalized.push_back(Ch); + } + } + } +} + +// Maps the charset name to enum constant if possible. +std::optional getKnownCharSet(StringRef CSName) { + SmallString<16> Normalized; + normalizeCharSetName(CSName, Normalized); +#define CSNAME(CS, STR) \ + if (Normalized.equals(STR)) \ + return CS + CSNAME(text_encoding::id::UTF8, "utf8"); + CSNAME(text_encoding::id::IBM1047, "ibm1047"); +#undef CSNAME + return std::nullopt; +} + +namespace { +enum ConversionType { + UTFToIBM1047, + IBM1047ToUTF, +}; + +// Support conversion between EBCDIC 1047 and UTF8. This class uses +// built-in translation tables that allow for translation between the +// aforementioned character sets. The use of tables for conversion is only +// possible because EBCDIC 1047 is a single-byte, stateless encoding; other +// character sets are not supported. +class CharSetConverterTable : public details::CharSetConverterImplBase { + ConversionType ConvType; + +public: + CharSetConverterTable(ConversionType ConvType) : ConvType(ConvType) {} + + std::error_code convert(StringRef Source, SmallVectorImpl &Result, + bool ShouldAutoFlush) const override; + std::error_code flush() const override; + std::error_code flush(SmallVectorImpl &Result) const override; +}; + +std::error_code CharSetConverterTable::convert(StringRef Source, + SmallVectorImpl &Result, + bool ShouldAutoFlush) const { + if (ConvType == IBM1047ToUTF) { + ConverterEBCDIC::convertToUTF8(Source, Result); + return std::error_code(); + } else if (ConvType == UTFToIBM1047) { + return ConverterEBCDIC::convertToEBCDIC(Source, Result); + } + llvm_unreachable("Invalid ConvType!"); + return std::error_code(); +} + +std::error_code CharSetConverterTable::flush() const { + return std::error_code(); +} + +std::error_code +CharSetConverterTable::flush(SmallVectorImpl &Result) const { + return std::error_code(); +} + +} // namespace + +CharSetConverter CharSetConverter::create(text_encoding::id CPFrom, + text_encoding::id CPTo) { + + assert(CPFrom != CPTo && "Text encodings should be distinct"); + + ConversionType Conversion; + if (CPFrom == text_encoding::id::UTF8 && CPTo == text_encoding::id::IBM1047) + Conversion = UTFToIBM1047; + else + Conversion = IBM1047ToUTF; + std::unique_ptr Converter = + std::make_unique(Conversion); + return CharSetConverter(std::move(Converter)); +} + +ErrorOr CharSetConverter::create(StringRef CSFrom, + StringRef CSTo) { + std::optional From = getKnownCharSet(CSFrom); + std::optional To = getKnownCharSet(CSTo); + if (From && To) + return create(*From, *To); + return std::make_error_code(std::errc::invalid_argument); +} diff --git a/clang/unittests/Basic/CMakeLists.txt b/clang/unittests/Basic/CMakeLists.txt --- a/clang/unittests/Basic/CMakeLists.txt +++ b/clang/unittests/Basic/CMakeLists.txt @@ -4,6 +4,7 @@ add_clang_unittest(BasicTests CharInfoTest.cpp + CharSetTest.cpp DarwinSDKInfoTest.cpp DiagnosticTest.cpp FileEntryTest.cpp diff --git a/clang/unittests/Basic/CharSetTest.cpp b/clang/unittests/Basic/CharSetTest.cpp new file mode 100644 --- /dev/null +++ b/clang/unittests/Basic/CharSetTest.cpp @@ -0,0 +1,101 @@ +//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/CharSet.h" +#include "llvm/ADT/SmallString.h" +#include "gtest/gtest.h" +using namespace llvm; + +namespace { + +// String "Hello World!" +static const char HelloA[] = + "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a"; +static const char HelloE[] = + "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15"; + +// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +static const char ABCStrA[] = + "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52" + "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A" + "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A"; +static const char ABCStrE[] = + "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9" + "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91" + "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9"; + +// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë" +static const char AccentUTF[] = + "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89" + "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9" + "\xc3\xaa\xc3\xab"; +static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72" + "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53"; + +// String with Cyrillic character ya. +static const char CyrillicUTF[] = "\xd0\xaf"; + +TEST(CharSet, FromUTF8) { + // Hello string. + StringRef Src(HelloA); + SmallString<64> Dst; + + CharSetConverter Conv = CharSetConverter::create(text_encoding::id::UTF8, + text_encoding::id::IBM1047); + std::error_code EC = Conv.convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloE, static_cast(Dst).c_str()); + Dst.clear(); + + // ABC string. + Src = ABCStrA; + EC = Conv.convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrE, static_cast(Dst).c_str()); + Dst.clear(); + + // Accent string. + Src = AccentUTF; + EC = Conv.convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(AccentE, static_cast(Dst).c_str()); + Dst.clear(); + + // Cyrillic string. Results in error because not representable in 1047. + Src = CyrillicUTF; + EC = Conv.convert(Src, Dst, true); + EXPECT_EQ(EC, std::errc::illegal_byte_sequence); +} + +TEST(CharSet, ToUTF8) { + // Hello string. + StringRef Src(HelloE); + SmallString<64> Dst; + + CharSetConverter Conv = CharSetConverter::create(text_encoding::id::IBM1047, + text_encoding::id::UTF8); + std::error_code EC = Conv.convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloA, static_cast(Dst).c_str()); + Dst.clear(); + + // ABC string. + Src = ABCStrE; + EC = Conv.convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrA, static_cast(Dst).c_str()); + Dst.clear(); + + // Accent string. + Src = AccentE; + EC = Conv.convert(Src, Dst, true); + EXPECT_TRUE(!EC); + EXPECT_STREQ(AccentUTF, static_cast(Dst).c_str()); +} + +} // namespace