diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -342,6 +342,9 @@
   /// input is a header file (i.e. -x c-header).
   bool IsHeaderFile = false;

+  /// Name of the exec charset to convert the internal charset to.
+  std::string ExecCharset;
+
   LangOptions();

   // Define accessors/mutators for language options of enumeration type.
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -90,6 +90,13 @@
          isStringLiteral(K) || K == tok::header_name;
 }

+/// Return true if this is a utf literal kind.
+inline bool isUTFLiteral(TokenKind K) {
+  return K == tok::utf8_char_constant || K == tok::utf8_string_literal ||
+         K == tok::utf16_char_constant || K == tok::utf16_string_literal ||
+         K == tok::utf32_char_constant || K == tok::utf32_string_literal;
+}
+
 /// Return true if this is any of tok::annot_* kinds.
 bool isAnnotation(TokenKind K);

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4424,6 +4424,11 @@

 let Flags = [CC1Option, CC1AsOption, NoDriverOption] in {

+def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<codepage>">,
+  HelpText<"Set the execution <codepage> for string and character literals.\n"
+           "Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
+           "and those supported by the host iconv library.">,
+  MarshallingInfoString>;
 def target_cpu : Separate<["-"], "target-cpu">,
   HelpText<"Target a specific cpu type">,
   MarshallingInfoString>;
diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h
new file mode 100644
--- /dev/null
+++ b/clang/include/clang/Lex/LiteralConverter.h
@@ -0,0 +1,47 @@
+//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
+#define LLVM_CLANG_LEX_LITERALCONVERTER_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
+
+/// Selects the charset a literal is converted to.
+enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
+
+/// Caches the charset converters used to translate literals from the internal
+/// charset to the system and exec charsets.
+class LiteralConverter {
+  llvm::StringRef InternalCharset;
+  llvm::StringRef SystemCharset;
+  llvm::StringRef ExecCharset;
+  /// Converters from the internal charset, keyed by target charset name.
+  llvm::StringMap<llvm::CharSetConverter> CharsetConverters;
+
+public:
+  /// Returns the cached converter to \p Codepage, or nullptr if none is cached.
+  llvm::CharSetConverter *getConverter(const char *Codepage);
+  /// Returns the cached converter for the charset selected by \p Action, or
+  /// nullptr if none is cached.
+  llvm::CharSetConverter *getConverter(ConversionAction Action);
+  /// Ensures a converter from the internal charset to \p To is cached, creating
+  /// it if necessary. Returns nullptr if the conversion is not supported.
+  llvm::CharSetConverter *createAndInsertCharConverter(const char *To);
+  /// Records the charsets selected by \p Opts and \p TInfo and creates the
+  /// converters for them; an unsupported exec charset is reported via \p Diags.
+  void setConvertersFromOptions(const clang::LangOptions &Opts,
+                                const clang::TargetInfo &TInfo,
+                                clang::DiagnosticsEngine &Diags);
+};
+
+#endif
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -17,10 +17,12 @@
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/LiteralConverter.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CharSet.h"
 #include "llvm/Support/DataTypes.h"

 namespace clang {
@@ -185,9 +187,8 @@
   SmallString<32> UDSuffixBuf;
   unsigned UDSuffixOffset;
 public:
-  CharLiteralParser(const char *begin, const char *end,
-                    SourceLocation Loc, Preprocessor &PP,
-                    tok::TokenKind kind);
+  CharLiteralParser(const char *begin, const char *end, SourceLocation Loc,
+                    Preprocessor &PP, tok::TokenKind kind);

   bool hadError() const { return HadError; }
   bool isAscii() const { return Kind == tok::char_constant; }
@@ -212,6 +213,7 @@
   const LangOptions &Features;
   const TargetInfo &Target;
   DiagnosticsEngine *Diags;
+  LiteralConverter *LiteralConv;

   unsigned MaxTokenLength;
   unsigned SizeBound;
@@ -223,19 +225,19 @@
   unsigned UDSuffixToken;
   unsigned UDSuffixOffset;
 public:
-  StringLiteralParser(ArrayRef<Token> StringToks,
-                      Preprocessor &PP, bool Complain = true);
-  StringLiteralParser(ArrayRef<Token> StringToks,
-                      const SourceManager &sm, const LangOptions &features,
-                      const TargetInfo &target,
+  StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
+                      bool Complain = true,
+                      ConversionAction Action = ToExecCharset);
+  StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
+                      const LangOptions &features, const TargetInfo &target,
                       DiagnosticsEngine *diags = nullptr)
-    : SM(sm), Features(features), Target(target), Diags(diags),
-      MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
-      ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
-    init(StringToks);
+      : SM(sm), Features(features), Target(target), Diags(diags),
+        LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
+        Kind(tok::unknown), ResultPtr(ResultBuf.data()), hadError(false),
+        Pascal(false) {
+    init(StringToks, NoConversion);
   }
-

   bool hadError;
   bool
Pascal;

@@ -278,7 +280,7 @@
   static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);

 private:
-  void init(ArrayRef<Token> StringToks);
+  void init(ArrayRef<Token> StringToks, ConversionAction Action);
   bool CopyStringFragment(const Token &Tok, const char *TokBegin,
                           StringRef Fragment);
   void DiagnoseLexingError(SourceLocation Loc);
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -23,6 +23,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralConverter.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/ModuleLoader.h"
 #include "clang/Lex/ModuleMap.h"
@@ -141,6 +142,7 @@
   std::unique_ptr<ScratchBuffer> ScratchBuf;
   HeaderSearch &HeaderInfo;
   ModuleLoader &TheModuleLoader;
+  LiteralConverter LiteralConv;

   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
@@ -931,6 +933,7 @@
   SelectorTable &getSelectorTable() { return Selectors; }
   Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
   llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
+  LiteralConverter &getLiteralConverter() { return LiteralConv; }

   void setExternalSource(ExternalPreprocessorSource *Source) {
     ExternalSource = Source;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -36,6 +36,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/CharSet.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Compression.h"
@@ -6218,14 +6219,23 @@
           << value;
   }

-  // -fexec_charset=UTF-8 is default. Reject others
+  // The exec charset defaults to the system charset of the target. A value
+  // given with -fexec-charset= overrides it, provided a converter from UTF-8
+  // to that charset can be created; otherwise the value is diagnosed.
+  StringRef ExecCharsetName = Triple.getSystemCharset();
   if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
     StringRef value = execCharset->getValue();
-    if (!value.equals_lower("utf-8"))
-      D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
-                                          << value;
+    if (llvm::CharSetConverter::create("UTF-8", value))
+      ExecCharsetName = value;
+    else
+      D.Diag(diag::err_drv_invalid_value)
+          << execCharset->getAsString(Args) << value;
   }
+  // Emit the option exactly once, so the cc1 command line does not carry the
+  // default in front of the user's choice.
+  CmdArgs.push_back("-fexec-charset");
+  CmdArgs.push_back(Args.MakeArgString(ExecCharsetName));

   RenderDiagnosticsOptions(D, Args, CmdArgs);

   // -fno-asm-blocks is default.
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -12,6 +12,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/LangStandard.h"
 #include "clang/Basic/SourceManager.h"
@@ -29,6 +30,7 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Frontend/VerifyDiagnosticConsumer.h"
 #include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/LiteralConverter.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
@@ -529,6 +531,8 @@
                            /*ShowAllHeaders=*/true, /*OutputPath=*/"",
                            /*ShowDepth=*/true, /*MSStyle=*/true);
   }
+  PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
+                                                     getDiagnostics());
 }

 std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) {
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -7,6 +7,7 @@
   HeaderMap.cpp
   HeaderSearch.cpp
   Lexer.cpp
+
LiteralConverter.cpp
   LiteralSupport.cpp
   MacroArgs.cpp
   MacroInfo.cpp
diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/LiteralConverter.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/Lex/LiteralConverter.cpp
@@ -0,0 +1,75 @@
+//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/LiteralConverter.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+/// Returns the cached converter to \p Codepage, or nullptr if there is none.
+llvm::CharSetConverter *LiteralConverter::getConverter(const char *Codepage) {
+  auto Iter = CharsetConverters.find(Codepage);
+  if (Iter != CharsetConverters.end())
+    return &Iter->second;
+  return nullptr;
+}
+
+/// Returns the cached converter to the charset selected by \p Action, or
+/// nullptr if there is none.
+llvm::CharSetConverter *
+LiteralConverter::getConverter(ConversionAction Action) {
+  StringRef CodePage;
+  if (Action == ToSystemCharset)
+    CodePage = SystemCharset;
+  else if (Action == ToExecCharset)
+    CodePage = ExecCharset;
+  else
+    CodePage = InternalCharset;
+  return getConverter(CodePage.data());
+}
+
+/// Returns the converter from the internal charset to \p To, creating and
+/// caching it on first use. Returns nullptr if the conversion is unsupported.
+llvm::CharSetConverter *
+LiteralConverter::createAndInsertCharConverter(const char *To) {
+  if (llvm::CharSetConverter *Converter = getConverter(To))
+    return Converter;
+
+  auto ErrorOrConverter =
+      llvm::CharSetConverter::create(InternalCharset.data(), To);
+  if (!ErrorOrConverter)
+    return nullptr;
+  // Hand back the converter that now lives in the map; the result of the
+  // lookup above is null on this path.
+  auto Inserted = CharsetConverters.insert_or_assign(
+      StringRef(To), std::move(*ErrorOrConverter));
+  return &Inserted.first->second;
+}
+
+/// Records the internal, system and exec charsets and creates the converters
+/// between them, reporting an unsupported exec charset through \p Diags.
+void LiteralConverter::setConvertersFromOptions(
+    const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
+    clang::DiagnosticsEngine &Diags) {
+  using namespace llvm;
+  SystemCharset = TInfo.getTriple().getSystemCharset();
+  InternalCharset = "UTF-8";
+  ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset;
+  // Create converter between internal and system charset
+  if (!InternalCharset.equals(SystemCharset))
+    createAndInsertCharConverter(SystemCharset.data());
+
+  // Create converter between internal and exec charset specified
+  // in fexec-charset option.
+  if (InternalCharset.equals(ExecCharset))
+    return;
+  if (!createAndInsertCharConverter(ExecCharset.data())) {
+    Diags.Report(clang::diag::err_drv_invalid_value)
+        << "-fexec-charset" << ExecCharset;
+  }
+}
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -93,7 +93,8 @@
                                   const char *ThisTokEnd, bool &HadError,
                                   FullSourceLoc Loc, unsigned CharWidth,
                                   DiagnosticsEngine *Diags,
-                                  const LangOptions &Features) {
+                                  const LangOptions &Features,
+                                  llvm::CharSetConverter *Converter) {
   const char *EscapeBegin = ThisTokBuf;

   // Skip the '\' char.
@@ -102,6 +103,8 @@
   // We know that this character can't be off the end of the buffer, because
   // that would have been \", which would not have been the end of string.
   unsigned ResultChar = *ThisTokBuf++;
+  bool Translate = true;
+  bool Invalid = false;
   switch (ResultChar) {
   // These map to themselves.
   case '\\': case '\'': case '"': case '?': break;
@@ -142,6 +145,7 @@
     ResultChar = 11;
     break;
   case 'x': { // Hex escape.
+    Translate = false;
     ResultChar = 0;
     if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
       if (Diags)
@@ -179,6 +183,7 @@
   case '4': case '5': case '6': case '7': {
     // Octal escapes.
     --ThisTokBuf;
+    Translate = false;
     ResultChar = 0;

     // Octal escapes are a series of octal digits with maximum length 3.
@@ -210,6 +215,7 @@
           << std::string(1, ResultChar);
     break;
   default:
+    Invalid = true;
     if (!Diags)
       break;

@@ -224,6 +230,18 @@
     break;
   }

+  if (Translate && Converter) {
+    // Invalid escapes are written as '?' and then translated.
+    char ByteChar = Invalid ? '?' : ResultChar;
+    SmallString<8> ResultCharConv;
+    // ByteChar is a single char without a terminating NUL, so the length must
+    // be passed explicitly.
+    Converter->convert(StringRef(&ByteChar, 1), ResultCharConv);
+    assert(ResultCharConv.size() == 1 &&
+           "Char size increased after translation");
+    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
+    ResultChar = static_cast<unsigned char>(ResultCharConv[0]);
+  }
   return ResultChar;
 }

@@ -1261,6 +1276,7 @@
   HadError = false;

   Kind = kind;
+  LiteralConverter *LiteralConv = &PP.getLiteralConverter();

   const char *TokBegin = begin;

@@ -1322,6 +1338,10 @@
     largest_character_for_kind = 0x7Fu;
   }

+  llvm::CharSetConverter *Converter = nullptr;
+  if (!isUTFLiteral(Kind) && LiteralConv)
+    Converter = LiteralConv->getConverter(ToExecCharset);
+
   while (begin != end) {
     // Is this a span of non-escape characters?
     if (begin[0] != '\\') {
@@ -1359,6 +1379,18 @@
           HadError = true;
           PP.Diag(Loc, diag::err_character_too_large);
         }
+        if (!HadError && Converter) {
+          assert(Kind != tok::wide_char_constant &&
+                 "Wide character translation not supported");
+          char ByteChar = *tmp_out_start;
+          SmallString<1> ConvertedChar;
+          // Pass the length explicitly: ByteChar is not NUL-terminated.
+          Converter->convert(StringRef(&ByteChar, 1), ConvertedChar);
+          assert(ConvertedChar.size() == 1 &&
+                 "Char size increased after translation");
+          // Avoid sign-extending bytes >= 0x80 into the code point buffer.
+          *tmp_out_start = static_cast<unsigned char>(ConvertedChar[0]);
+        }
       }
     }

@@ -1381,9 +1411,11 @@
       }
       unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
+      // Simple escapes are translated to the exec charset just like ordinary
+      // characters, matching what StringLiteralParser::init does.
       uint64_t result =
-        ProcessCharEscape(TokBegin, begin, end, HadError,
-                          FullSourceLoc(Loc,PP.getSourceManager()),
-                          CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
+          ProcessCharEscape(TokBegin, begin, end, HadError,
+                            FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
+                            &PP.getDiagnostics(), PP.getLangOpts(), Converter);
       *buffer_begin++ = result;
     }

@@ -1491,17 +1521,21 @@
 ///         hex-digit hex-digit hex-digit hex-digit
 /// \endverbatim
 ///
-StringLiteralParser::
-StringLiteralParser(ArrayRef<Token> StringToks,
-                    Preprocessor &PP, bool Complain)
-  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
-    Target(PP.getTargetInfo()), Diags(Complain ?
&PP.getDiagnostics() :nullptr),
-    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
-    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
-  init(StringToks);
+
+StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
+                                         Preprocessor &PP, bool Complain,
+                                         ConversionAction Action)
+    : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
+      Target(PP.getTargetInfo()),
+      Diags(Complain ? &PP.getDiagnostics() : nullptr),
+      LiteralConv(&PP.getLiteralConverter()), MaxTokenLength(0), SizeBound(0),
+      CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+      hadError(false), Pascal(false) {
+  init(StringToks, Action);
 }

-void StringLiteralParser::init(ArrayRef<Token> StringToks){
+void StringLiteralParser::init(ArrayRef<Token> StringToks,
+                               ConversionAction Action) {
   // The literal token may have come from an invalid source location (e.g. due
   // to a PCH error), in which case the token length will be 0.
   if (StringToks.empty() || StringToks[0].getLength() < 2)
@@ -1577,6 +1611,10 @@

   SourceLocation UDSuffixTokLoc;

+  llvm::CharSetConverter *Converter = nullptr;
+  if (!isUTFLiteral(Kind) && LiteralConv)
+    Converter = LiteralConv->getConverter(Action);
+
   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
     const char *ThisTokBuf = &TokenBuf[0];
     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
@@ -1684,6 +1722,20 @@
           if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
             hadError = true;

+          if (!hadError && Converter) {
+            assert(Kind != tok::wide_string_literal &&
+                   "Wide character translation not supported");
+            SmallString<256> CpConv;
+            int ResultLength = BeforeCRLF.size() * CharByteWidth;
+            char *Cp = ResultPtr - ResultLength;
+            Converter->convert(StringRef(Cp, ResultLength), CpConv);
+            assert(CpConv.size() <= static_cast<size_t>(ResultLength) &&
+                   "Converted fragment must not outgrow the copied fragment");
+            // Copy exactly the converted bytes; the converted text can be
+            // shorter than the input that was copied into the buffer.
+            memmove(Cp, CpConv.data(), CpConv.size());
+            ResultPtr = Cp + CpConv.size();
+          }
           // Point into the \n inside the \r\n sequence and operate on the
           // remaining portion of the literal.
           RemainingTokenSpan = AfterCRLF.substr(1);
@@ -1717,25 +1765,51 @@
           ++ThisTokBuf;
         } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

+        int Length = ThisTokBuf - InStart;
         // Copy the character span over.
         if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                StringRef(InStart, ThisTokBuf - InStart)))
           hadError = true;
+
+        if (!hadError && Converter) {
+          assert(Kind != tok::wide_string_literal &&
+                 "Wide character translation not supported");
+          SmallString<256> CpConv;
+          int ResultLength = Length * CharByteWidth;
+          char *Cp = ResultPtr - ResultLength;
+          Converter->convert(StringRef(Cp, ResultLength), CpConv);
+          assert(CpConv.size() <= static_cast<size_t>(ResultLength) &&
+                 "Converted fragment must not outgrow the copied fragment");
+          // Copy exactly the converted bytes; the converted text can be
+          // shorter than the input that was copied into the buffer.
+          memmove(Cp, CpConv.data(), CpConv.size());
+          ResultPtr = Cp + CpConv.size();
+        }
         continue;
       }
       // Is this a Universal Character Name escape?
       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
-        EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
-                        ResultPtr, hadError,
+        char *Cp = ResultPtr;
+        EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr,
+                        hadError,
                         FullSourceLoc(StringToks[i].getLocation(), SM),
                         CharByteWidth, Diags, Features);
+
+        if (!hadError && Converter) {
+          SmallString<8> CpConv;
+          // The bytes written for the escape are [Cp, ResultPtr); they are not
+          // NUL-terminated, so pass the length explicitly.
+          Converter->convert(StringRef(Cp, ResultPtr - Cp), CpConv);
+          memmove(Cp, CpConv.data(), CpConv.size());
+          ResultPtr = Cp + CpConv.size();
+        }
         continue;
       }
       // Otherwise, this is a non-UCN escape character.  Process it.
       unsigned ResultChar =
-        ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
-                          FullSourceLoc(StringToks[i].getLocation(), SM),
-                          CharByteWidth*8, Diags, Features);
+          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
+                            FullSourceLoc(StringToks[i].getLocation(), SM),
+                            CharByteWidth * 8, Diags, Features, Converter);

       if (CharByteWidth == 4) {
         // FIXME: Make the type of the result buffer correct instead of
@@ -1929,8 +1997,8 @@
       ByteNo -= Len;
     } else {
       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
-                        FullSourceLoc(Tok.getLocation(), SM),
-                        CharByteWidth*8, Diags, Features);
+                        FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
+                        Diags, Features, nullptr);
       --ByteNo;
     }
     assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+// CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+
+const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
+//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+
+const char *Digits = "0123456789";
+// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+
+const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
+// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+
+const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+
+const char *InvalidEscape = "\y\z";
+//CHECK: c"oo\00"
+
+const char *HexCharacters = "\x12\x13\x14";
+//CHECK: c"\12\13\14\00"
+
+const char *OctalCharacters = "\141\142\143";
+//CHECK: c"abc\00"
+
+const char singleChar =
'a'; +//CHECK: i8 -127 + +const char *UcnCharacters = "\u00E2\u00AC\U000000DF"; +//CHECK: c"B\B0Y\00" + +const char *Unicode = "ÿ"; +//CHECK: c"\DF\00" diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/systemz-charset.cpp @@ -0,0 +1,46 @@ +// RUN: %clang %s -std=c++17 -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s + +const char *RawString = R"(Hello\n)"; +//CHECK: c"\C8\85\93\93\96\E0\95\00" + +const char *MultiLineRawString = R"( +Hello +There)"; +//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00" + +char UnicodeChar8 = u8'1'; +//CHECK: i8 49 +char16_t UnicodeChar16 = u'1'; +//CHECK: i16 49 +char32_t UnicodeChar32 = U'1'; +//CHECK: i32 49 + +const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" + +const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0] + +const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0] + +const char *UnicodeString8 = u8"Hello"; +//CHECK: c"Hello\00" +const char16_t *UnicodeString16 = u"Hello"; +//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0] +const char32_t *UnicodeString32 = U"Hello"; +//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0] + +const char *UnicodeRawString8 = u8R"("Hello\")"; +//CHECK: c"\22Hello\\\22\00" +const char16_t *UnicodeRawString16 = uR"("Hello\")"; +//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0] +const char32_t *UnicodeRawString32 = UR"("Hello\")"; +//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0] + +const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF"; +//CHECK: 
c"\C3\A2\C2\AC\C3\9F\00" +const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0] +const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0] diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -210,10 +210,11 @@ // RUN: %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s // source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16' -// /execution-charset: should warn on everything except UTF-8. -// RUN: %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s -// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16' +// /execution-charset: should warn on invalid charsets. +// RUN: %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s +// execution-charset-invalid: invalid value 'invalid-charset' in '/execution-charset:invalid-charset' // + // RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // U: "-U" "mymacro" diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -209,8 +209,14 @@ // RUN: %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-CHARSET %s // CHECK-INVALID-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1' -// RUN: %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s -// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1' +// RUN: %clang -### -S 
-fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s +// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset' + +// Test that we support the following exec charsets. +// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s +// RUN: %clang -### -S -fexec-charset=ISO8859-1 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s +// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s +// INVALID-NOT: error: invalid value // Test that we don't error on these. // RUN: %clang -### -S -Werror \ @@ -224,7 +230,7 @@ // RUN: -fident -fno-ident \ // RUN: -fimplicit-templates -fno-implicit-templates \ // RUN: -finput-charset=UTF-8 \ -// RUN: -fexec-charset=UTF-8 \ +// RUN: -fexec-charset=UTF-8 \ // RUN: -fivopts -fno-ivopts \ // RUN: -fnon-call-exceptions -fno-non-call-exceptions \ // RUN: -fpermissive -fno-permissive \ diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -194,6 +194,14 @@ set(XAR_LIB xar) endif() +# Check for iconv. +find_package(Iconv) +if(Iconv_FOUND) + set(HAVE_ICONV 1) +else() + set(HAVE_ICONV 0) +endif() + # function checks check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM) find_package(Backtrace) diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -397,6 +397,9 @@ /// if the environment component is present). StringRef getOSAndEnvironmentName() const; + /// getSystemCharset - Get the system charset of the triple. 
+  StringRef getSystemCharset() const;
+
   /// @}
   /// @name Convenience Predicates
   /// @{
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -97,6 +97,9 @@
 /* Define to 1 if you have the `getrusage' function. */
 #cmakedefine HAVE_GETRUSAGE ${HAVE_GETRUSAGE}

+/* Define to 1 if you have the iconv library functions. */
+#cmakedefine HAVE_ICONV ${HAVE_ICONV}
+
 /* Define to 1 if you have the `isatty' function. */
 #cmakedefine HAVE_ISATTY 1

diff --git a/llvm/include/llvm/Support/CharSet.h b/llvm/include/llvm/Support/CharSet.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Support/CharSet.h
@@ -0,0 +1,131 @@
+//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a utility class to convert between different character
+/// set encodings.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_CHARSET_H
+#define LLVM_SUPPORT_CHARSET_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorOr.h"
+
+#include <functional>
+#include <string>
+#include <system_error>
+#include <utility>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+
+/// Utility class to convert between different character set encodings.
+/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
+/// If the iconv library is available, then arbitrary conversions are supported.
+/// TODO Add Windows support.
+class CharSetConverter {
+public:
+  using ConverterFunc =
+      std::function<std::error_code(StringRef, SmallVectorImpl<char> &)>;
+  using CleanupFunc = std::function<void(void)>;
+
+private:
+  /// Performs the actual conversion.
+  ConverterFunc Convert;
+  /// Releases whatever the conversion function holds on to; may be empty.
+  CleanupFunc Cleanup;
+
+public:
+  enum CharSetNames {
+    /// UTF-8 character set encoding.
+    CS_UTF8,
+
+    /// ISO 8859-1 (Latin-1) character set encoding.
+    CS_LATIN1,
+
+    /// IBM EBCDIC 1047 character set encoding.
+    CS_IBM1047
+  };
+
+private:
+  CharSetConverter(ConverterFunc Convert, CleanupFunc Cleanup)
+      : Convert(std::move(Convert)), Cleanup(std::move(Cleanup)) {}
+
+public:
+  /// Creates a CharSetConverter instance.
+  /// \param[in] CSFrom name of the source character encoding
+  /// \param[in] CSTo name of the target character encoding
+  /// \return a CharSetConverter instance
+  static CharSetConverter create(CharSetNames CSFrom, CharSetNames CSTo);
+
+  /// Creates a CharSetConverter instance.
+  /// Returns std::errc::invalid_argument in case the requested conversion is
+  /// not supported.
+  /// \param[in] CSFrom name of the source character encoding
+  /// \param[in] CSTo name of the target character encoding
+  /// \return a CharSetConverter instance or an error code
+  static ErrorOr<CharSetConverter> create(StringRef CSFrom, StringRef CSTo);
+
+  CharSetConverter(const CharSetConverter &) = delete;
+  CharSetConverter &operator=(const CharSetConverter &) = delete;
+
+  /// Takes over the conversion and cleanup functions of \p Other. Afterwards
+  /// \p Other no longer runs the cleanup function when it is destroyed.
+  CharSetConverter(CharSetConverter &&Other)
+      : Convert(std::move(Other.Convert)), Cleanup(std::move(Other.Cleanup)) {
+    // A moved-from std::function is in an unspecified state; clear it so the
+    // destructor of Other cannot run the cleanup a second time.
+    Other.Cleanup = nullptr;
+  }
+
+  /// Runs the cleanup function of this converter, then takes over the
+  /// functions of \p Other. Self-assignment is a no-op.
+  CharSetConverter &operator=(CharSetConverter &&Other) {
+    if (this != &Other) {
+      // Release what this converter currently holds before it is overwritten.
+      if (Cleanup)
+        Cleanup();
+      Convert = std::move(Other.Convert);
+      Cleanup = std::move(Other.Cleanup);
+      Other.Cleanup = nullptr;
+    }
+    return *this;
+  }
+
+  /// Runs the cleanup function, if there is one.
+  ~CharSetConverter() {
+    if (Cleanup)
+      Cleanup();
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(StringRef Source,
+                          SmallVectorImpl<char> &Result) const {
+    return Convert(Source, Result);
+  }
+
+  /// Converts a string.
+  /// \param[in] Source source string
+  /// \param[in,out] Result container for converted string
+  /// \return error code in case something went wrong
+  std::error_code convert(const std::string &Source,
+                          SmallVectorImpl<char> &Result) const {
+    return convert(StringRef(Source), Result);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -53,6 +53,11 @@
   set(system_libs ${system_libs} ${Z3_LIBRARIES})
 endif()

+# Link iconv library if it is an external library.
+if(Iconv_FOUND AND NOT Iconv_IS_BUILT_IN)
+  set(system_libs ${system_libs} ${Iconv_LIBRARIES})
+endif()
+
 # Override the C runtime allocator on Windows and embed it into LLVM tools & libraries
 if(LLVM_INTEGRATED_CRT_ALLOC)
   if (CMAKE_BUILD_TYPE AND NOT ${LLVM_USE_CRT_${uppercase_CMAKE_BUILD_TYPE}} MATCHES "^(MT|MTd)$")
@@ -102,6 +107,7 @@
   BuryPointer.cpp
   CachePruning.cpp
   circular_raw_ostream.cpp
+  CharSet.cpp
   Chrono.cpp
   COM.cpp
   CodeGenCoverage.cpp
diff --git a/llvm/lib/Support/CharSet.cpp b/llvm/lib/Support/CharSet.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Support/CharSet.cpp
@@ -0,0 +1,203 @@
+//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides utility classes to convert between different character +/// set encoding. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CharSet.h" +#include "llvm/ADT/SmallVector.h" +#include +#include + +#ifdef HAVE_ICONV +#include +#endif + +using namespace llvm; + +namespace { + +// Maps the charset name to enum constant if possible. +Optional getKnownCharSet(StringRef CSName) { +#define CSNAME(CS, STR) \ + if (CSName == STR) \ + return CS + CSNAME(CharSetConverter::CS_UTF8, "UTF-8"); + CSNAME(CharSetConverter::CS_LATIN1, "ISO8859-1"); + CSNAME(CharSetConverter::CS_IBM1047, "IBM-1047"); +#undef CSNAME + return None; +} + +// Character conversion between Enhanced ASCII and EBCDIC (IBM-1047). +const unsigned char ISO88591ToIBM1047[256] = { + 0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x15, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, + 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, 0x40, 0x5a, 0x7f, 0x7b, + 0x5b, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, + 0x4c, 0x7e, 0x6e, 0x6f, 0x7c, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xe2, + 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xad, 0xe0, 0xbd, 0x5f, 0x6d, + 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, + 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, + 0xa7, 0xa8, 0xa9, 0xc0, 0x4f, 0xd0, 0xa1, 0x07, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x09, 0x0a, 0x1b, + 0x30, 0x31, 0x1a, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3a, 0x3b, + 0x04, 0x14, 0x3e, 0xff, 0x41, 0xaa, 0x4a, 0xb1, 
0x9f, 0xb2, 0x6a, 0xb5, + 0xbb, 0xb4, 0x9a, 0x8a, 0xb0, 0xca, 0xaf, 0xbc, 0x90, 0x8f, 0xea, 0xfa, + 0xbe, 0xa0, 0xb6, 0xb3, 0x9d, 0xda, 0x9b, 0x8b, 0xb7, 0xb8, 0xb9, 0xab, + 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9e, 0x68, 0x74, 0x71, 0x72, 0x73, + 0x78, 0x75, 0x76, 0x77, 0xac, 0x69, 0xed, 0xee, 0xeb, 0xef, 0xec, 0xbf, + 0x80, 0xfd, 0xfe, 0xfb, 0xfc, 0xba, 0xae, 0x59, 0x44, 0x45, 0x42, 0x46, + 0x43, 0x47, 0x9c, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, + 0x8c, 0x49, 0xcd, 0xce, 0xcb, 0xcf, 0xcc, 0xe1, 0x70, 0xdd, 0xde, 0xdb, + 0xdc, 0x8d, 0x8e, 0xdf}; + +const unsigned char IBM1047ToISO88591[256] = { + 0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f, 0x97, 0x8d, 0x8e, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x9d, 0x0a, 0x08, 0x87, + 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f, 0x80, 0x81, 0x82, 0x83, + 0x84, 0x85, 0x17, 0x1b, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07, + 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, 0x98, 0x99, 0x9a, 0x9b, + 0x14, 0x15, 0x9e, 0x1a, 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, + 0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, 0x26, 0xe9, 0xea, 0xeb, + 0xe8, 0xed, 0xee, 0xef, 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e, + 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, 0xc7, 0xd1, 0xa6, 0x2c, + 0x25, 0x5f, 0x3e, 0x3f, 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, + 0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, 0xd8, 0x61, 0x62, 0x63, + 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1, + 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0xaa, 0xba, + 0xe6, 0xb8, 0xc6, 0xa4, 0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0x5b, 0xde, 0xae, 0xac, 0xa3, 0xa5, 0xb7, + 0xa9, 0xa7, 0xb6, 0xbc, 0xbd, 0xbe, 0xdd, 0xa8, 0xaf, 0x5d, 0xb4, 0xd7, + 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xad, 0xf4, + 0xf6, 0xf2, 0xf3, 0xf5, 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, + 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 
0xff, 0x5c, 0xf7, 0x53, 0x54, + 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xb3, 0xdb, + 0xdc, 0xd9, 0xda, 0x9f}; + +enum { NoUTF = 0x0, SrcIsUTF = 0x1, DstIsUTF = 0x2 }; + +std::error_code convertWithTable(const unsigned char *Table, unsigned Flags, + StringRef Source, + SmallVectorImpl &Result) { + const unsigned char *Ptr = + reinterpret_cast(Source.data()); + size_t Length = Source.size(); + while (Length--) { + unsigned char Ch = *Ptr++; + // Handle UTF-8 2-byte-sequences in input. + if (Flags & SrcIsUTF) { + if (Ch >= 128) { + // Only two-byte sequences can be decoded. + if (Ch != 0xc2 && Ch != 0xc3) + return std::make_error_code(std::errc::illegal_byte_sequence); + // Is buffer truncated? + if (!Length) + return std::make_error_code(std::errc::invalid_argument); + unsigned char Ch2 = *Ptr++; + // Is second byte well-formed? + if ((Ch2 & 0xc0) != 0x80) + return std::make_error_code(std::errc::illegal_byte_sequence); + Ch = Ch2 | (Ch << 6); + Length--; + } + } + // Translate the character. + Ch = Table ? Table[Ch] : Ch; + // Handle UTF-8 2-byte-sequences in output. + if (Flags & DstIsUTF) { + if (Ch >= 128) { + // First byte prefixed with either 0xc2 or 0xc3. + Result.push_back(static_cast(0xc0 | (Ch >> 6))); + // Second byte is either the same as the ASCII byte or ASCII byte -64. + Ch = Ch & 0xbf; + } + } + Result.push_back(static_cast(Ch)); + } + return std::error_code(); +} + +#ifdef HAVE_ICONV +std::error_code convertWithIconv(iconv_t ConvDesc, StringRef Source, + SmallVectorImpl &Result) { + // Setup the input. + size_t InputLength = Source.size(); + char *Input = const_cast(Source.data()); + // Setup the output. We directly write into the SmallVector. 
+ size_t Capacity = Result.capacity(); + Result.resize(Capacity); + char *Output = static_cast(Result.data()); + size_t OutputLength = Capacity; + + while (iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength) == + static_cast(-1)) { + if (errno == E2BIG) { + // No space left in output buffer. Double the size of the underlying + // memory in the SmallVectorImpl, adjust pointer and length and continue + // the conversion. + const size_t Used = Capacity - OutputLength; + Capacity *= 2; + Result.resize(Capacity); + Output = static_cast(Result.data()) + Used; + OutputLength = Capacity - Used; + } else + // Some other error occured. + return std::error_code(errno, std::generic_category()); + } + + // Re-adjust size to actual size. + Result.resize(Capacity - OutputLength); + return std::error_code(); +} +#endif +} // namespace + +CharSetConverter CharSetConverter::create(CharSetNames CSFrom, + CharSetNames CSTo) { + unsigned Flags = NoUTF; + if (CSFrom == CS_UTF8) + Flags |= SrcIsUTF; + if (CSTo == CS_UTF8) + Flags |= DstIsUTF; + const unsigned char *Table = nullptr; + if (CSFrom == CS_IBM1047) + Table = IBM1047ToISO88591; + if (CSTo == CS_IBM1047) + Table = ISO88591ToIBM1047; + return CharSetConverter{ + [Table, Flags](StringRef Source, SmallVectorImpl &Result) { + return convertWithTable(Table, Flags, Source, Result); + }, + nullptr}; +} + +ErrorOr CharSetConverter::create(StringRef CSFrom, + StringRef CSTo) { + Optional From = getKnownCharSet(CSFrom); + Optional To = getKnownCharSet(CSTo); + if (From && To) + return create(*From, *To); +#ifdef HAVE_ICONV + iconv_t ConvDesc = iconv_open(CSTo.str().c_str(), CSFrom.str().c_str()); + if (ConvDesc == reinterpret_cast(-1)) + return std::error_code(errno, std::generic_category()); + return CharSetConverter{ + [ConvDesc](StringRef Source, SmallVectorImpl &Result) { + return convertWithIconv(ConvDesc, Source, Result); + }, + [ConvDesc]() { iconv_close(ConvDesc); }}; +#endif + return 
std::make_error_code(std::errc::invalid_argument); +} \ No newline at end of file diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -1046,6 +1046,13 @@ return Tmp.split('-').second; // Strip second component } +// System charset on z/OS is IBM-1047 and UTF-8 otherwise +StringRef Triple::getSystemCharset() const { + if (getOS() == llvm::Triple::ZOS) + return "IBM-1047"; + return "UTF-8"; +} + static unsigned EatNumber(StringRef &Str) { assert(!Str.empty() && isDigit(Str[0]) && "Not a number"); unsigned Result = 0; diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -14,6 +14,7 @@ BlockFrequencyTest.cpp BranchProbabilityTest.cpp CachePruningTest.cpp + CharSetTest.cpp CrashRecoveryTest.cpp Casting.cpp CheckedArithmeticTest.cpp diff --git a/llvm/unittests/Support/CharSetTest.cpp b/llvm/unittests/Support/CharSetTest.cpp new file mode 100644 --- /dev/null +++ b/llvm/unittests/Support/CharSetTest.cpp @@ -0,0 +1,191 @@ +//===- unittests/Support/CharSetTest.cpp - Charset conversion tests -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CharSet.h" +#include "llvm/ADT/SmallString.h" +#include "gtest/gtest.h" +using namespace llvm; + +namespace { + +// String "Hello World!" 
+static const char HelloA[] = + "\x48\x65\x6C\x6C\x6F\x20\x57\x6F\x72\x6C\x64\x21\x0a"; +static const char HelloE[] = + "\xC8\x85\x93\x93\x96\x40\xE6\x96\x99\x93\x84\x5A\x15"; + +// String "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +static const char ABCStrA[] = + "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F\x50\x51\x52" + "\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A" + "\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A"; +static const char ABCStrE[] = + "\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9" + "\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\x81\x82\x83\x84\x85\x86\x87\x88\x89\x91" + "\x92\x93\x94\x95\x96\x97\x98\x99\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9"; + +// String "¡¢£AÄÅÆEÈÉÊaàáâãäeèéêë" +static const char AccentUTF[] = + "\xc2\xa1\xc2\xa2\xc2\xa3\x41\xc3\x84\xc3\x85\xc3\x86\x45\xc3\x88\xc3\x89" + "\xc3\x8a\x61\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\x65\xc3\xa8\xc3\xa9" + "\xc3\xaa\xc3\xab"; +static const char AccentE[] = "\xaa\x4a\xb1\xc1\x63\x67\x9e\xc5\x74\x71\x72" + "\x81\x44\x45\x42\x46\x43\x85\x54\x51\x52\x53"; + +TEST(CharSet, FromASCII) { + // Hello string. + StringRef Src(HelloA); + SmallString<64> Dst; + + CharSetConverter Conv = CharSetConverter::create( + CharSetConverter::CS_LATIN1, CharSetConverter::CS_IBM1047); + std::error_code EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloE, static_cast(Dst).c_str()); + + // ABC string. + Src = ABCStrA; + Dst.clear(); + EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrE, static_cast(Dst).c_str()); +} + +TEST(CharSet, ToASCII) { + // Hello string. + StringRef Src(HelloE); + SmallString<64> Dst; + + CharSetConverter Conv = CharSetConverter::create(CharSetConverter::CS_IBM1047, + CharSetConverter::CS_LATIN1); + std::error_code EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloA, static_cast(Dst).c_str()); + + // ABC string. 
+ Src = ABCStrE; + Dst.clear(); + EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrA, static_cast(Dst).c_str()); +} + +TEST(CharSet, FromUTF8) { + // Hello string. + StringRef Src(HelloA); + SmallString<64> Dst; + + CharSetConverter Conv = CharSetConverter::create( + CharSetConverter::CS_UTF8, CharSetConverter::CS_IBM1047); + std::error_code EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloE, static_cast(Dst).c_str()); + + // ABC string. + Src = ABCStrA; + Dst.clear(); + EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrE, static_cast(Dst).c_str()); + + // Accent string. + Src = AccentUTF; + Dst.clear(); + EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(AccentE, static_cast(Dst).c_str()); +} + +TEST(CharSet, ToUTF8) { + // Hello string. + StringRef Src(HelloE); + SmallString<64> Dst; + + CharSetConverter Conv = CharSetConverter::create(CharSetConverter::CS_IBM1047, + CharSetConverter::CS_UTF8); + std::error_code EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloA, static_cast(Dst).c_str()); + + // ABC string. + Src = ABCStrE; + Dst.clear(); + EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrA, static_cast(Dst).c_str()); + + // Accent string. + Src = AccentE; + Dst.clear(); + EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(AccentUTF, static_cast(Dst).c_str()); +} + +TEST(CharSet, Identity) { + // Hello string. + StringRef Src(HelloA); + SmallString<64> Dst; + + CharSetConverter Conv = CharSetConverter::create(CharSetConverter::CS_LATIN1, + CharSetConverter::CS_LATIN1); + std::error_code EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(HelloA, static_cast(Dst).c_str()); + + // ABC string. 
+ Src = ABCStrA; + Dst.clear(); + EC = Conv.convert(Src, Dst); + EXPECT_TRUE(!EC); + EXPECT_STREQ(ABCStrA, static_cast(Dst).c_str()); +} + +TEST(CharSet, RoundTrip) { + ErrorOr ConvToUTF16 = + CharSetConverter::create("IBM-1047", "UTF-16"); + // Stop test if conversion is not supported (no underlying iconv support). + if (!ConvToUTF16) { + ASSERT_EQ(ConvToUTF16.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + ErrorOr ConvToUTF32 = + CharSetConverter::create("UTF-16", "UTF-32"); + // Stop test if conversion is not supported (no underlying iconv support). + if (!ConvToUTF32) { + ASSERT_EQ(ConvToUTF32.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + ErrorOr ConvToEBCDIC = + CharSetConverter::create("UTF-32", "IBM-1047"); + // Stop test if conversion is not supported (no underlying iconv support). + if (!ConvToEBCDIC) { + ASSERT_EQ(ConvToEBCDIC.getError(), + std::make_error_code(std::errc::invalid_argument)); + return; + } + + // Setup source string. + char SrcStr[256]; + for (size_t I = 0; I < 256; ++I) + SrcStr[I] = (I + 1) % 256; + + SmallString<99> Dst1Str, Dst2Str, Dst3Str; + + std::error_code EC = ConvToUTF16->convert(StringRef(SrcStr), Dst1Str); + EXPECT_TRUE(!EC); + EC = ConvToUTF32->convert(Dst1Str, Dst2Str); + EXPECT_TRUE(!EC); + EC = ConvToEBCDIC->convert(Dst2Str, Dst3Str); + EXPECT_TRUE(!EC); + EXPECT_STREQ(SrcStr, static_cast(Dst3Str).c_str()); +} + +} // namespace