Skip to content

Commit 7423f40

Browse files
author
Marianne Mailhot-Sarrasin
committedMar 11, 2016
More UTF string conversion wrappers
Added new string conversion wrappers that convert between `std::string` (of UTF-8 bytes) and `std::wstring`, which is particularly useful for Win32 interop. Also fixed a missing string conversion for `getenv` on Win32, using these new wrappers. The motivation behind this is to provide the support functions required for LLDB to work properly on Windows with non-ASCII data; however, the functions are not LLDB-specific. Patch by cameron314. Differential Revision: http://reviews.llvm.org/D17549 (llvm-svn: 263247).
1 parent 47c3a47 commit 7423f40

File tree

4 files changed

+162
-3
lines changed

4 files changed

+162
-3
lines changed
 

‎llvm/include/llvm/Support/ConvertUTF.h

+28
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,25 @@ namespace llvm {
197197
bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
198198
char *&ResultPtr, const UTF8 *&ErrorPtr);
199199

200+
/**
201+
* Converts a UTF-8 StringRef to a std::wstring.
202+
* \return true on success.
203+
*/
204+
bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
205+
206+
/**
207+
* Converts a UTF-8 C-string to a std::wstring.
208+
* \return true on success.
209+
*/
210+
bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
211+
212+
/**
213+
* Converts a std::wstring to a UTF-8 encoded std::string.
214+
* \return true on success.
215+
*/
216+
bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
217+
218+
200219
/**
201220
* Convert a Unicode code point to a UTF-8 sequence.
202221
*
@@ -251,6 +270,15 @@ bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
251270
*/
252271
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
253272

273+
/**
274+
* Converts a UTF-16 string into a UTF-8 std::string.
275+
*
276+
* \param [in] Src A buffer of UTF-16 encoded text.
277+
* \param [out] Out Converted UTF-8 is stored here on success.
278+
* \returns true on success
279+
*/
280+
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
281+
254282
/**
255283
* Converts a UTF-8 string into a UTF-16 string with native endianness.
256284
*

‎llvm/lib/Support/CommandLine.cpp

+19
Original file line numberDiff line numberDiff line change
@@ -787,9 +787,28 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
787787
assert(envVar && "Environment variable name missing");
788788

789789
// Get the environment variable they want us to parse options out of.
790+
#ifdef _WIN32
791+
std::wstring wenvVar;
792+
if (!llvm::ConvertUTF8toWide(envVar, wenvVar)) {
793+
assert(false &&
794+
"Unicode conversion of environment variable name failed");
795+
return;
796+
}
797+
const wchar_t *wenvValue = _wgetenv(wenvVar.c_str());
798+
if (!wenvValue)
799+
return;
800+
std::string envValueBuffer;
801+
if (!llvm::convertWideToUTF8(wenvValue, envValueBuffer)) {
802+
assert(false &&
803+
"Unicode conversion of environment variable value failed");
804+
return;
805+
}
806+
const char *envValue = envValueBuffer.c_str();
807+
#else
790808
const char *envValue = getenv(envVar);
791809
if (!envValue)
792810
return;
811+
#endif
793812

794813
// Get program's "name", which we wouldn't know without the caller
795814
// telling us.

‎llvm/lib/Support/ConvertUTFWrapper.cpp

+79-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
//===----------------------------------------------------------------------===//
99

1010
#include "llvm/Support/ConvertUTF.h"
11+
#include "llvm/Support/ErrorHandling.h"
1112
#include "llvm/Support/SwapByteOrder.h"
1213
#include <string>
1314
#include <vector>
@@ -36,7 +37,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
3637
ConversionFlags flags = strictConversion;
3738
result = ConvertUTF8toUTF16(
3839
&sourceStart, sourceStart + Source.size(),
39-
&targetStart, targetStart + 2*Source.size(), flags);
40+
&targetStart, targetStart + Source.size(), flags);
4041
if (result == conversionOK)
4142
ResultPtr = reinterpret_cast<char*>(targetStart);
4243
else
@@ -49,7 +50,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
4950
ConversionFlags flags = strictConversion;
5051
result = ConvertUTF8toUTF32(
5152
&sourceStart, sourceStart + Source.size(),
52-
&targetStart, targetStart + 4*Source.size(), flags);
53+
&targetStart, targetStart + Source.size(), flags);
5354
if (result == conversionOK)
5455
ResultPtr = reinterpret_cast<char*>(targetStart);
5556
else
@@ -130,6 +131,13 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
130131
return true;
131132
}
132133

134+
// Converts a buffer of UTF-16 code units into a UTF-8 std::string.
//
// The code units are viewed as raw bytes and handed to the ArrayRef<char>
// overload, so the result (and the contents of Out) are whatever that
// overload produces.
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
  const char *RawBytes = reinterpret_cast<const char *>(Src.data());
  const size_t ByteCount = Src.size() * sizeof(UTF16);
  return convertUTF16ToUTF8String(llvm::ArrayRef<char>(RawBytes, ByteCount),
                                  Out);
}
140+
133141
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
134142
SmallVectorImpl<UTF16> &DstUTF16) {
135143
assert(DstUTF16.empty());
@@ -168,5 +176,74 @@ bool convertUTF8ToUTF16String(StringRef SrcUTF8,
168176
return true;
169177
}
170178

179+
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
180+
sizeof(wchar_t) == 4,
181+
"Expected wchar_t to be 1, 2, or 4 bytes");
182+
183+
template <typename TResult>
184+
static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
185+
TResult &Result) {
186+
// Even in the case of UTF-16, the number of bytes in a UTF-8 string is
187+
// at least as large as the number of elements in the resulting wide
188+
// string, because surrogate pairs take at least 4 bytes in UTF-8.
189+
Result.resize(Source.size() + 1);
190+
char *ResultPtr = reinterpret_cast<char *>(&Result[0]);
191+
const UTF8 *ErrorPtr;
192+
if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {
193+
Result.clear();
194+
return false;
195+
}
196+
Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]);
197+
return true;
198+
}
199+
200+
bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
201+
return ConvertUTF8toWideInternal(Source, Result);
202+
}
203+
204+
bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
205+
if (!Source) {
206+
Result.clear();
207+
return true;
208+
}
209+
return ConvertUTF8toWide(llvm::StringRef(Source), Result);
210+
}
211+
212+
bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
213+
if (sizeof(wchar_t) == 1) {
214+
const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data());
215+
const UTF8 *End =
216+
reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
217+
if (!isLegalUTF8String(&Start, End))
218+
return false;
219+
Result.resize(Source.size());
220+
memcpy(&Result[0], Source.data(), Source.size());
221+
return true;
222+
} else if (sizeof(wchar_t) == 2) {
223+
return convertUTF16ToUTF8String(
224+
llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
225+
Source.size()),
226+
Result);
227+
} else if (sizeof(wchar_t) == 4) {
228+
const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data());
229+
const UTF32 *End =
230+
reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
231+
Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
232+
UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]);
233+
UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size());
234+
if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,
235+
strictConversion) == conversionOK) {
236+
Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);
237+
return true;
238+
} else {
239+
Result.clear();
240+
return false;
241+
}
242+
} else {
243+
llvm_unreachable(
244+
"Control should never reach this point; see static_assert further up");
245+
}
246+
}
247+
171248
} // end namespace llvm
172249

‎llvm/unittests/Support/ConvertUTFTest.cpp

+36-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ TEST(ConvertUTFTest, OddLengthInput) {
5959

6060
TEST(ConvertUTFTest, Empty) {
6161
std::string Result;
62-
bool Success = convertUTF16ToUTF8String(None, Result);
62+
bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result);
6363
EXPECT_TRUE(Success);
6464
EXPECT_TRUE(Result.empty());
6565
}
@@ -80,6 +80,41 @@ TEST(ConvertUTFTest, HasUTF16BOM) {
8080
EXPECT_FALSE(HasBOM);
8181
}
8282

83+
TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
  // Src is a byte order mark followed by the look of disapproval. The code
  // units live in a real UTF16 array (native byte order) instead of a char
  // buffer cast to UTF16*, which carried no alignment guarantee.
  static const UTF16 Src[] = {0xfeff, 0x0ca0, 0x005f, 0x0ca0};
  ArrayRef<UTF16> SrcRef = makeArrayRef(Src, sizeof(Src) / sizeof(Src[0]));
  std::string Result;
  bool Success = convertUTF16ToUTF8String(SrcRef, Result);
  EXPECT_TRUE(Success);
  std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
  EXPECT_EQ(Expected, Result);
}
93+
94+
TEST(ConvertUTFTest, ConvertUTF8toWide) {
  // The look of disapproval as UTF-8 bytes, and the wide string both
  // overloads are expected to produce from it.
  static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
  const std::wstring Expected(L"\x0ca0_\x0ca0");
  std::wstring Wide;

  // C-string overload.
  const bool CStringOk =
      ConvertUTF8toWide(static_cast<const char *>(Src), Wide);
  EXPECT_TRUE(CStringOk);
  EXPECT_EQ(Expected, Wide);

  // StringRef overload, over the same 7 bytes without the terminator.
  Wide.clear();
  const bool StringRefOk = ConvertUTF8toWide(StringRef(Src, 7), Wide);
  EXPECT_TRUE(StringRefOk);
  EXPECT_EQ(Expected, Wide);
}
107+
108+
TEST(ConvertUTFTest, convertWideToUTF8) {
  // The look of disapproval as a wide literal, and its UTF-8 encoding.
  static const wchar_t Src[] = L"\x0ca0_\x0ca0";
  const std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");

  std::string Narrow;
  const bool Converted = convertWideToUTF8(Src, Narrow);

  EXPECT_TRUE(Converted);
  EXPECT_EQ(Expected, Narrow);
}
117+
83118
struct ConvertUTFResultContainer {
84119
ConversionResult ErrorCode;
85120
std::vector<unsigned> UnicodeScalars;

0 commit comments

Comments
 (0)
Please sign in to comment.