Proposed solution is to reuse to_chars - like function into buffer on stack and copy result into string / wstring. It allows to use SSO for small numbers.
Details
Diff Detail
Event Timeline
Code correctness was checked via
make llvm-check
Please, let me know if this command doesn't do what is expected according to documentation (I'm newby in clang project).
Performance tests:
http://quick-bench.com/wOBLE4yDNQp0x7RGg7ex32atgQY
In the case of quick-bench cache expired, the testing code is:
#include <string> #include <type_traits> namespace ref { template<typename S, typename P, typename V > inline S as_string(P sprintf_like, S s, const typename S::value_type* fmt, V a) { typedef typename S::size_type size_type; size_type available = s.size(); while (true) { int status = sprintf_like(&s[0], available + 1, fmt, a); if ( status >= 0 ) { size_type used = static_cast<size_type>(status); if ( used <= available ) { s.resize( used ); break; } available = used; // Assume this is advice of how much space we need. } else available = available * 2 + 1; s.resize(available); } return s; } template <class S, class V, bool = std::is_floating_point<V>::value> struct initial_string; template <class V, bool b> struct initial_string<std::string, V, b> { std::string operator()() const { std::string s; s.resize(s.capacity()); return s; } }; template <class V> struct initial_string<std::wstring, V, false> { std::wstring operator()() const { const size_t n = (std::numeric_limits<unsigned long long>::digits / 3) + ((std::numeric_limits<unsigned long long>::digits % 3) != 0) + 1; std::wstring s(n, wchar_t()); s.resize(s.capacity()); return s; } }; template <class V> struct initial_string<std::wstring, V, true> { std::wstring operator()() const { std::wstring s(20, wchar_t()); s.resize(s.capacity()); return s; } }; typedef int (*wide_printf)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...); inline wide_printf get_swprintf() { #ifndef _LIBCPP_MSVCRT return swprintf; #else return static_cast<int (__cdecl*)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...)>(_snwprintf); #endif } std::string __attribute__ ((noinline)) to_string(int val) { return as_string(snprintf, initial_string<std::string, int>()(), "%d", val); } std::string __attribute__ ((noinline)) to_string(unsigned val) { return as_string(snprintf, initial_string<std::string, unsigned>()(), "%u", val); } std::string __attribute__ ((noinline)) to_string(long val) { return as_string(snprintf, initial_string<std::string, long>()(), "%ld", val); } std::string __attribute__ ((noinline)) to_string(unsigned long val) { return as_string(snprintf, initial_string<std::string, unsigned long>()(), "%lu", val); } std::string __attribute__ ((noinline)) to_string(long long val) { return as_string(snprintf, initial_string<std::string, long long>()(), "%lld", val); } std::string __attribute__ ((noinline)) to_string(unsigned long long val) { return as_string(snprintf, initial_string<std::string, unsigned long long>()(), "%llu", val); } std::wstring __attribute__ ((noinline)) to_wstring(int val) { return as_string(get_swprintf(), initial_string<std::wstring, int>()(), L"%d", val); } std::wstring __attribute__ ((noinline)) to_wstring(unsigned val) { return as_string(get_swprintf(), initial_string<std::wstring, unsigned>()(), L"%u", val); } std::wstring __attribute__ ((noinline)) to_wstring(long val) { return as_string(get_swprintf(), initial_string<std::wstring, long>()(), L"%ld", val); } std::wstring __attribute__ ((noinline)) to_wstring(unsigned long val) { return as_string(get_swprintf(), initial_string<std::wstring, unsigned long>()(), L"%lu", val); } std::wstring __attribute__ ((noinline)) to_wstring(long long val) { return as_string(get_swprintf(), initial_string<std::wstring, long long>()(), L"%lld", val); } std::wstring __attribute__ ((noinline)) to_wstring(unsigned long long val) { return as_string(get_swprintf(), initial_string<std::wstring, unsigned long long>()(), L"%llu", val); } } // ref /// /// /// namespace tgt { template<typename S, typename P, typename V> inline S as_string(P sprintf_like, const typename S::value_type* fmt, V a) { constexpr size_t size = 4 * sizeof(V) + 1; // +1 for null char added by swprintf typename S::value_type tmp[size] = {}; const int len = sprintf_like(tmp, size, fmt, a); return S(tmp, tmp + len); } typedef int (*wide_printf)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...); inline wide_printf get_swprintf() { #ifndef _LIBCPP_MSVCRT return swprintf; #else return static_cast<int (__cdecl*)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...)>(_snwprintf); #endif } std::string __attribute__ ((noinline)) to_string(int val) { return as_string<std::string>(snprintf, "%d", val); } std::string __attribute__ ((noinline)) to_string(unsigned val) { return as_string<std::string>(snprintf, "%u", val); } std::string __attribute__ ((noinline)) to_string(long val) { return as_string<std::string>(snprintf, "%ld", val); } std::string __attribute__ ((noinline)) to_string(unsigned long val) { return as_string<std::string>(snprintf, "%lu", val); } std::string __attribute__ ((noinline)) to_string(long long val) { return as_string<std::string>(snprintf, "%lld", val); } std::string __attribute__ ((noinline)) to_string(unsigned long long val) { return as_string<std::string>(snprintf, "%llu", val); } std::wstring __attribute__ ((noinline)) to_wstring(int val) { return as_string<std::wstring>(get_swprintf(), L"%d", val); } std::wstring __attribute__ ((noinline)) to_wstring(unsigned val) { return as_string<std::wstring>(get_swprintf(), L"%u", val); } std::wstring __attribute__ ((noinline)) to_wstring(long val) { return as_string<std::wstring>(get_swprintf(), L"%ld", val); } std::wstring __attribute__ ((noinline)) to_wstring(unsigned long val) { return as_string<std::wstring>(get_swprintf(), L"%lu", val); } std::wstring __attribute__ ((noinline)) to_wstring(long long val) { return as_string<std::wstring>(get_swprintf(), L"%lld", val); } std::wstring __attribute__ ((noinline)) to_wstring(unsigned long long val) { return as_string<std::wstring>(get_swprintf(), L"%llu", val); } } // tgt const unsigned long long one = 1; const unsigned long long ullmax = std::numeric_limits<unsigned long long>::max(); static void to_string_ref_1(benchmark::State& state) { for (auto _ : state) { const auto x = ref::to_string(one); benchmark::DoNotOptimize(x); } } BENCHMARK(to_string_ref_1); static void to_string_tgt_1(benchmark::State& state) { for (auto _ : state) { const auto x = tgt::to_string(one); benchmark::DoNotOptimize(x); } } BENCHMARK(to_string_tgt_1); static void to_string_ref_max(benchmark::State& state) { for (auto _ : state) { const auto x = ref::to_string(ullmax); benchmark::DoNotOptimize(x); } } BENCHMARK(to_string_ref_max); static void to_string_tgt_max(benchmark::State& state) { for (auto _ : state) { const auto x = tgt::to_string(ullmax); benchmark::DoNotOptimize(x); } } BENCHMARK(to_string_tgt_max); static void to_wstring_ref_1(benchmark::State& state) { for (auto _ : state) { const auto x = ref::to_wstring(one); benchmark::DoNotOptimize(x); } } BENCHMARK(to_wstring_ref_1); static void to_wstring_tgt_1(benchmark::State& state) { for (auto _ : state) { const auto x = tgt::to_wstring(one); benchmark::DoNotOptimize(x); } } BENCHMARK(to_wstring_tgt_1); static void to_wstring_ref_max(benchmark::State& state) { for (auto _ : state) { const auto x = ref::to_wstring(ullmax); benchmark::DoNotOptimize(x); } } BENCHMARK(to_wstring_ref_max); static void to_wstring_tgt_max(benchmark::State& state) { for (auto _ : state) { const auto x = tgt::to_wstring(ullmax); benchmark::DoNotOptimize(x); } } BENCHMARK(to_wstring_tgt_max);
I don't think llvm-check build target exists. Did you mean check-llvm?
Even then, it will only check LLVM, not libc++; you'd want check-libcxx or check-all.
Please, let me know if this command doesn't do what is expected according to documentation (I'm newby in clang project).
libcxx/src/string.cpp | ||
---|---|---|
357–358 | initial_string should preallocate sufficiently-long string already. | |
359–360 | Missing error checking. RETURN VALUE Upon successful return, these functions return the number of characters printed (excluding the null byte used to end output to strings). |
libcxx/src/string.cpp | ||
---|---|---|
454 | Considering that performance is of the essence and that we only do simple conversions (without leading whitespace, zeroes, etc.), would it make sense to handroll this for the integer cases? This is likely a lot faster than using snprintf(). |
libcxx/src/string.cpp | ||
---|---|---|
357–358 | We would lost benefits from SSO if initial_string preallocates sufficient long string. Consider the following example: const long long one = 1; const auto s = to_wstring(one); Implementation with preallocation does the following:
I.e. 1 malloc call. Proposed solution:
I.e. 0 malloc calls | |
359–360 | size is chosen in the way that there should be enough space for integer: GCC implementation ignores negative return values so as they knows that space is enough: In case of unexpected sprintf failure I will add fallback to the previous algorithm. | |
454 | I will try to implement that and estimate performance gain. |
Now that we have to_chars, can we use those facilities for to_string? (rather than duplicate them?)
[ Ok, we don't have \to_chars` for FP yet ]
libcxx/src/string.cpp | ||
---|---|---|
359–360 | Because of licensing issues, we aren't able to refer to GCC's source code. |
The annoying answer to this is that we don't have to_chars support for C++11, but to_string exists in C++11.
So it's more complicated than just a simple replacement.
libcxx/src/string.cpp | ||
---|---|---|
359–360 | Ooooops, sorry, didn't know about that. Also I didnn't found how to edit / delete comment with link to gcc repo. If there is a way, please help me. | |
454 | Ed, I have locally implemented simple conversion and it gives significant performance boost (about 20x times when SSO plays). There is one more question I'm looking for answer for days. According to std::to_string documentation result for integer types must be equal to sprintf call with %i parameter (%l for long etc etc) According to to_chars documentation to_chars is the only locale-independent formatter According to this note clang aims to work with the system C library I've checked glibc, ditelibc and musl standard C lib implementations, and they provides the expected result for sprintf(s, "%i", x) calls without locale checks for thousands separator: sprintf(s, "%i", 100500) --> 100500 So, on the one side, performance boost is really impressive, on the other side there is no guarantee that sprintf(s, "%i", x) will always ignore thousands separator. I want to ask your opinion as much more experienced compiler developer. |
libcxx/src/string.cpp | ||
---|---|---|
454 |
Even though this is POSIX and not ISO C, it seems that thousands separators are only added to integers when %'d is used (notice the apostrophe). See this page: http://pubs.opengroup.org/onlinepubs/9699919799/functions/fprintf.html <- search for 'apostrophe' ISO C also says the following about printf()'s 'd' conversion specifier:
Notice that the description doesn't mention anything about thousand separators. In other words, I think it's fine to bypass sprintf() for integer literals. For floating point literals it is a different story, because formatting those is locale dependent. |
libcxx/src/string.cpp | ||
---|---|---|
359–360 | Hi, Roman. to_string implementation was rewritten to naive algorithm according to Ed proposal. | |
454 | Thank you a lot! It gives 5x - 18x speedup: |
Hi!
Could you please help me to complete this pull request.
I do not have commit access to clang repository.
If review is completed could you please commit changes?
In general, I'm not happy with this direction.
I think it is unwise to have two different implementations of "integer to string" functionality in the library, doing the same thing.
I'm happy with the idea of making to_string faster, but not this way.
Also, the only people who are authorized to approve changes for libc++ are: @ldionne, @EricWF and myself.
Now that revision 356585 has landed, we can use std::to_chars to implement to_string
I would suggest something like (untested code):
template <class T> string __to_chars(T t) { char buf[numeric_limits<T>::digits10() + 3]; // +1 for '-' +1 for nul, and +1 for extra to_chars_result res(buf, buf + sizeof(buf), t, 10); return string(buf, res.ptr); }
and then call *that* from the various to_string implementations.
Hi, Marshall.
I've started implementing and testing to_string via to_chars and realized that to_chars doesn't support wchar_t unlike to_wstring.
I'm going to extend __to_chars_itoa implementation in order to support wchar_t and reuse those internal template function.
Could you please confirm that this way is ok in order not to through away the third attempt on completion.
to_string via to_chars implementation is ready.
Performance improvement is nice:
http://quick-bench.com/fFyPrEWpB3jhCHQkLgkOAgsdaEI
Hi, Marshall.
I've started implementing and testing to_string via to_chars and realized that to_chars doesn't support wchar_t unlike to_wstring.
I'm going to extend __to_chars_itoa implementation in order to support wchar_t and reuse those internal template function.Could you please confirm that this way is ok in order not to through away the third attempt on completion.
I would just use to_chars to make a narrow result and then widen it.
libcxx/src/string.cpp | ||
---|---|---|
443 | I wouldn't do this at all. Thus getting rid of all the sprintf_like infrastructure. |
Simplified version is uploaded.
Yes, it is significantly shorter and cleaner.
Thank you!
Performance improvement is still nice, proposed version is 8-20x times faster than original implementation.
http://quick-bench.com/AzveFnRyMp8AcCWRODcxMhiqz3Y
Removed struct initial_string template arguments for integers, they are not needed anymore
Hi Marshall,
Seems like this changeset is ready.
Could you please review it and land if it is ok?
Hi!
Is there any issues I can help with the proposed to_string implementation?
I've manually run make check-cxx tests, they didn't show new failures.
This looks good to me.
One of these days, we'll have to_chars for floating point as well, and we can clean this up some more.
I will commit this today.
After this change landed I started getting odd failures with check-llvm with MSan or ASan like the following: http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-bootstrap-msan/builds/12853
/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm/test/ThinLTO/X86/dot-dumper-full-lto.ll:12:10: error: CHECK: expected string not found in input
; CHECK: subgraph cluster_4294967295
<stdin>:3:2: note: possible intended match here
subgraph cluster_0004294967295 {
The error stems from that string being generated with:
OS << " subgraph cluster_" << std::to_string(ModId) << " {\n";
For some reason std::to_string((uint64_t)0xffffffff) used to return "4294967295" but now returns "0004294967295"; however, I couldn't understand why this was only happening with ASan/MSan. I tried to look at the libcxx implementation of how this works and it seems like it should only EVER return "0004294967295". Looking at the implementation of itoa::u64toa(), the logic for handling 9-12 digits seems incorrect, it will always add additional zeroes: for the number above b0=0 c0=42 so it runs append1 "0" append4 "0042" append4 "9496" append4 "7295".
I tried doing a local non-ASan build and building a trivial test program that does std::to_string((uint64_t)4294967295) and it gives the expected incorrect result so I guess the other bots are just not building the tests with libc++.
I tested the following change and verified that it works for the numeric example above, but don't have time to write a test and monitor for build failures so I'm going to revert this change for now. Leaving it here for whoever wants to pick it up:
--- a/libcxx/src/charconv.cpp +++ b/libcxx/src/charconv.cpp @@ -176,22 +176,39 @@ __u64toa(uint64_t value, char* buffer) const uint32_t b0 = v0 / 10000; const uint32_t c0 = v0 % 10000; - if (v0 < 1000000) + if (b0) { - if (v0 < 100000) - buffer = append1(buffer, b0); + if (b0 < 100) + { + if (b0 < 10) + buffer = append1(buffer, b0); + else + buffer = append2(buffer, b0); + } + else + { + if (b0 < 1000) + buffer = append3(buffer, b0); + else + buffer = append4(buffer, b0); + } + } + + if (c0 < 100) + { + if (c0 < 10) + buffer = append1(buffer, c0); else - buffer = append2(buffer, b0); + buffer = append2(buffer, c0); } else { - if (v0 < 10000000) - buffer = append3(buffer, b0); + if (c0 < 1000) + buffer = append3(buffer, c0); else - buffer = append4(buffer, b0); + buffer = append4(buffer, c0); } - buffer = append4(buffer, c0); buffer = append4(buffer, v1 / 10000); buffer = append4(buffer, v1 % 10000); }
Vlad, yes, you are right, std::to_chars produce unexpected result for the specific range of values.
I will try to fix that and add tests.
It is needed to make sure that std::from_chars will be able to understand both std::to_chars outputs in order to save compatibility.
I have replicated this problem with to_chars. https://bugs.llvm.org/show_bug.cgi?id=42166
Assuming that I have applied Vlad's patch correctly, I find that it doesn't work.
to_chars(buf, buf + 100, (int64_t)0x10000000000) gives a result of 199511627776 instead of 1099511627776
(note the missing zero)
Glad I waited for someone to implement tests! It looks like my approach of comparing against b0/c0 directly doesn't work since it won't 'see' that there need to be leading zeroes added in the c0 case, you need to use v0 directly.
initial_string should preallocate sufficiently-long string already.
If it does not, it should be fixed, no?