Page MenuHomePhabricator

[libc++] Speedup to_string and to_wstring for integers using stack buffer and SSO
ClosedPublic

Authored by ivafanas on Mar 9 2019, 11:20 AM.

Details

Summary

Proposed solution is to reuse to_chars - like function into buffer on stack and copy result into string / wstring. It allows to use SSO for small numbers.

Diff Detail

Event Timeline

ivafanas created this revision.Mar 9 2019, 11:20 AM

Code correctness was checked via

make llvm-check

Please, let me know if this command doesn't do what is expected according to documentation (I'm newby in clang project).

Performance tests:
http://quick-bench.com/wOBLE4yDNQp0x7RGg7ex32atgQY

In the case of quick-bench cache expired, the testing code is:

#include <string>
#include <type_traits>

namespace ref
{

template<typename S, typename P, typename V >
inline
S
as_string(P sprintf_like, S s, const typename S::value_type* fmt, V a)
{
    typedef typename S::size_type size_type;
    size_type available = s.size();
    while (true)
    {
        int status = sprintf_like(&s[0], available + 1, fmt, a);
        if ( status >= 0 )
        {
            size_type used = static_cast<size_type>(status);
            if ( used <= available )
            {
                s.resize( used );
                break;
            }
            available = used; // Assume this is advice of how much space we need.
        }
        else
            available = available * 2 + 1;
        s.resize(available);
    }
    return s;
}

template <class S, class V, bool = std::is_floating_point<V>::value>
struct initial_string;

template <class V, bool b>
struct initial_string<std::string, V, b>
{
    std::string
    operator()() const
    {
        std::string s;
        s.resize(s.capacity());
        return s;
    }
};

template <class V>
struct initial_string<std::wstring, V, false>
{
    std::wstring
    operator()() const
    {
        const size_t n = (std::numeric_limits<unsigned long long>::digits / 3)
          + ((std::numeric_limits<unsigned long long>::digits % 3) != 0)
          + 1;
        std::wstring s(n, wchar_t());
        s.resize(s.capacity());
        return s;
    }
};

template <class V>
struct initial_string<std::wstring, V, true>
{
    std::wstring
    operator()() const
    {
        std::wstring s(20, wchar_t());
        s.resize(s.capacity());
        return s;
    }
};

typedef int (*wide_printf)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...);

inline
wide_printf
get_swprintf()
{
#ifndef _LIBCPP_MSVCRT
    return swprintf;
#else
    return static_cast<int (__cdecl*)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...)>(_snwprintf);
#endif
}

std::string __attribute__ ((noinline)) to_string(int val)
{
    return as_string(snprintf, initial_string<std::string, int>()(), "%d", val);
}

std::string __attribute__ ((noinline)) to_string(unsigned val)
{
    return as_string(snprintf, initial_string<std::string, unsigned>()(), "%u", val);
}

std::string __attribute__ ((noinline)) to_string(long val)
{
    return as_string(snprintf, initial_string<std::string, long>()(), "%ld", val);
}

std::string __attribute__ ((noinline)) to_string(unsigned long val)
{
    return as_string(snprintf, initial_string<std::string, unsigned long>()(), "%lu", val);
}

std::string __attribute__ ((noinline)) to_string(long long val)
{
    return as_string(snprintf, initial_string<std::string, long long>()(), "%lld", val);
}

std::string __attribute__ ((noinline)) to_string(unsigned long long val)
{
    return as_string(snprintf, initial_string<std::string, unsigned long long>()(), "%llu", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(int val)
{
    return as_string(get_swprintf(), initial_string<std::wstring, int>()(), L"%d", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(unsigned val)
{
    return as_string(get_swprintf(), initial_string<std::wstring, unsigned>()(), L"%u", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(long val)
{
    return as_string(get_swprintf(), initial_string<std::wstring, long>()(), L"%ld", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(unsigned long val)
{
    return as_string(get_swprintf(), initial_string<std::wstring, unsigned long>()(), L"%lu", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(long long val)
{
    return as_string(get_swprintf(), initial_string<std::wstring, long long>()(), L"%lld", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(unsigned long long val)
{
    return as_string(get_swprintf(), initial_string<std::wstring, unsigned long long>()(), L"%llu", val);
}

}  // ref



///
///
///



namespace tgt
{

  template<typename S, typename P, typename V>
  inline S as_string(P sprintf_like, const typename S::value_type* fmt, V a)
  {
    constexpr size_t size = 4 * sizeof(V) + 1; // +1 for null char added by swprintf
    typename S::value_type tmp[size] = {};
    const int len = sprintf_like(tmp, size, fmt, a);
    return S(tmp, tmp + len);
  }

typedef int (*wide_printf)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...);

inline
wide_printf
get_swprintf()
{
#ifndef _LIBCPP_MSVCRT
    return swprintf;
#else
    return static_cast<int (__cdecl*)(wchar_t* __restrict, size_t, const wchar_t*__restrict, ...)>(_snwprintf);
#endif
}

std::string __attribute__ ((noinline)) to_string(int val)
{
    return as_string<std::string>(snprintf, "%d", val);
}

std::string __attribute__ ((noinline)) to_string(unsigned val)
{
    return as_string<std::string>(snprintf, "%u", val);
}

std::string __attribute__ ((noinline)) to_string(long val)
{
    return as_string<std::string>(snprintf, "%ld", val);
}

std::string __attribute__ ((noinline)) to_string(unsigned long val)
{
    return as_string<std::string>(snprintf, "%lu", val);
}

std::string __attribute__ ((noinline)) to_string(long long val)
{
    return as_string<std::string>(snprintf, "%lld", val);
}

std::string __attribute__ ((noinline)) to_string(unsigned long long val)
{
    return as_string<std::string>(snprintf, "%llu", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(int val)
{
    return as_string<std::wstring>(get_swprintf(), L"%d", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(unsigned val)
{
    return as_string<std::wstring>(get_swprintf(), L"%u", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(long val)
{
    return as_string<std::wstring>(get_swprintf(), L"%ld", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(unsigned long val)
{
    return as_string<std::wstring>(get_swprintf(), L"%lu", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(long long val)
{
    return as_string<std::wstring>(get_swprintf(), L"%lld", val);
}

std::wstring __attribute__ ((noinline)) to_wstring(unsigned long long val)
{
    return as_string<std::wstring>(get_swprintf(), L"%llu", val);
}

}  // tgt



const unsigned long long one = 1;
const unsigned long long ullmax = std::numeric_limits<unsigned long long>::max();

static void to_string_ref_1(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = ref::to_string(one);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_string_ref_1);

static void to_string_tgt_1(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = tgt::to_string(one);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_string_tgt_1);

static void to_string_ref_max(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = ref::to_string(ullmax);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_string_ref_max);

static void to_string_tgt_max(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = tgt::to_string(ullmax);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_string_tgt_max);

static void to_wstring_ref_1(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = ref::to_wstring(one);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_wstring_ref_1);

static void to_wstring_tgt_1(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = tgt::to_wstring(one);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_wstring_tgt_1);

static void to_wstring_ref_max(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = ref::to_wstring(ullmax);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_wstring_ref_max);

static void to_wstring_tgt_max(benchmark::State& state) {
  for (auto _ : state) {
    const auto x = tgt::to_wstring(ullmax);
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(to_wstring_tgt_max);
lebedev.ri retitled this revision from Speedup to_string and to_wstring for integers using stack buffer and SSO to [libc++] Speedup to_string and to_wstring for integers using stack buffer and SSO.Mar 9 2019, 11:52 AM
lebedev.ri added a project: Restricted Project.

Code correctness was checked via

make llvm-check

I don't think llvm-check build target exists. Did you mean check-llvm?
Even then, it will only check LLVM, not libc++; you'd want check-libcxx or check-all.

Please, let me know if this command doesn't do what is expected according to documentation (I'm newby in clang project).

libcxx/src/string.cpp
358–359

initial_string should preallocate sufficiently-long string already.
If it does not, it should be fixed, no?

360–361

Missing error checking.

RETURN VALUE
       Upon successful return, these functions return the number of characters printed (excluding the null byte used to end output to strings).
ed added inline comments.Mar 10 2019, 1:45 PM
libcxx/src/string.cpp
437

Considering that performance is of the essence and that we only do simple conversions (without leading whitespace, zeroes, etc.), would it make sense to handroll this for the integer cases? This is likely a lot faster than using snprintf().

ivafanas added inline comments.Mar 11 2019, 1:19 AM
libcxx/src/string.cpp
358–359

We would lost benefits from SSO if initial_string preallocates sufficient long string.

Consider the following example:

const long long one = 1;
const auto s = to_wstring(one);

Implementation with preallocation does the following:

  • Allocate 4 * sizeof(long long) * sizeof(wchar_t) bytes in the string (about 128 bytes)
  • swprintf 1 into initial_string

I.e. 1 malloc call.

Proposed solution:

  • make buffer on stack of size (4 * sizeof(long long) + 1) * sizeof(wchar_t) (about 132 bytes) (no allocation)
  • swprintf 1 into buffer (i.e. 2 symbols)
  • construct wstring from 2 symbols of temporary buffer (<-- here is SSO used, no allocation)

I.e. 0 malloc calls

360–361

size is chosen in the way that there should be enough space for integer:
4 * sizeof(V): 2^8 = 256 - 3 digits + 1 digit for minus.

GCC implementation ignores negative return values so as they knows that space is enough:
https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/ext/string_conversions.h#L99

In case of unexpected sprintf failure I will add fallback to the previous algorithm.

437

I will try to implement that and estimate performance gain.

Now that we have to_chars, can we use those facilities for to_string? (rather than duplicate them?)
[ Ok, we don't have \to_chars` for FP yet ]

mclow.lists added inline comments.Mar 12 2019, 3:43 PM
libcxx/src/string.cpp
360–361

Because of licensing issues, we aren't able to refer to GCC's source code.
Please help us do so by not posting links, snippets, etc from GCC here.

mclow.lists added a comment.EditedMar 12 2019, 4:15 PM

Now that we have to_chars, can we use those facilities for to_string? (rather than duplicate them?)

The annoying answer to this is that we don't have to_chars support for C++11, but to_string exists in C++11.

So it's more complicated than just a simple replacement.

ivafanas added inline comments.Mar 13 2019, 10:21 AM
libcxx/src/string.cpp
360–361

Ooooops, sorry, didn't know about that.

Also I didnn't found how to edit / delete comment with link to gcc repo. If there is a way, please help me.

437

Ed, I have locally implemented simple conversion and it gives significant performance boost (about 20x times when SSO plays).

There is one more question I'm looking for answer for days.

According to std::to_string documentation result for integer types must be equal to sprintf call with %i parameter (%l for long etc etc)
https://en.cppreference.com/w/cpp/string/basic_string/to_string

According to to_chars documentation to_chars is the only locale-independent formatter
https://en.cppreference.com/w/cpp/utility/to_chars

According to this note clang aims to work with the system C library
http://clang-developers.42468.n3.nabble.com/C-Standard-Library-td2127533.html

I've checked glibc, ditelibc and musl standard C lib implementations, and they provides the expected result for sprintf(s, "%i", x) calls without locale checks for thousands separator: sprintf(s, "%i", 100500) --> 100500
I've tested msvc sprintf(s, "%i", x) calls with the different locales, it also prints integers without thousands separator.

So, on the one side, performance boost is really impressive, on the other side there is no guarantee that sprintf(s, "%i", x) will always ignore thousands separator.

I want to ask your opinion as much more experienced compiler developer.
Should we use fast simple conversion or leave slow sprintf calls?

ed added inline comments.Mar 14 2019, 4:45 AM
libcxx/src/string.cpp
437

So, on the one side, performance boost is really impressive, on the other side there is no guarantee that sprintf(s, "%i", x) will always ignore thousands separator.

Even though this is POSIX and not ISO C, it seems that thousands separators are only added to integers when %'d is used (notice the apostrophe). See this page:

http://pubs.opengroup.org/onlinepubs/9699919799/functions/fprintf.html <- search for 'apostrophe'

ISO C also says the following about printf()'s 'd' conversion specifier:

The int argument is converted to signed decimal in the style [−]dddd. The precision specifies the minimum number of digits to appear; if the value being converted can be represented in fewer digits, it is expanded with leading zeros. The default precision is 1. The result of converting a zero value with a precision of zero is no characters.

Notice that the description doesn't mention anything about thousand separators.

In other words, I think it's fine to bypass sprintf() for integer literals. For floating point literals it is a different story, because formatting those is locale dependent.

ivafanas updated this revision to Diff 191026.Mar 17 2019, 10:03 AM

Updated to_string implementation for integer types. Naive algorithm is used.

ivafanas marked 2 inline comments as done.Mar 17 2019, 10:19 AM
ivafanas added inline comments.
libcxx/src/string.cpp
360–361

Hi, Roman.

to_string implementation was rewritten to naive algorithm according to Ed proposal.
There is no need to check sprintf return value anymore.

437

Thank you a lot!
Updated implementation has been uploaded to review.

It gives 5x - 18x speedup:
http://quick-bench.com/fnW_oJWH9D8BYNw-Y9mQaIYvD24

ed accepted this revision.Mar 18 2019, 1:48 AM

Thanks for working on this!

This revision is now accepted and ready to land.Mar 18 2019, 1:48 AM

Hi!

Could you please help me to complete this pull request.

I do not have commit access to clang repository.
If review is completed could you please commit changes?

In general, I'm not happy with this direction.
I think it is unwise to have two different implementations of "integer to string" functionality in the library, doing the same thing.
I'm happy with the idea of making to_string faster, but not this way.

Also, the only people who are authorized to approve changes for libc++ are: @ldionne, @EricWF and myself.

mclow.lists requested changes to this revision.Mar 20 2019, 11:16 AM

Now that revision 356585 has landed, we can use std::to_chars to implement to_string

I would suggest something like (untested code):

template <class T>
string __to_chars(T t)
{
char buf[numeric_limits<T>::digits10() + 3]; // +1 for '-' +1 for nul, and +1 for extra
to_chars_result res(buf, buf + sizeof(buf), t, 10);
return string(buf, res.ptr);
}

and then call *that* from the various to_string implementations.

This revision now requires changes to proceed.Mar 20 2019, 11:16 AM

FWIW, I also prefer Marshall's suggestion. Let's share code if we can.

Now that revision 356585 has landed, we can use std::to_chars to implement to_string

I would suggest something like (untested code):

template <class T>
string __to_chars(T t)
{
char buf[numeric_limits<T>::digits10() + 3]; // +1 for '-' +1 for nul, and +1 for extra
to_chars_result res(buf, buf + sizeof(buf), t, 10);
return string(buf, res.ptr);
}

and then call *that* from the various to_string implementations.

Good.
I will try to implement this idea and test it for performance.

Now that revision 356585 has landed, we can use std::to_chars to implement to_string

I would suggest something like (untested code):

template <class T>
string __to_chars(T t)
{
char buf[numeric_limits<T>::digits10() + 3]; // +1 for '-' +1 for nul, and +1 for extra
to_chars_result res(buf, buf + sizeof(buf), t, 10);
return string(buf, res.ptr);
}

and then call *that* from the various to_string implementations.

Hi, Marshall.

I've started implementing and testing to_string via to_chars and realized that to_chars doesn't support wchar_t unlike to_wstring.
I'm going to extend __to_chars_itoa implementation in order to support wchar_t and reuse those internal template function.

Could you please confirm that this way is ok in order not to through away the third attempt on completion.

ivafanas updated this revision to Diff 192007.Mar 23 2019, 11:20 AM
ivafanas edited the summary of this revision. (Show Details)

to_string via to_chars implementation is ready.

Performance improvement is nice:
http://quick-bench.com/fFyPrEWpB3jhCHQkLgkOAgsdaEI

ivafanas updated this revision to Diff 192008.Mar 23 2019, 11:25 AM

Diff from the corrected clang version is uploaded

ivafanas updated this revision to Diff 192019.Mar 23 2019, 7:46 PM

return back memcpy, it is faster for long integers in about 10%

Hi, Marshall.

I've started implementing and testing to_string via to_chars and realized that to_chars doesn't support wchar_t unlike to_wstring.
I'm going to extend __to_chars_itoa implementation in order to support wchar_t and reuse those internal template function.

Could you please confirm that this way is ok in order not to through away the third attempt on completion.

I would just use to_chars to make a narrow result and then widen it.

mclow.lists added inline comments.Mar 26 2019, 10:22 AM
libcxx/src/string.cpp
428

I wouldn't do this at all.
I would just _LIBCPP_ASSERT that it succeeded.

Thus getting rid of all the sprintf_like infrastructure.

ivafanas updated this revision to Diff 192659.Mar 28 2019, 9:45 AM

Simplified version is uploaded.
Yes, it is significantly shorter and cleaner.
Thank you!

Performance improvement is still nice, proposed version is 8-20x times faster than original implementation.
http://quick-bench.com/AzveFnRyMp8AcCWRODcxMhiqz3Y

ed accepted this revision.Mar 29 2019, 3:15 PM
ivafanas updated this revision to Diff 193061.Apr 1 2019, 4:04 AM

Removed struct initial_string template arguments for integers, they are not needed anymore

Hi Marshall,

Seems like this changeset is ready.
Could you please review it and land if it is ok?

Hi,

Could anyone please review this diff?

This is looking much better to me. I'll run some tests later today and confirm.

This is looking much better to me. I'll run some tests later today and confirm.

Hi!

Is there any issues I can help with the proposed to_string implementation?
I've manually run make check-cxx tests, they didn't show new failures.

Hi,

Can I help with testing / integration?
It seems like a forgotten review.

mclow.lists accepted this revision.Jun 5 2019, 12:12 PM

This looks good to me.
One of these days, we'll have to_chars for floating point as well, and we can clean this up some more.

I will commit this today.

This revision is now accepted and ready to land.Jun 5 2019, 12:12 PM
mclow.lists closed this revision.Jun 5 2019, 2:01 PM

Committed as revision 362649

Thank you!!

After this change landed I started getting odd failures with check-llvm with MSan or ASan like the following: http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-bootstrap-msan/builds/12853
/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm/test/ThinLTO/X86/dot-dumper-full-lto.ll:12:10: error: CHECK: expected string not found in input
; CHECK: subgraph cluster_4294967295
<stdin>:3:2: note: possible intended match here
subgraph cluster_0004294967295 {

The error stems from that string being generated with:

OS << "  subgraph cluster_" << std::to_string(ModId) << " {\n";

For some reason std::to_string((uint64_t)0xffffffff) used to return "4294967295" but now returns "0004294967295"; however, I couldn't understand why this was only happening with ASan/MSan. I tried to look at the libcxx implementation of how this works and it seems like it should only EVER return "0004294967295". Looking at the implementation of itoa::u64toa(), the logic for handling 9-12 digits seems incorrect, it will always add additional zeroes: for the number above b0=0 c0=42 so it runs append1 "0" append4 "0042" append4 "9496" append4 "7295".

I tried doing a local non-ASan build and building a trivial test program that does std::to_string((uint64_t)4294967295) and it gives the expected incorrect result so I guess the other bots are just not building the tests with libc++.

I tested the following change and verified that it works for the numeric example above, but don't have time to write a test and monitor for build failures so I'm going to revert this change for now. Leaving it here for whoever wants to pick it up:

--- a/libcxx/src/charconv.cpp
+++ b/libcxx/src/charconv.cpp
@@ -176,22 +176,39 @@ __u64toa(uint64_t value, char* buffer)
         const uint32_t b0 = v0 / 10000;
         const uint32_t c0 = v0 % 10000;

-        if (v0 < 1000000)
+        if (b0)
         {
-            if (v0 < 100000)
-                buffer = append1(buffer, b0);
+            if (b0 < 100)
+            {
+                if (b0 < 10)
+                    buffer = append1(buffer, b0);
+                else
+                    buffer = append2(buffer, b0);
+            }
+            else
+            {
+                if (b0 < 1000)
+                    buffer = append3(buffer, b0);
+                else
+                    buffer = append4(buffer, b0);
+            }
+        }
+
+        if (c0 < 100)
+        {
+            if (c0 < 10)
+                buffer = append1(buffer, c0);
             else
-                buffer = append2(buffer, b0);
+                buffer = append2(buffer, c0);
         }
         else
         {
-            if (v0 < 10000000)
-                buffer = append3(buffer, b0);
+            if (c0 < 1000)
+                buffer = append3(buffer, c0);
             else
-                buffer = append4(buffer, b0);
+                buffer = append4(buffer, c0);
         }

-        buffer = append4(buffer, c0);
         buffer = append4(buffer, v1 / 10000);
         buffer = append4(buffer, v1 % 10000);
     }

Vlad,

Understood, I will have a look on that.
Thanks.

Vlad, yes, you are right, std::to_chars produce unexpected result for the specific range of values.

I will try to fix that and add tests.
It is needed to make sure that std::from_chars will be able to understand both std::to_chars outputs in order to save compatibility.

I have replicated this problem with to_chars. https://bugs.llvm.org/show_bug.cgi?id=42166

I tested the following change and verified that it works for the numeric example above, but don't have time to write a test and monitor for build failures so I'm going to revert this change for now. Leaving it here for whoever wants to pick it up:

Assuming that I have applied Vlad's patch correctly, I find that it doesn't work.

to_chars(buf, buf + 100, (int64_t)0x10000000000) gives a result of 199511627776 instead of 1099511627776
(note the missing zero)

I tested the following change and verified that it works for the numeric example above, but don't have time to write a test and monitor for build failures so I'm going to revert this change for now. Leaving it here for whoever wants to pick it up:

Assuming that I have applied Vlad's patch correctly, I find that it doesn't work.

to_chars(buf, buf + 100, (int64_t)0x10000000000) gives a result of 199511627776 instead of 1099511627776
(note the missing zero)

Glad I waited for someone to implement tests! It looks like my approach of comparing against b0/c0 directly doesn't work since it won't 'see' that there need to be leading zeroes added in the c0 case, you need to use v0 directly.

Recommitted as revision 363003