Details
- Reviewers
ldionne - Group Reviewers
Restricted Project - Commits
- rG68b1035965f0: [libc++][PSTL] Add a __parallel_sort implementation to libdispatch
Diff Detail
- Repository
- rG LLVM Github Monorepo
Event Timeline
Benchmark numbers:
------------------------------------------------------------------------------------------------------------- Benchmark serial parallel ------------------------------------------------------------------------------------------------------------- BM_pstl_stable_sort_uint32_Random/1 1.93 ns 534 ns BM_pstl_stable_sort_uint32_Random/4 17.5 ns 800 ns BM_pstl_stable_sort_uint32_Random/16 118 ns 1198 ns BM_pstl_stable_sort_uint32_Random/64 776 ns 2142 ns BM_pstl_stable_sort_uint32_Random/256 4592 ns 6945 ns BM_pstl_stable_sort_uint32_Random/1024 23893 ns 31171 ns BM_pstl_stable_sort_uint32_Random/16384 563914 ns 433386 ns BM_pstl_stable_sort_uint32_Random/262144 11894672 ns 3885956 ns BM_pstl_stable_sort_uint32_Ascending/1 1.93 ns 568 ns BM_pstl_stable_sort_uint32_Ascending/4 4.25 ns 789 ns BM_pstl_stable_sort_uint32_Ascending/16 8.52 ns 1068 ns BM_pstl_stable_sort_uint32_Ascending/64 25.0 ns 1483 ns BM_pstl_stable_sort_uint32_Ascending/256 832 ns 3632 ns BM_pstl_stable_sort_uint32_Ascending/1024 6140 ns 13805 ns BM_pstl_stable_sort_uint32_Ascending/16384 194077 ns 199223 ns BM_pstl_stable_sort_uint32_Ascending/262144 4527549 ns 2033799 ns BM_pstl_stable_sort_uint32_Descending/1 2.05 ns 533 ns BM_pstl_stable_sort_uint32_Descending/4 7.75 ns 775 ns BM_pstl_stable_sort_uint32_Descending/16 56.6 ns 1061 ns BM_pstl_stable_sort_uint32_Descending/64 864 ns 1547 ns BM_pstl_stable_sort_uint32_Descending/256 4199 ns 3638 ns BM_pstl_stable_sort_uint32_Descending/1024 19518 ns 15025 ns BM_pstl_stable_sort_uint32_Descending/16384 409599 ns 249699 ns BM_pstl_stable_sort_uint32_Descending/262144 8099483 ns 2720430 ns BM_pstl_stable_sort_uint32_SingleElement/1 1.98 ns 536 ns BM_pstl_stable_sort_uint32_SingleElement/4 4.24 ns 778 ns BM_pstl_stable_sort_uint32_SingleElement/16 8.57 ns 1081 ns BM_pstl_stable_sort_uint32_SingleElement/64 25.0 ns 1464 ns BM_pstl_stable_sort_uint32_SingleElement/256 815 ns 3753 ns BM_pstl_stable_sort_uint32_SingleElement/1024 6148 ns 13789 ns BM_pstl_stable_sort_uint32_SingleElement/16384 193470 ns 200398 ns BM_pstl_stable_sort_uint32_SingleElement/262144 4593444 ns 2050435 ns BM_pstl_stable_sort_uint32_PipeOrgan/1 1.99 ns 545 ns BM_pstl_stable_sort_uint32_PipeOrgan/4 4.80 ns 765 ns BM_pstl_stable_sort_uint32_PipeOrgan/16 22.9 ns 1060 ns BM_pstl_stable_sort_uint32_PipeOrgan/64 206 ns 1526 ns BM_pstl_stable_sort_uint32_PipeOrgan/256 2528 ns 4080 ns BM_pstl_stable_sort_uint32_PipeOrgan/1024 12875 ns 15882 ns BM_pstl_stable_sort_uint32_PipeOrgan/16384 296908 ns 241196 ns BM_pstl_stable_sort_uint32_PipeOrgan/262144 6237556 ns 2797044 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/1 1.96 ns 534 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/4 4.24 ns 816 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/16 8.62 ns 1098 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/64 415 ns 1601 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/256 2744 ns 4634 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/1024 20339 ns 17328 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/16384 428328 ns 289705 ns BM_pstl_stable_sort_uint32_QuickSortAdversary/262144 8292500 ns 2970696 ns BM_pstl_stable_sort_uint64_Random/1 1.99 ns 585 ns BM_pstl_stable_sort_uint64_Random/4 17.7 ns 807 ns BM_pstl_stable_sort_uint64_Random/16 119 ns 1176 ns BM_pstl_stable_sort_uint64_Random/64 787 ns 2253 ns BM_pstl_stable_sort_uint64_Random/256 4568 ns 7001 ns BM_pstl_stable_sort_uint64_Random/1024 24210 ns 32133 ns BM_pstl_stable_sort_uint64_Random/16384 578554 ns 493008 ns BM_pstl_stable_sort_uint64_Random/262144 12190776 ns 4313737 ns BM_pstl_stable_sort_uint64_Ascending/1 1.95 ns 550 ns BM_pstl_stable_sort_uint64_Ascending/4 4.29 ns 789 ns BM_pstl_stable_sort_uint64_Ascending/16 8.70 ns 1044 ns BM_pstl_stable_sort_uint64_Ascending/64 25.3 ns 1506 ns BM_pstl_stable_sort_uint64_Ascending/256 850 ns 3718 ns BM_pstl_stable_sort_uint64_Ascending/1024 6313 ns 13998 ns BM_pstl_stable_sort_uint64_Ascending/16384 198880 ns 262187 ns BM_pstl_stable_sort_uint64_Ascending/262144 4857250 ns 2386621 ns BM_pstl_stable_sort_uint64_Descending/1 1.98 ns 534 ns BM_pstl_stable_sort_uint64_Descending/4 7.88 ns 780 ns BM_pstl_stable_sort_uint64_Descending/16 57.4 ns 1048 ns BM_pstl_stable_sort_uint64_Descending/64 863 ns 1634 ns BM_pstl_stable_sort_uint64_Descending/256 4166 ns 3778 ns BM_pstl_stable_sort_uint64_Descending/1024 19769 ns 15531 ns BM_pstl_stable_sort_uint64_Descending/16384 408186 ns 327655 ns BM_pstl_stable_sort_uint64_Descending/262144 8160153 ns 3131787 ns BM_pstl_stable_sort_uint64_SingleElement/1 1.97 ns 541 ns BM_pstl_stable_sort_uint64_SingleElement/4 4.28 ns 797 ns BM_pstl_stable_sort_uint64_SingleElement/16 8.69 ns 1038 ns BM_pstl_stable_sort_uint64_SingleElement/64 25.3 ns 1514 ns BM_pstl_stable_sort_uint64_SingleElement/256 851 ns 3719 ns BM_pstl_stable_sort_uint64_SingleElement/1024 6320 ns 13969 ns BM_pstl_stable_sort_uint64_SingleElement/16384 197247 ns 270237 ns BM_pstl_stable_sort_uint64_SingleElement/262144 4851639 ns 2421014 ns BM_pstl_stable_sort_uint64_PipeOrgan/1 1.96 ns 536 ns BM_pstl_stable_sort_uint64_PipeOrgan/4 5.06 ns 774 ns BM_pstl_stable_sort_uint64_PipeOrgan/16 22.8 ns 1046 ns BM_pstl_stable_sort_uint64_PipeOrgan/64 204 ns 1557 ns BM_pstl_stable_sort_uint64_PipeOrgan/256 2517 ns 4156 ns BM_pstl_stable_sort_uint64_PipeOrgan/1024 12857 ns 16116 ns BM_pstl_stable_sort_uint64_PipeOrgan/16384 305902 ns 307511 ns BM_pstl_stable_sort_uint64_PipeOrgan/262144 6482625 ns 3157936 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/1 1.96 ns 540 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/4 4.27 ns 895 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/16 8.77 ns 1040 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/64 415 ns 1658 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/256 2762 ns 4587 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/1024 20056 ns 17438 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/16384 432698 ns 387232 ns BM_pstl_stable_sort_uint64_QuickSortAdversary/262144 8477346 ns 3282792 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/1 17.4 ns 565 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/4 35.5 ns 876 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/16 202 ns 1383 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/64 1214 ns 2819 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/256 6523 ns 10135 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/1024 33622 ns 46725 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/16384 763475 ns 643777 ns BM_pstl_stable_sort_pair<uint32, uint32>_Random/262144 15756837 ns 5658928 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/1 17.1 ns 561 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/4 26.2 ns 851 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/16 57.1 ns 1237 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/64 396 ns 2035 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/256 2293 ns 6025 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/1024 12738 ns 24016 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/16384 319562 ns 393215 ns BM_pstl_stable_sort_pair<uint32, uint32>_Ascending/262144 7093596 ns 3672095 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/1 17.0 ns 562 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/4 26.3 ns 880 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/16 97.0 ns 1218 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/64 548 ns 2064 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/256 2968 ns 5648 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/1024 15541 ns 23177 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/16384 360516 ns 412686 ns BM_pstl_stable_sort_pair<uint32, uint32>_Descending/262144 7714000 ns 3346048 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/1 17.0 ns 582 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/4 27.2 ns 844 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/16 64.5 ns 1245 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/64 445 ns 2115 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/256 2625 ns 6361 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/1024 14423 ns 26096 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/16384 355161 ns 473513 ns BM_pstl_stable_sort_pair<uint32, uint32>_SingleElement/262144 7851652 ns 4256463 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/1 17.0 ns 575 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/4 25.9 ns 860 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/16 73.0 ns 1247 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/64 460 ns 2068 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/256 2595 ns 6562 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/1024 13828 ns 26194 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/16384 336549 ns 436980 ns BM_pstl_stable_sort_pair<uint32, uint32>_PipeOrgan/262144 7377147 ns 4021613 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/1 17.0 ns 569 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/4 27.9 ns 861 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/16 77.9 ns 1251 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/64 545 ns 2542 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/256 3276 ns 7132 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/1024 18322 ns 30304 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/16384 462334 ns 525712 ns BM_pstl_stable_sort_pair<uint32, uint32>_QuickSortAdversary/262144 10032232 ns 4340623 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/1 18.7 ns 563 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/4 43.6 ns 906 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/16 249 ns 1479 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/64 1409 ns 3153 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/256 7587 ns 11542 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/1024 38372 ns 51703 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/16384 873886 ns 722067 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Random/262144 18441289 ns 7160888 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/1 18.4 ns 587 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/4 27.5 ns 947 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/16 98.0 ns 1315 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/64 417 ns 2395 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/256 2332 ns 7074 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/1024 12245 ns 28658 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/16384 296193 ns 584751 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Ascending/262144 6857146 ns 5269052 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/1 18.4 ns 584 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/4 27.5 ns 905 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/16 151 ns 1333 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/64 601 ns 2242 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/256 3113 ns 6811 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/1024 15251 ns 28626 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/16384 345008 ns 565317 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_Descending/262144 7446533 ns 4839819 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/1 18.8 ns 580 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/4 29.5 ns 867 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/16 106 ns 1359 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/64 480 ns 2351 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/256 2672 ns 7546 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/1024 14060 ns 32064 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/16384 335052 ns 620179 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_SingleElement/262144 7655247 ns 6110748 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/1 18.7 ns 569 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/4 27.8 ns 860 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/16 120 ns 1304 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/64 509 ns 2206 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/256 2656 ns 7267 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/1024 13435 ns 30041 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/16384 318072 ns 568759 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_PipeOrgan/262144 7258031 ns 5375570 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/1 18.8 ns 568 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/4 29.4 ns 887 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/16 120 ns 1337 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/64 614 ns 2358 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/256 3339 ns 8096 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/1024 17214 ns 37114 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/16384 407828 ns 683645 ns BM_pstl_stable_sort_tuple<uint32, uint64, uint32>_QuickSortAdversary/262144 9226697 ns 6520879 ns BM_pstl_stable_sort_string_Random/1 18.2 ns 573 ns BM_pstl_stable_sort_string_Random/4 71.8 ns 963 ns BM_pstl_stable_sort_string_Random/16 451 ns 1650 ns BM_pstl_stable_sort_string_Random/64 2815 ns 4853 ns BM_pstl_stable_sort_string_Random/256 16983 ns 22339 ns BM_pstl_stable_sort_string_Random/1024 88698 ns 98363 ns BM_pstl_stable_sort_string_Random/16384 2105551 ns 1211421 ns BM_pstl_stable_sort_string_Random/262144 54640692 ns 28420500 ns BM_pstl_stable_sort_string_Ascending/1 18.3 ns 567 ns BM_pstl_stable_sort_string_Ascending/4 34.9 ns 909 ns BM_pstl_stable_sort_string_Ascending/16 120 ns 1376 ns BM_pstl_stable_sort_string_Ascending/64 587 ns 2445 ns BM_pstl_stable_sort_string_Ascending/256 4323 ns 8777 ns BM_pstl_stable_sort_string_Ascending/1024 21786 ns 37000 ns BM_pstl_stable_sort_string_Ascending/16384 509321 ns 661146 ns BM_pstl_stable_sort_string_Ascending/262144 16427214 ns 14593143 ns BM_pstl_stable_sort_string_Descending/1 18.6 ns 589 ns BM_pstl_stable_sort_string_Descending/4 39.8 ns 935 ns BM_pstl_stable_sort_string_Descending/16 252 ns 1380 ns BM_pstl_stable_sort_string_Descending/64 1129 ns 2542 ns BM_pstl_stable_sort_string_Descending/256 6668 ns 9051 ns BM_pstl_stable_sort_string_Descending/1024 32704 ns 39067 ns BM_pstl_stable_sort_string_Descending/16384 708264 ns 666882 ns BM_pstl_stable_sort_string_Descending/262144 21184333 ns 12753673 ns BM_pstl_stable_sort_string_SingleElement/1 18.3 ns 563 ns BM_pstl_stable_sort_string_SingleElement/4 43.4 ns 917 ns BM_pstl_stable_sort_string_SingleElement/16 159 ns 1453 ns BM_pstl_stable_sort_string_SingleElement/64 824 ns 2737 ns BM_pstl_stable_sort_string_SingleElement/256 4574 ns 9319 ns BM_pstl_stable_sort_string_SingleElement/1024 23883 ns 39474 ns BM_pstl_stable_sort_string_SingleElement/16384 564291 ns 734450 ns BM_pstl_stable_sort_string_SingleElement/262144 12667527 ns 8225023 ns BM_pstl_stable_sort_string_PipeOrgan/1 18.5 ns 573 ns BM_pstl_stable_sort_string_PipeOrgan/4 35.4 ns 977 ns BM_pstl_stable_sort_string_PipeOrgan/16 185 ns 1375 ns BM_pstl_stable_sort_string_PipeOrgan/64 860 ns 2527 ns BM_pstl_stable_sort_string_PipeOrgan/256 5302 ns 9320 ns BM_pstl_stable_sort_string_PipeOrgan/1024 26029 ns 40149 ns BM_pstl_stable_sort_string_PipeOrgan/16384 575983 ns 670767 ns BM_pstl_stable_sort_string_PipeOrgan/262144 18165333 ns 15620318 ns BM_pstl_stable_sort_string_QuickSortAdversary/1 18.2 ns 563 ns BM_pstl_stable_sort_string_QuickSortAdversary/4 71.7 ns 936 ns BM_pstl_stable_sort_string_QuickSortAdversary/16 449 ns 1658 ns BM_pstl_stable_sort_string_QuickSortAdversary/64 2809 ns 4777 ns BM_pstl_stable_sort_string_QuickSortAdversary/256 17149 ns 22100 ns BM_pstl_stable_sort_string_QuickSortAdversary/1024 88657 ns 97570 ns BM_pstl_stable_sort_string_QuickSortAdversary/16384 2091408 ns 1217799 ns BM_pstl_stable_sort_string_QuickSortAdversary/262144 53567200 ns 29541200 ns BM_pstl_stable_sort_float_Random/1 1.95 ns 539 ns BM_pstl_stable_sort_float_Random/4 22.5 ns 804 ns BM_pstl_stable_sort_float_Random/16 147 ns 1294 ns BM_pstl_stable_sort_float_Random/64 934 ns 2539 ns BM_pstl_stable_sort_float_Random/256 6600 ns 9269 ns BM_pstl_stable_sort_float_Random/1024 35831 ns 43842 ns BM_pstl_stable_sort_float_Random/16384 844270 ns 585408 ns BM_pstl_stable_sort_float_Random/262144 18131421 ns 5370938 ns BM_pstl_stable_sort_float_Ascending/1 1.96 ns 539 ns BM_pstl_stable_sort_float_Ascending/4 4.25 ns 783 ns BM_pstl_stable_sort_float_Ascending/16 8.59 ns 1107 ns BM_pstl_stable_sort_float_Ascending/64 25.2 ns 1555 ns BM_pstl_stable_sort_float_Ascending/256 1200 ns 4571 ns BM_pstl_stable_sort_float_Ascending/1024 9188 ns 19242 ns BM_pstl_stable_sort_float_Ascending/16384 287505 ns 286224 ns BM_pstl_stable_sort_float_Ascending/262144 6861088 ns 2885462 ns BM_pstl_stable_sort_float_Descending/1 1.96 ns 540 ns BM_pstl_stable_sort_float_Descending/4 7.69 ns 794 ns BM_pstl_stable_sort_float_Descending/16 56.4 ns 1109 ns BM_pstl_stable_sort_float_Descending/64 800 ns 1672 ns BM_pstl_stable_sort_float_Descending/256 4309 ns 4345 ns BM_pstl_stable_sort_float_Descending/1024 21579 ns 19584 ns BM_pstl_stable_sort_float_Descending/16384 490209 ns 328602 ns BM_pstl_stable_sort_float_Descending/262144 10140493 ns 3444498 ns BM_pstl_stable_sort_float_SingleElement/1 2.27 ns 549 ns BM_pstl_stable_sort_float_SingleElement/4 4.51 ns 777 ns BM_pstl_stable_sort_float_SingleElement/16 8.50 ns 1112 ns BM_pstl_stable_sort_float_SingleElement/64 25.2 ns 1567 ns BM_pstl_stable_sort_float_SingleElement/256 1206 ns 4558 ns BM_pstl_stable_sort_float_SingleElement/1024 9163 ns 19022 ns BM_pstl_stable_sort_float_SingleElement/16384 287924 ns 281133 ns BM_pstl_stable_sort_float_SingleElement/262144 6857598 ns 3049492 ns BM_pstl_stable_sort_float_PipeOrgan/1 1.96 ns 562 ns BM_pstl_stable_sort_float_PipeOrgan/4 4.86 ns 832 ns BM_pstl_stable_sort_float_PipeOrgan/16 23.4 ns 1151 ns BM_pstl_stable_sort_float_PipeOrgan/64 204 ns 1621 ns BM_pstl_stable_sort_float_PipeOrgan/256 2771 ns 5180 ns BM_pstl_stable_sort_float_PipeOrgan/1024 15168 ns 21905 ns BM_pstl_stable_sort_float_PipeOrgan/16384 384810 ns 336797 ns BM_pstl_stable_sort_float_PipeOrgan/262144 8402687 ns 3817686 ns BM_pstl_stable_sort_float_QuickSortAdversary/1 1.96 ns 589 ns BM_pstl_stable_sort_float_QuickSortAdversary/4 4.26 ns 801 ns BM_pstl_stable_sort_float_QuickSortAdversary/16 8.53 ns 1123 ns BM_pstl_stable_sort_float_QuickSortAdversary/64 417 ns 1735 ns BM_pstl_stable_sort_float_QuickSortAdversary/256 3250 ns 5725 ns BM_pstl_stable_sort_float_QuickSortAdversary/1024 23423 ns 23649 ns BM_pstl_stable_sort_float_QuickSortAdversary/16384 525503 ns 416050 ns BM_pstl_stable_sort_float_QuickSortAdversary/262144 10546348 ns 3761952 ns
These don't look that great currently. I think we can improve this a lot by
- improving the chunk size. Currently, for smaller ranges we don't do anything in the first step, since all chunks are 1 element in size
- Don't wait every time until all the chunks are ready. If the first two chunks are merged, we can start merging them. This requires a different API though.
- Don't inplace_merge serially. This would be trivial if we don't sync all the steps.
- Avoid initializing the temporary buffer before we need to
libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h | ||
---|---|---|
251–257 | We could remove this and instead use something like: template <class _Tp> _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _Tp* __relocate_at(_Tp* __location, type_identity_t<_Tp>&& __object) { _Tp* __result = std::construct_at(__location, std::move(__object)); std::destroy_at(std::addressof(__object)); return __result; } And then: template <class _Compare, class _InputIterator1, class _InputIterator2, class _OutputIterator, class _CopyOrRelocate, class _CopyOrRelocateN> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator __merge(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2, _OutputIterator __result, _Compare __comp, _CopyOrRelocate __copy, _CopyOrRelocateN __copy_n) { for (; __first1 != __last1; ++__result) { if (__first2 == __last2) return __copy(__first1, __last1, __result); if (__comp(*__first2, *__first1)) { __copy_n(__first2, 1, __result); ++__first2; } else { __copy_n(__first1, 1, __result); ++__first1; } } return __copy(__first2, __last2, __result); } And we can pass it __relocate and __relocate_n in our case, and __copy and __copy_n in the normal case. | |
libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp | ||
25 | Let's split this into a separate patch. While we're at it, let's do a quick pass over all the algorithm tests to make sure we're testing the pstl version. | |
26 | Let's also add tests with types that are not default constructible. |
libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h | ||
---|---|---|
251–257 | Tracked as https://github.com/llvm/llvm-project/issues/63928 to avoid blocking this patch. Please leave a ref to that bug report in a comment. |
We could remove this and instead use something like:
And then:
And we can pass it __relocate and __relocate_n in our case, and __copy and __copy_n in the normal case.