Index: pstl/include/pstl/internal/parallel_backend_tbb.h =================================================================== --- pstl/include/pstl/internal/parallel_backend_tbb.h +++ pstl/include/pstl/internal/parallel_backend_tbb.h @@ -25,6 +25,7 @@ #include #include #include +#include #if TBB_INTERFACE_VERSION < 10000 # error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported. @@ -71,7 +72,11 @@ inline void __cancel_execution() { +#if TBB_INTERFACE_VERSION <= 12000 tbb::task::self().group()->cancel_group_execution(); +#else + tbb::task::current_context()->cancel_group_execution(); +#endif } //------------------------------------------------------------------------ @@ -413,17 +418,308 @@ //------------------------------------------------------------------------ #define _PSTL_MERGE_CUT_OFF 2000 +template +class __func_task; +template +class __root_task; + +#if TBB_INTERFACE_VERSION <= 12000 +class __task : public tbb::task +{ + public: + template + __task* + make_continuation(_Fn&& __f) + { + return new (allocate_continuation()) __func_task::type>(std::forward<_Fn>(__f)); + } + + template + __task* + make_child_of(__task* parent, _Fn&& __f) + { + return new (parent->allocate_child()) __func_task::type>(std::forward<_Fn>(__f)); + } + + template + __task* + make_additional_child_of(tbb::task* parent, _Fn&& __f) + { + return new (tbb::task::allocate_additional_child_of(*parent)) + __func_task::type>(std::forward<_Fn>(__f)); + } + + inline void + recycle_as_continuation() + { + tbb::task::recycle_as_continuation(); + } + + inline void + recycle_as_child_of(__task* parent) + { + tbb::task::recycle_as_child_of(*parent); + } + + inline void + spawn(__task* __t) + { + tbb::task::spawn(*__t); + } + + template + static inline void + spawn_root_and_wait(__root_task<_Fn>& __root) + { + tbb::task::spawn_root_and_wait(*__root._M_task); + } +}; + +template +class __func_task : public __task +{ + _Func _M_func; + + tbb::task* + execute() + { + return _M_func(this); + }; + + public: + template + __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)} + { + } + + _Func& + body() + { + return _M_func; + } +}; + +template +class __root_task +{ + tbb::task* _M_task; + + public: + template + __root_task(Args&&... args) + : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward(args)...)}} + { + } + + friend class __task; + friend class __func_task<_Func>; +}; + +#else // TBB_INTERFACE_VERSION <= 12000 +class __task : public tbb::detail::d1::task +{ + protected: + tbb::detail::d1::small_object_allocator _M_allocator{}; + tbb::detail::d1::execution_data* _M_execute_data{}; + __task* _M_parent{}; + std::atomic _M_refcount{}; + bool _M_recycle{}; + + template + __task* + allocate_func_task(_Fn&& __f) + { + assert(_M_execute_data != nullptr); + tbb::detail::d1::small_object_allocator __alloc{}; + auto __t = + __alloc.new_object<__func_task::type>>(*_M_execute_data, std::forward<_Fn>(__f)); + __t->_M_allocator = __alloc; + return __t; + } + + public: + __task* + parent() + { + return _M_parent; + } + + void + set_ref_count(int __n) + { + _M_refcount.store(__n, std::memory_order_release); + } + + template + __task* + make_continuation(_Fn&& __f) + { + auto __t = allocate_func_task(std::forward<_Fn&&>(__f)); + __t->_M_parent = _M_parent; + _M_parent = nullptr; + return __t; + } + + template + __task* + make_child_of(__task* __parent, _Fn&& __f) + { + auto __t = allocate_func_task(std::forward<_Fn&&>(__f)); + __t->_M_parent = __parent; + return __t; + } + + template + __task* + make_additional_child_of(__task* __parent, _Fn&& __f) + { + auto __t = make_child_of(__parent, std::forward<_Fn>(__f)); + assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0); + ++__parent->_M_refcount; + return __t; + } + + inline void + recycle_as_continuation() + { + _M_recycle = true; + } + + inline void + recycle_as_child_of(__task* parent) + { + _M_recycle = true; + _M_parent = parent; + } + + inline void + spawn(__task* __t) + { + assert(_M_execute_data != nullptr); + tbb::detail::d1::spawn(*__t, *_M_execute_data->context); + } + + template + static inline void + spawn_root_and_wait(__root_task<_Fn>& __root) + { + tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object, + __root._M_context); + } + + template + friend class __func_task; +}; + +template +class __func_task : public __task +{ + _Func _M_func; + + __task* + execute(tbb::detail::d1::execution_data& __ed) override + { + _M_execute_data = &__ed; + _M_recycle = false; + __task* __next = _M_func(this); + return finalize(__next); + }; + + __task* + cancel(tbb::detail::d1::execution_data& __ed) override + { + return finalize(nullptr); + } + + __task* + finalize(__task* __next) + { + bool __recycle = _M_recycle; + _M_recycle = false; + + if (__recycle) + { + return __next; + } + + auto __parent = _M_parent; + auto __alloc = _M_allocator; + auto __ed = _M_execute_data; + + this->~__func_task(); + + assert(__parent != nullptr); + assert(__parent->_M_refcount.load(std::memory_order_relaxed) > 0); + if (--__parent->_M_refcount == 0) + { + assert(__next == nullptr); + __alloc.deallocate(this, *__ed); + return __parent; + } + + return __next; + } + + friend class __root_task<_Func>; + + public: + template + __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f)) + { + } + + _Func& + body() + { + return _M_func; + } +}; + +template +class __root_task : public __task +{ + __task* + execute(tbb::detail::d1::execution_data& __ed) override + { + _M_wait_object.release(); + return nullptr; + }; + + __task* + cancel(tbb::detail::d1::execution_data& __ed) override + { + _M_wait_object.release(); + return nullptr; + } + + __func_task<_Func>* _M_func_task{}; + tbb::detail::d1::wait_context _M_wait_object{}; + tbb::task_group_context _M_context{}; + + public: + template + __root_task(Args&&... args) : _M_wait_object{1} + { + tbb::detail::d1::small_object_allocator __alloc{}; + _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward(args)...)); + _M_func_task->_M_allocator = __alloc; + _M_func_task->_M_parent = this; + _M_refcount.store(1, std::memory_order_relaxed); + } + + friend class __task; +}; +#endif // TBB_INTERFACE_VERSION <= 12000 + template -class __merge_task : public tbb::task +class __merge_func { typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1; typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2; typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType; typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType; - /*override*/ tbb::task* - execute(); _RandomAccessIterator1 _M_x_beg; _RandomAccessIterator2 _M_z_beg; @@ -529,7 +825,7 @@ }; public: - __merge_task(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp, + __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp, _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root) : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg), @@ -554,12 +850,14 @@ _y_orig = __on_off; } + __task* + operator()(__task* __self); + private: - __merge_task* - parent_merge() const + __merge_func* + parent_merge(__task* __self) const { - tbb::task* p = (_root ? nullptr : parent()); - return static_cast<__merge_task*>(p); + return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body(); } bool x_less_y() @@ -615,8 +913,8 @@ _y_orig = !_y_orig; } - tbb::task* - merge_ranges() + __task* + merge_ranges(__task* __self) { assert(_x_orig == _y_orig); //two merged subrange must be lie into the same buffer @@ -626,7 +924,7 @@ // need to merge {x} and {y} if (__n > __merge_cut_off) - return split_merging(); + return split_merging(__self); //merge to buffer if (_x_orig) @@ -634,7 +932,7 @@ _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs, _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(), __move_range_construct()); - assert(parent_merge()); //not root merging task + assert(parent_merge(__self)); //not root merging task } //merge to "origin" else @@ -656,13 +954,13 @@ return nullptr; } - tbb::task* - process_ranges() + __task* + process_ranges(__task* __self) { assert(_x_orig == _y_orig); assert(!_split); - auto p = parent_merge(); + auto p = parent_merge(__self); if (!p) { //root merging task @@ -685,7 +983,7 @@ move_y_range(); //parallel moving } // need to merge {x} and {y}. - return merge_ranges(); + return merge_ranges(__self); } //else: not root merging task (parent_merge() == NULL) //optimization, just for sort algorithm, //{x} <= {y} @@ -699,12 +997,12 @@ const auto id_range = _M_zs; p->set_odd(id_range, !_x_orig); - return merge_ranges(); + return merge_ranges(__self); } //splitting as merge task into 2 of the same level - tbb::task* - split_merging() + __task* + split_merging(__task* __self) { assert(_x_orig == _y_orig); const auto __nx = (_M_xe - _M_xs); @@ -732,43 +1030,42 @@ } auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys)); + __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort, + _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root); + __right_func._split = true; + auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func)); + __self->spawn(__merge_task); + __self->recycle_as_continuation(); - __merge_task* __right = new (tbb::task::allocate_additional_child_of(*parent())) - __merge_task(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort, _M_x_beg, - _M_z_beg, _x_orig, _y_orig, _root); - - __right->_split = true; - - tbb::task::spawn(*__right); - tbb::task::recycle_as_continuation(); _M_xe = __xm; _M_ye = __ym; _split = true; - return this; + return __self; } }; template -tbb::task* -__merge_task<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::execute() +__task* +__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>:: +operator()(__task* __self) { //a. split merge task into 2 of the same level; the special logic, //without processing(process_ranges) adjacent sub-ranges x and y if (_split) - return merge_ranges(); + return merge_ranges(__self); //b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} ) //1. x and y are in the even buffer //2. x and y are in the odd buffer if (_x_orig == _y_orig) - return process_ranges(); + return process_ranges(__self); //3. x is in even buffer, y is in the odd buffer //4. x is in odd buffer, y is in the even buffer - if (!parent_merge()) + if (!parent_merge(__self)) { //root merge task if (_x_orig) move_x_range(); @@ -788,11 +1085,11 @@ move_y_range(); } - return process_ranges(); + return process_ranges(__self); } template -class __stable_sort_task : public tbb::task +class __stable_sort_func { public: typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1; @@ -800,8 +1097,6 @@ typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType; private: - /*override*/ tbb::task* - execute(); _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg; _RandomAccessIterator2 _M_zs, _M_z_beg; _Compare _M_comp; @@ -810,22 +1105,25 @@ _SizeType _M_nsort; //zero or number of elements to be sorted for partial_sort alforithm public: - __stable_sort_task(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs, + __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs, bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort, _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg) : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp), _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort) { } + + __task* + operator()(__task* __self); }; #define _PSTL_STABLE_SORT_CUT_OFF 500 template -tbb::task* -__stable_sort_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::execute() +__task* +__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self) { - typedef __merge_task<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy, + typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy, __utils::__serial_move_merge> _MergeTaskType; @@ -835,34 +1133,27 @@ if (__n <= __sort_cut_off) { _M_leaf_sort(_M_xs, _M_xe, _M_comp); - assert(!_M_root); - - tbb::task* p = parent(); - const auto id_range = _M_xs - _M_x_beg; - return nullptr; } const _RandomAccessIterator1 __xm = _M_xs + __n / 2; const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs); const _RandomAccessIterator2 __ze = _M_zs + __n; - _MergeTaskType* __m = new (allocate_continuation()) _MergeTaskType( - _M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg, _M_zs - _M_z_beg, _M_comp, - __utils::__serial_destroy(), __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg, - /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root); - + _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg, + _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(), + __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg, + /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root)); + auto __parent = __self->make_continuation(std::move(__m)); + __parent->set_ref_count(2); + auto __right = __self->make_child_of( + __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg)); + __self->spawn(__right); + __self->recycle_as_child_of(__parent); _M_root = false; - - __m->set_ref_count(2); - auto __right = new (__m->allocate_child()) - __stable_sort_task(__xm, _M_xe, __zm, _M_root, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg); - - spawn(*__right); - recycle_as_child_of(*__m); _M_xe = __xm; - return this; + return __self; } template @@ -882,11 +1173,9 @@ if (__n > __sort_cut_off) { __buffer<_ValueType> __buf(__n); - tbb::task* root = new (tbb::task::allocate_root()) - __stable_sort_task<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>( - __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()); - tbb::task::spawn_root_and_wait(*root); - + __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{ + __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()}; + __task::spawn_root_and_wait(__root); return; } //serial sort @@ -899,10 +1188,8 @@ //------------------------------------------------------------------------ template -class __merge_task_static : public tbb::task +class __merge_func_static { - /*override*/ tbb::task* - execute(); _RandomAccessIterator1 _M_xs, _M_xe; _RandomAccessIterator2 _M_ys, _M_ye; _RandomAccessIterator3 _M_zs; @@ -910,20 +1197,23 @@ _LeafMerge _M_leaf_merge; public: - __merge_task_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, + __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp, _LeafMerge __leaf_merge) : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge) { } + + __task* + operator()(__task* __self); }; //TODO: consider usage of parallel_for with a custom blocked_range template -tbb::task* -__merge_task_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, - _LeafMerge>::execute() +__task* +__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>:: +operator()(__task* __self) { typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1; typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2; @@ -949,14 +1239,14 @@ __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp); } const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys)); - tbb::task* __right = new (tbb::task::allocate_additional_child_of(*parent())) - __merge_task_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge); - tbb::task::spawn(*__right); - tbb::task::recycle_as_continuation(); + auto __right = __self->make_additional_child_of( + __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge)); + __self->spawn(__right); + __self->recycle_as_continuation(); _M_xe = __xm; _M_ye = __ym; - return this; + return __self; } template _TaskType; - tbb::task::spawn_root_and_wait(*new (tbb::task::allocate_root()) - _TaskType(__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge)); + __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge}; + __task::spawn_root_and_wait(__root); }); } }