From a5ac91cc047773bdf84ae8924a4f5a22e5e2abc6 Mon Sep 17 00:00:00 2001 From: andor <16273755+Andor233@users.noreply.github.com> Date: Sat, 1 Jun 2024 03:26:04 +0800 Subject: [PATCH] 1. Add _STD to defend against ADL, towards #140; 2. Use _Intermediate_result only when the intermediate type differs from the output type --- stl/inc/execution | 232 +++++++++++++++++++++++++++++++++++----------- stl/inc/vector | 18 ++-- stl/inc/xmemory | 6 +- 3 files changed, 190 insertions(+), 66 deletions(-) diff --git a/stl/inc/execution b/stl/inc/execution index 5aa3739ff5..49bb59141a 100644 --- a/stl/inc/execution +++ b/stl/inc/execution @@ -3634,6 +3634,19 @@ struct _Scan_decoupled_lookback { } } + template + void _Apply_exclusive_predecessor_origin(_Ty& _Preceding, _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) { + // apply _Preceding to [_First, _Last) and _Sum._Ref(), using _Reduce_op + _STD _Implicitly_construct_in_place_by_binary_op(_Sum._Ref(), _Reduce_op, _Preceding, _Local._Ref()); + _State.store(_Local_available | _Sum_available); + *_First = _Preceding; + +#pragma loop(ivdep) + while (++_First != _Last) { + *_First = _Reduce_op(_Preceding, _STD move(*_First)); + } + } + template void _Apply_inclusive_predecessor( _Ty& _Preceding, _FwdIt _First, _FwdIt2 _First2, const _FwdIt _Last, _BinOp _Reduce_op) { @@ -3652,6 +3665,18 @@ struct _Scan_decoupled_lookback { } } + template + void _Apply_inclusive_predecessor_origin(_Ty& _Preceding, _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) { + // apply _Preceding to [_First, _Last) and _Sum._Ref(), using _Reduce_op + _STD _Implicitly_construct_in_place_by_binary_op(_Sum._Ref(), _Reduce_op, _Preceding, _Local._Ref()); + _State.store(_Local_available | _Sum_available); + +#pragma loop(ivdep) + for (; _First != _Last; ++_First) { + *_First = _Reduce_op(_Preceding, _STD move(*_First)); + } + } + ~_Scan_decoupled_lookback() { const auto _State_bits = _State.load(memory_order_relaxed); if (_State_bits & _Sum_available) { @@
-4370,6 +4395,25 @@ _FwdIt2 _Exclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _ } } +template +_FwdIt2 _Exclusive_scan_per_chunk_origin(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val) { + // local-sum for parallel exclusive_scan; writes local sums into [_Dest + 1, _Dest + (_Last - _First)) and stores + // successor sum in _Val + // pre: _Val is *uninitialized* && _First != _Last + _STD _Construct_in_place(_Val, *_First); + for (;;) { + ++_First; + ++_Dest; + if (_First == _Last) { + return _Dest; + } + + _Ty _Tmp = _Reduce_op(_Val, *_First); // temp to enable _First == _Dest + *_Dest = _Val; + _Val = _STD move(_Tmp); + } +} + template void _Exclusive_scan_per_chunk_complete( _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val, _Ty& _Init) { @@ -4430,20 +4474,36 @@ struct _Static_partitioned_exclusive_scan2 { return _Cancellation_status::_Running; } - _STD vector<_Ty> _Intermediate_result; - - // Calculate local sum and publish to other threads - const auto _Last = _STD _Exclusive_scan_per_chunk( - _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref(), _Intermediate_result); - _Chunk->_Store_available_state(_Local_available); + // If the intermediate type is the same as the output type, we can store the binary-op result directly in the output + if constexpr (_STD is_same_v<_Ty, typename std::iterator_traits<_FwdIt2>::value_type>) { + // Calculate local sum and publish to other threads + const auto _Last = _STD _Exclusive_scan_per_chunk_origin( + _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref()); + _Chunk->_Store_available_state(_Local_available); - // Apply the predecessor overall sum to current overall sum and elements - if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly - _Chunk->_Apply_exclusive_predecessor( - _Prev_chunk->_Sum._Ref(), _Dest, _STD begin(_Intermediate_result), _Last, 
_Reduce_op); + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly + _Chunk->_Apply_exclusive_predecessor_origin(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_exclusive_predecessor_origin(_Tmp, _Dest, _Last, _Reduce_op); + } } else { - auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); - _Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _STD begin(_Intermediate_result), _Last, _Reduce_op); + _STD vector<_Ty> _Intermediate_result; + + // Calculate local sum and publish to other threads + const auto _Last = _STD _Exclusive_scan_per_chunk( + _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref(), _Intermediate_result); + _Chunk->_Store_available_state(_Local_available); + + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly + _Chunk->_Apply_exclusive_predecessor( + _Prev_chunk->_Sum._Ref(), _Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } } return _Cancellation_status::_Running; @@ -4590,23 +4650,38 @@ struct _Static_partitioned_inclusive_scan2 { return _Cancellation_status::_Running; } - // Make a vector to avoid the type of *_Dest is different with _Ty - _STD vector<_Ty> _Intermediate_result; - - // Calculate local sum and publish to other threads - const auto _Last = _STD _Inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, - _Chunk->_Local._Ref(), _No_init_tag{}, _Intermediate_result); - _Chunk->_Store_available_state(_Local_available); + // If the intermediate type is the same as 
the output type, we can store the binary-op result directly in the output + if constexpr (_STD is_same_v<_Ty, typename std::iterator_traits<_FwdIt2>::value_type>) { + // Calculate local sum and publish to other threads + const auto _Last = _STD _Inclusive_scan_per_chunk_complete( + _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref(), _No_init_tag{}); + _Chunk->_Store_available_state(_Local_available); - // Apply the predecessor overall sum to current overall sum and elements - if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly - _Chunk->_Apply_inclusive_predecessor( - _Prev_chunk->_Sum._Ref(), _Dest, _STD begin(_Intermediate_result), _Last, _Reduce_op); + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly + _Chunk->_Apply_inclusive_predecessor_origin(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_inclusive_predecessor_origin(_Tmp, _Dest, _Last, _Reduce_op); + } } else { - auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); - _Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _STD begin(_Intermediate_result), _Last, _Reduce_op); + // Make a vector to avoid the type of *_Dest is different with _Ty + _STD vector<_Ty> _Intermediate_result; + + // Calculate local sum and publish to other threads + const auto _Last = _STD _Inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, + _Chunk->_Local._Ref(), _No_init_tag{}, _Intermediate_result); + _Chunk->_Store_available_state(_Local_available); + + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly + _Chunk->_Apply_inclusive_predecessor( + _Prev_chunk->_Sum._Ref(), 
_Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } } - return _Cancellation_status::_Running; } @@ -4724,6 +4799,25 @@ _FwdIt2 _Transform_exclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, } } +template +_FwdIt2 _Transform_exclusive_scan_per_chunk_origin(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _UnaryOp _Transform_op, _Ty& _Val) { + // Local-sum for parallel transform_exclusive_scan; writes local sums into [_Dest + 1, _Dest + (_Last - _First)) and + // stores successor sum in _Val. + // pre: _Val is *uninitialized* && _First != _Last + _STD _Construct_in_place_by_transform_deref(_Val, _Transform_op, _First); + for (;;) { + ++_First; + ++_Dest; + if (_First == _Last) { + return _Dest; + } + + _Ty _Tmp = _Reduce_op(_Val, _Transform_op(*_First)); // temp to enable _First == _Dest + *_Dest = _Val; + _Val = _STD move(_Tmp); + } +} + template void _Transform_exclusive_scan_per_chunk_complete(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _UnaryOp _Transform_op, _Ty& _Val, _Ty& _Init) { @@ -4765,9 +4859,6 @@ struct _Static_partitioned_transform_exclusive_scan2 { return _Cancellation_status::_Canceled; } - // Make a vector to avoid the type of *_Dest is different with _Ty - _STD vector<_Ty> _Intermediate_result; - const auto _Chunk_number = _Key._Chunk_number; const auto _In_range = _Basis1._Get_chunk(_Key); const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number)); @@ -4789,20 +4880,38 @@ struct _Static_partitioned_transform_exclusive_scan2 { return _Cancellation_status::_Running; } - // Calculate local sum and publish to other threads - const auto _Last = _STD _Transform_exclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, - _Reduce_op, _Transform_op, _Chunk->_Local._Ref(), 
_Intermediate_result); - _Chunk->_Store_available_state(_Local_available); + // If the intermediate type is the same as the output type, we can store the binary-op result directly in the output + if constexpr (_STD is_same_v<_Ty, typename std::iterator_traits<_FwdIt2>::value_type>) { + // Calculate local sum and publish to other threads + const auto _Last = _STD _Transform_exclusive_scan_per_chunk_origin( + _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Local._Ref()); + _Chunk->_Store_available_state(_Local_available); - // Apply the predecessor overall sum to current overall sum and elements - if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly - _Chunk->_Apply_exclusive_predecessor( - _Prev_chunk->_Sum._Ref(), _Dest, _STD begin(_Intermediate_result), _Last, _Reduce_op); + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly + _Chunk->_Apply_exclusive_predecessor_origin(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_exclusive_predecessor_origin(_Tmp, _Dest, _Last, _Reduce_op); + } } else { - auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); - _Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _STD begin(_Intermediate_result), _Last, _Reduce_op); + // Make a vector to avoid the type of *_Dest is different with _Ty + _STD vector<_Ty> _Intermediate_result; + + // Calculate local sum and publish to other threads + const auto _Last = _STD _Transform_exclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, + _Reduce_op, _Transform_op, _Chunk->_Local._Ref(), _Intermediate_result); + _Chunk->_Store_available_state(_Local_available); + + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & 
_Sum_available) { // predecessor overall sum done, use directly + _Chunk->_Apply_exclusive_predecessor( + _Prev_chunk->_Sum._Ref(), _Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } } - return _Cancellation_status::_Running; } @@ -4949,23 +5058,38 @@ struct _Static_partitioned_transform_inclusive_scan2 { return _Cancellation_status::_Running; } - // Make a vector to avoid the type of *_Dest is different with _Ty - _STD vector<_Ty> _Intermediate_result; + // If the intermediate type is the same as the output type, we can store the binary-op result directly in the output + if constexpr (_STD is_same_v<_Ty, typename std::iterator_traits<_FwdIt2>::value_type>) { + // Calculate local sum and publish to other threads + const auto _Last = _STD _Transform_inclusive_scan_per_chunk_complete(_In_range._First, _In_range._Last, _Dest, + _Reduce_op, _Transform_op, _Chunk->_Local._Ref(), _No_init_tag{}); + _Chunk->_Store_available_state(_Local_available); - // Calculate local sum and publish to other threads - const auto _Last = _STD _Transform_inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, - _Reduce_op, _Transform_op, _Chunk->_Local._Ref(), _No_init_tag{}, _Intermediate_result); - _Chunk->_Store_available_state(_Local_available); - - // Apply the predecessor overall sum to current overall sum and elements - if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly - _Chunk->_Apply_inclusive_predecessor( - _Prev_chunk->_Sum._Ref(), _Dest, _STD begin(_Intermediate_result), _Last, _Reduce_op); + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly + 
_Chunk->_Apply_inclusive_predecessor_origin(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_inclusive_predecessor_origin(_Tmp, _Dest, _Last, _Reduce_op); + } } else { - auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); - _Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _STD begin(_Intermediate_result), _Last, _Reduce_op); + // Make a vector to avoid the type of *_Dest is different with _Ty + _STD vector<_Ty> _Intermediate_result; + + // Calculate local sum and publish to other threads + const auto _Last = _STD _Transform_inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, + _Reduce_op, _Transform_op, _Chunk->_Local._Ref(), _No_init_tag{}, _Intermediate_result); + _Chunk->_Store_available_state(_Local_available); + + // Apply the predecessor overall sum to current overall sum and elements + if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly + _Chunk->_Apply_inclusive_predecessor( + _Prev_chunk->_Sum._Ref(), _Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } else { + auto _Tmp = _STD _Get_lookback_sum(_Prev_chunk, _Reduce_op); + _Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _Intermediate_result.data(), _Last, _Reduce_op); + } } - return _Cancellation_status::_Running; } diff --git a/stl/inc/vector b/stl/inc/vector index e2d7e0cc60..05f10ee9a7 100644 --- a/stl/inc/vector +++ b/stl/inc/vector @@ -788,7 +788,7 @@ private: if constexpr (conjunction_v, _Uses_default_construct<_Alloc, _Ty*, _Valty...>>) { _ASAN_VECTOR_MODIFY(1); - _Construct_in_place(*_Mylast, _STD forward<_Valty>(_Val)...); + _STD _Construct_in_place(*_Mylast, _STD forward<_Valty>(_Val)...); } else { _ASAN_VECTOR_EXTEND_GUARD(static_cast(_Mylast - _My_data._Myfirst) + 1); _Alty_traits::construct(_Getal(), _Unfancy(_Mylast), _STD forward<_Valty>(_Val)...); @@ -822,27 +822,27 @@ private: const size_type _Newsize = 
_Oldsize + 1; size_type _Newcapacity = _Calculate_growth(_Newsize); - const pointer _Newvec = _Allocate_at_least_helper(_Al, _Newcapacity); + const pointer _Newvec = _STD _Allocate_at_least_helper(_Al, _Newcapacity); const pointer _Constructed_last = _Newvec + _Whereoff + 1; pointer _Constructed_first = _Constructed_last; _TRY_BEGIN - _Alty_traits::construct(_Al, _Unfancy(_Newvec + _Whereoff), _STD forward<_Valty>(_Val)...); + _Alty_traits::construct(_Al, _STD _Unfancy(_Newvec + _Whereoff), _STD forward<_Valty>(_Val)...); _Constructed_first = _Newvec + _Whereoff; if (_Whereptr == _Mylast) { // at back, provide strong guarantee if constexpr (is_nothrow_move_constructible_v<_Ty> || !is_copy_constructible_v<_Ty>) { - _Uninitialized_move(_Myfirst, _Mylast, _Newvec, _Al); + _STD _Uninitialized_move(_Myfirst, _Mylast, _Newvec, _Al); } else { - _Uninitialized_copy(_Myfirst, _Mylast, _Newvec, _Al); + _STD _Uninitialized_copy(_Myfirst, _Mylast, _Newvec, _Al); } } else { // provide basic guarantee - _Uninitialized_move(_Myfirst, _Whereptr, _Newvec, _Al); + _STD _Uninitialized_move(_Myfirst, _Whereptr, _Newvec, _Al); _Constructed_first = _Newvec; - _Uninitialized_move(_Whereptr, _Mylast, _Newvec + _Whereoff + 1, _Al); + _STD _Uninitialized_move(_Whereptr, _Mylast, _Newvec + _Whereoff + 1, _Al); } _CATCH_ALL - _Destroy_range(_Constructed_first, _Constructed_last, _Al); + _STD _Destroy_range(_Constructed_first, _Constructed_last, _Al); _Al.deallocate(_Newvec, _Newcapacity); _RERAISE; _CATCH_END @@ -2024,7 +2024,7 @@ private: _My_data._Orphan_all(); if (_Myfirst) { // destroy and deallocate old array - _Destroy_range(_Myfirst, _Mylast, _Al); + _STD _Destroy_range(_Myfirst, _Mylast, _Al); _ASAN_VECTOR_REMOVE; _Al.deallocate(_Myfirst, static_cast(_Myend - _Myfirst)); } diff --git a/stl/inc/xmemory b/stl/inc/xmemory index 011c072278..45913c396c 100644 --- a/stl/inc/xmemory +++ b/stl/inc/xmemory @@ -1905,15 +1905,15 @@ _CONSTEXPR20 _Alloc_ptr_t<_Alloc> _Uninitialized_move( // move 
[_First, _Last) to raw _Dest, using _Al // note: only called internally from elsewhere in the STL using _Ptrval = typename _Alloc::value_type*; - auto _UFirst = _Get_unwrapped(_First); - const auto _ULast = _Get_unwrapped(_Last); + auto _UFirst = _STD _Get_unwrapped(_First); + const auto _ULast = _STD _Get_unwrapped(_Last); if constexpr (conjunction_v::_Bitcopy_constructible>, _Uses_default_construct<_Alloc, _Ptrval, decltype(_STD move(*_UFirst))>>) { #if _HAS_CXX20 if (!_STD is_constant_evaluated()) #endif // _HAS_CXX20 { - _Copy_memmove(_UFirst, _ULast, _Unfancy(_Dest)); + _STD _Copy_memmove(_UFirst, _ULast, _STD _Unfancy(_Dest)); return _Dest + (_ULast - _UFirst); } }