From f0e3c43561b3448066c199067dbebbf06ad0ab59 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 13:19:15 +0300 Subject: [PATCH 01/47] find meow of bitmap --- benchmarks/src/find_first_of.cpp | 2 +- stl/inc/__msvc_string_view.hpp | 128 +++--- stl/src/vector_algorithms.cpp | 750 +++++++++++++++++++++++++------ 3 files changed, 665 insertions(+), 215 deletions(-) diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index b81e94f6ed..eeffb9e5df 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -59,7 +59,7 @@ void bm(benchmark::State& state) { } void common_args(auto bm) { - bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2})->Args({102, 4}); + bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2})->Args({75, 85})->Args({102, 4}); bm->Args({325, 1})->Args({400, 50})->Args({1011, 11})->Args({1502, 23})->Args({3056, 7}); } diff --git a/stl/inc/__msvc_string_view.hpp b/stl/inc/__msvc_string_view.hpp index 676fedf943..58f9d64e43 100644 --- a/stl/inc/__msvc_string_view.hpp +++ b/stl/inc/__msvc_string_view.hpp @@ -29,6 +29,15 @@ extern "C" { // compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to // unanalyzable routines may modify those arrays. +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; + __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2( @@ -38,6 +47,23 @@ __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2( _STD_BEGIN +template +size_t _Find_first_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length, + const _Ty2* const _Needle, const size_t _Needle_length) noexcept { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); + if constexpr (sizeof(_Ty1) == 1) { + return ::__std_find_first_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 2) { + return ::__std_find_first_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 4) { + return ::__std_find_first_of_trivial_pos_4(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 8) { + return ::__std_find_first_of_trivial_pos_8(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size + } +} + template size_t _Find_last_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length, const _Ty2* const _Needle, const size_t _Needle_length) noexcept { @@ -764,48 +790,31 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t const auto _Hay_end = _Haystack + _Hay_size; if constexpr (_Is_implementation_handled_char_traits<_Traits>) { - if (!_STD _Is_constant_evaluated()) { - using _Elem = typename _Traits::char_type; - #if _USE_STD_VECTOR_ALGORITHMS - const bool _Try_vectorize = _Hay_size - _Start_at > _Threshold_find_first_of; - - // Additional condition for when the vectorization outperforms the table lookup - constexpr size_t _Find_first_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : sizeof(_Elem) == 8 ? 8 : 16; - - const bool _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_first_of_bitmap_threshold; -#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv - const bool _Use_bitmap = true; -#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^ - - if (_Use_bitmap) { - _String_bitmap<_Elem> _Matches; - - if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { - for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) { - if (_Matches._Match(*_Match_try)) { - return static_cast(_Match_try - _Haystack); // found a match - } - } - return static_cast(-1); // no match + if (!_STD _Is_constant_evaluated()) { + const size_t _Remaining_size = _Hay_size - _Start_at; + if (_Remaining_size >= _Threshold_find_first_of) { + size_t _Pos = _Find_first_of_pos_vectorized(_Hay_start, _Remaining_size, _Needle, _Needle_size); + if (_Pos != static_cast(-1)) { + _Pos += _Start_at; } - - // couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms + return _Pos; } + } +#endif // _USE_STD_VECTOR_ALGORITHMS -#if _USE_STD_VECTOR_ALGORITHMS - if (_Try_vectorize) { - const _Traits_ptr_t<_Traits> _Found = - _STD _Find_first_of_vectorized(_Hay_start, _Hay_end, _Needle, _Needle + _Needle_size); - - if (_Found != _Hay_end) { - return static_cast(_Found - _Haystack); // found a match - } else { - return static_cast(-1); // no match + _String_bitmap _Matches; + + if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { + for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) { + if (_Matches._Match(*_Match_try)) { + return static_cast(_Match_try - _Haystack); // found a match } } -#endif // _USE_STD_VECTOR_ALGORITHMS + return static_cast(-1); // no match } + + // couldn't put one of the characters into the bitmap, fall back to serial algorithm } for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) { @@ -829,47 +838,32 @@ constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t< const auto _Hay_start = (_STD min)(_Start_at, _Hay_size - 1); if constexpr (_Is_implementation_handled_char_traits<_Traits>) { - if (!_STD _Is_constant_evaluated()) { - using _Elem = typename _Traits::char_type; - - bool _Use_bitmap = true; + using _Elem = typename _Traits::char_type; #if _USE_STD_VECTOR_ALGORITHMS - bool _Try_vectorize = false; - - if constexpr (sizeof(_Elem) <= 2) { - _Try_vectorize = _Hay_start + 1 > _Threshold_find_first_of; - // Additional condition for when the vectorization outperforms the table lookup - constexpr size_t _Find_last_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : 8; - - _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_last_of_bitmap_threshold; + if constexpr (sizeof(_Elem) <= 2) { + if (!_STD _Is_constant_evaluated()) { + const size_t _Remaining_size = _Hay_start + 1; + if (_Remaining_size >= _Threshold_find_first_of) { + return _Find_last_of_pos_vectorized(_Haystack, _Remaining_size, _Needle, _Needle_size); + } } + } #endif // _USE_STD_VECTOR_ALGORITHMS - if (_Use_bitmap) { - _String_bitmap<_Elem> _Matches; - if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { - for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) { - if (_Matches._Match(*_Match_try)) { - return static_cast(_Match_try - _Haystack); // found a match - } - - if (_Match_try == _Haystack) { - return static_cast(-1); // at beginning, no more chance for match - } - } + _String_bitmap<_Elem> _Matches; + if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { + for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) { + if (_Matches._Match(*_Match_try)) { + return static_cast(_Match_try - _Haystack); // found a match } - // couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms - } - -#if _USE_STD_VECTOR_ALGORITHMS - if constexpr (sizeof(_Elem) <= 2) { - if (_Try_vectorize) { - return _STD _Find_last_of_pos_vectorized(_Haystack, _Hay_start + 1, _Needle, _Needle_size); + if (_Match_try == _Haystack) { + return static_cast(-1); // at beginning, no more chance for match } } -#endif // _USE_STD_VECTOR_ALGORITHMS } + + // couldn't put one of the characters into the bitmap, fall back to serial algorithm } for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) { diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index cd25f40064..986287128c 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2916,10 +2916,304 @@ namespace { return _Result; } - namespace __std_find_first_of { + namespace __std_find_meow_of { + enum class _Strategy { _No_bitmap, _Scalar_bitmap, _Vector_bitmap }; + + bool _Product_fits_threshold(const size_t _Px1, const size_t _Px2, const size_t _Tx) noexcept { +#ifdef _WIN64 + long long _Rx; + return _mul_overflow_i64(_Px1, _Px2, &_Rx) && static_cast(_Rx) < _Tx; +#else // ^^^ defined(_WIN64) / !defined(_WIN64) + unsigned int _Rx0; + unsigned int _Rx1; + return _mul_full_overflow_u32(_Px1, _Px2, &_Rx0, &_Rx1) && _Rx0 < _Tx; +#endif + } + + template + _Strategy _Pick_strategy_avx(const size_t _Count1, const size_t _Count2) noexcept { + if constexpr (sizeof(_Ty) == 1) { + if (_Count2 <= 15 || _Product_fits_threshold((_Count1 + 15) / 16, (_Count2 + 15) / 16, 60)) { + return _Strategy::_No_bitmap; + } else if (_Count1 * 1ull > _Count2 * 5ull) { + return _Strategy::_Vector_bitmap; + } else { + return _Strategy::_Scalar_bitmap; + } + } else if constexpr (sizeof(_Ty) == 2) { + if (_Count2 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 60)) { + return _Strategy::_No_bitmap; + } else if (_Count1 * 2ull > _Count2 * 5ull) { + return _Strategy::_Vector_bitmap; + } else { + return _Strategy::_Scalar_bitmap; + } + } else { + if (_Count1 <= 8) { + return _Strategy::_No_bitmap; + } + if (_Count1 > 400) { + return _Strategy::_Vector_bitmap; + } + if (_Count2 * 2 > _Count1) { + return _Strategy::_Scalar_bitmap; + } + return _Strategy::_Vector_bitmap; + } + } + } // namespace __std_find_meow_of + + namespace __std_find_meow_of::_Bitmap { + template + bool _Can_fit_256_bits_sse(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return true; + } else { + __m128i _Mask = _mm_undefined_si128(); + if constexpr (sizeof(_Ty) == 2) { + _Mask = _mm_set1_epi16(static_cast(0xFF00)); + } else if constexpr (sizeof(_Ty) == 4) { + _Mask = _mm_set1_epi32(static_cast(0xFFFF'FF00)); + } else if constexpr (sizeof(_Ty) == 8) { + _Mask = _mm_set1_epi64x(static_cast(0xFFFF'FFFF'FFFF'FF00)); + } else { + static_assert(false, "Unexpected size"); + } + + const size_t _Byte_size = _Needle_length * sizeof(_Ty); + + const void* _Stop = _Needle_ptr; + _Advance_bytes(_Stop, _Byte_size & ~size_t{0x1F}); + while (_Needle_ptr != _Stop) { + const __m128i _Data = _mm_loadu_si128(reinterpret_cast(_Needle_ptr)); + if (!_mm_testz_si128(_Mask, _Data)) { + return false; + } + + _Needle_ptr += 32 / sizeof(_Ty); + } + + _Advance_bytes(_Stop, _Byte_size & 0x1E); + while (_Needle_ptr != _Stop) { + if ((*_Needle_ptr & ~_Ty{0xFF}) != 0) { + return false; + } + + ++_Needle_ptr; + } + + return true; + } + } + + __m256i __vectorcall _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { + __m256i _Data_high = _mm256_srli_epi32(_Data, 5); + __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); + __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); + __m256i _Mask = _mm256_sllv_epi32(_Bitmap_parts, _Data_low_inv); + return _Mask; + } + + template + __m256i _Load_avx_256_8(const _Ty* const _Src) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return _mm256_cvtepu8_epi32(_mm_loadu_si64(_Src)); + } else if constexpr (sizeof(_Ty) == 2) { + return _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast(_Src))); + } else if constexpr (sizeof(_Ty) == 4) { + return _mm256_loadu_si256(reinterpret_cast(_Src)); + } else if constexpr (sizeof(_Ty) == 8) { + const __m256i _Low = _mm256_loadu_si256(reinterpret_cast(_Src)); + const __m256i _High = _mm256_loadu_si256(reinterpret_cast(_Src) + 1); + const __m256i _Pack = _mm256_packs_epi32(_Low, _High); + return _mm256_permutex_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); + } else { + static_assert(false, "Unexpected size"); + } + } + + template + __m256i _Mask_out_oveflow(const __m256i _Mask, const __m256i _Data) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return _Mask; + } else { + const __m256i _Data_high = _mm256_and_si256(_Data, _mm256_set1_epi32(static_cast(0xFFFF'FF00))); + const __m256i _Fit_mask = _mm256_cmpeq_epi32(_Data_high, _mm256_setzero_si256()); + return _mm256_and_si256(_Mask, _Fit_mask); + } + } + + template + __m256i _Make_bitmap(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + __m256i _Bitmap = _mm256_setzero_si256(); + + const _Ty* const _Stop = _Needle_ptr + _Needle_length; + + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { + const _Ty _Val = *_Needle_ptr; + const __m256i _Count_low = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(_Val & 0x3F)); + const uint32_t _One_1_high = 1u << uint32_t((_Val >> 3) & 0x18); + const __m256i _One_1_high_unp = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(_One_1_high)); + const __m256i _One_1 = _mm256_sllv_epi64(_One_1_high_unp, _Count_low); + _Bitmap = _mm256_or_si256(_Bitmap, _One_1); + } + + return _Bitmap; + } + + template + size_t _Impl_first_avx(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, + const size_t _Needle_length) noexcept { + const auto _Haystack_ptr = static_cast(_Haystack); + const auto _Needle_ptr = static_cast(_Needle); + + const __m256i _Bitmap = _Make_bitmap(_Needle_ptr, _Needle_length); + + const size_t _Haystack_length_vec = _Haystack_length & ~size_t{7}; + for (size_t _Ix = 0; _Ix != _Haystack_length_vec; _Ix += 8) { + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + if (_Bingo != 0) { + return _Ix + _tzcnt_u32(_Bingo); + } + } + + const size_t _Haystack_length_tail = _Haystack_length & 7; + if (_Haystack_length_tail != 0) { + const unsigned _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); + _Ty _Buf[8]; + memcpy(_Buf, _Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail * sizeof(_Ty)); + const __m256i _Data = _Load_avx_256_8(_Buf); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + if (_Bingo != 0) { + return _Haystack_length_vec + _tzcnt_u32(_Bingo); + } + } + + return static_cast(-1); + } + + template + size_t _Impl_last_avx(const void* const _Haystack, size_t _Haystack_length, const void* const _Needle, + const size_t _Needle_length) noexcept { + const auto _Haystack_ptr = static_cast(_Haystack); + const auto _Needle_ptr = static_cast(_Needle); + + const __m256i _Bitmap = _Make_bitmap(_Needle_ptr, _Needle_length); + + while (_Haystack_length >= 8) { + _Haystack_length -= 8; + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + if (_Bingo != 0) { + return _Haystack_length + 31 - _lzcnt_u32(_Bingo); + } + } + + const size_t _Haystack_length_tail = _Haystack_length & 7; + if (_Haystack_length_tail != 0) { + const unsigned _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); + _Ty _Buf[8]; + memcpy(_Buf, _Haystack_ptr, _Haystack_length_tail * sizeof(_Ty)); + const __m256i _Data = _Load_avx_256_8(_Buf); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + if (_Bingo != 0) { + return 31 - _lzcnt_u32(_Bingo); + } + } + + return static_cast(-1); + } + + using _Scalar_table_t = bool[256]; + + template + bool _Build_scalar_table(bool* _Table, const void* const _Needle, const size_t _Needle_length) noexcept { + auto _Ptr = static_cast(_Needle); + const auto _End = _Ptr + _Needle_length; + + for (; _Ptr != _End; ++_Ptr) { + const _Ty _Val = *_Ptr; + + if constexpr (sizeof(_Val) > 1) { + if (_Val >= 256) { + return false; + } + } + + _Table[_Val] = true; + } + + return true; + } template - const void* __stdcall _Fallback(const void* _First1, const void* const _Last1, const void* const _First2, + void _Build_scalar_table_no_check( + bool* _Table, const void* const _Needle, const size_t _Needle_length) noexcept { + auto _Ptr = static_cast(_Needle); + const auto _End = _Ptr + _Needle_length; + + for (; _Ptr != _End; ++_Ptr) { + _Table[*_Ptr] = true; + } + } + + + template + size_t _Impl_first_scalar( + const void* const _Haystack, const size_t _Haystack_length, const bool* const _Table) noexcept { + const auto _Haystack_ptr = static_cast(_Haystack); + + for (size_t _Ix = 0; _Ix != _Haystack_length; ++_Ix) { + const _Ty _Val = _Haystack_ptr[_Ix]; + + if constexpr (sizeof(_Val) > 1) { + if (_Val >= 256) { + continue; + } + } + + if (_Table[_Val]) { + return _Ix; + } + } + + return static_cast(-1); + } + + template + size_t _Impl_last_scalar( + const void* const _Haystack, size_t _Haystack_length, const bool* const _Table) noexcept { + const auto _Haystack_ptr = static_cast(_Haystack); + + while (_Haystack_length != 0) { + --_Haystack_length; + + const _Ty _Val = _Haystack_ptr[_Haystack_length]; + + if constexpr (sizeof(_Val) > 1) { + if (_Val >= 256) { + continue; + } + } + + if (_Table[_Val]) { + return _Haystack_length; + } + } + + return static_cast(-1); + } + } // namespace __std_find_meow_of::_Bitmap + + namespace __std_find_meow_of::_First { + + template + const void* _Fallback(const void* _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { auto _Ptr_haystack = static_cast(_First1); const auto _Ptr_haystack_end = static_cast(_Last1); @@ -2937,129 +3231,127 @@ namespace { return _Ptr_haystack; } - template - const void* __stdcall _Impl_pcmpestri(const void* _First1, const void* const _Last1, const void* const _First2, - const void* const _Last2) noexcept { #ifndef _M_ARM64EC - if (_Use_sse42()) { - constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY - | _SIDD_LEAST_SIGNIFICANT; - constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; - const size_t _Needle_length = _Byte_length(_First2, _Last2); - - const size_t _Haystack_length = _Byte_length(_First1, _Last1); - const void* _Stop_at = _First1; - _Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF}); + template + const void* _Impl_pcmpestri(const void* _First1, const size_t _Haystack_length, const void* const _First2, + const size_t _Needle_length) noexcept { + constexpr int _Op = + (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; - if (_Needle_length <= 16) { - // Special handling of small needle - // The generic branch could also be modified to handle it, but with slightly worse performance + const void* _Stop_at = _First1; + _Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF}); - const int _Needle_length_el = static_cast(_Needle_length / sizeof(_Ty)); + if (_Needle_length <= 16) { + // Special handling of small needle + // The generic branch could also be modified to handle it, but with slightly worse performance - alignas(16) uint8_t _Tmp2[16]; - memcpy(_Tmp2, _First2, _Needle_length); - const __m128i _Data2 = _mm_load_si128(reinterpret_cast(_Tmp2)); + const int _Needle_length_el = static_cast(_Needle_length / sizeof(_Ty)); - while (_First1 != _Stop_at) { - const __m128i _Data1 = _mm_loadu_si128(static_cast(_First1)); - if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op); - _Advance_bytes(_First1, _Pos * sizeof(_Ty)); - return _First1; - } + alignas(16) uint8_t _Tmp2[16]; + memcpy(_Tmp2, _First2, _Needle_length); + const __m128i _Data2 = _mm_load_si128(reinterpret_cast(_Tmp2)); - _Advance_bytes(_First1, 16); + while (_First1 != _Stop_at) { + const __m128i _Data1 = _mm_loadu_si128(static_cast(_First1)); + if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op); + _Advance_bytes(_First1, _Pos * sizeof(_Ty)); + return _First1; } - const size_t _Last_part_size = _Haystack_length & 0xF; - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + _Advance_bytes(_First1, 16); + } - alignas(16) uint8_t _Tmp1[16]; - memcpy(_Tmp1, _First1, _Last_part_size); - const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + const size_t _Last_part_size = _Haystack_length & 0xF; + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); - _Advance_bytes(_First1, _Pos * sizeof(_Ty)); - return _First1; - } + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _First1, _Last_part_size); + const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); - _Advance_bytes(_First1, _Last_part_size); + if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); + _Advance_bytes(_First1, _Pos * sizeof(_Ty)); return _First1; - } else { - const void* _Last_needle = _First2; - _Advance_bytes(_Last_needle, _Needle_length & ~size_t{0xF}); + } + + _Advance_bytes(_First1, _Last_part_size); + return _First1; + } else { + const void* _Last_needle = _First2; + _Advance_bytes(_Last_needle, _Needle_length & ~size_t{0xF}); - const int _Last_needle_length = static_cast(_Needle_length & 0xF); + const int _Last_needle_length = static_cast(_Needle_length & 0xF); - alignas(16) uint8_t _Tmp2[16]; - memcpy(_Tmp2, _Last_needle, _Last_needle_length); - const __m128i _Last_needle_val = _mm_load_si128(reinterpret_cast(_Tmp2)); - const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); + alignas(16) uint8_t _Tmp2[16]; + memcpy(_Tmp2, _Last_needle, _Last_needle_length); + const __m128i _Last_needle_val = _mm_load_si128(reinterpret_cast(_Tmp2)); + const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); - constexpr int _Not_found = 16; // arbitrary value greater than any found value + constexpr int _Not_found = 16; // arbitrary value greater than any found value - int _Found_pos = _Not_found; + int _Found_pos = _Not_found; - const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, - const int _Size1) noexcept { - if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); - if (_Pos < _Found_pos) { - _Found_pos = _Pos; - } + const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, + const int _Size1) noexcept { + if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); + if (_Pos < _Found_pos) { + _Found_pos = _Pos; } - }; + } + }; #pragma warning(push) #pragma warning(disable : 4324) // structure was padded due to alignment specifier - const auto _Test_whole_needle = [=](const __m128i _Data1, const int _Size1) noexcept { - const void* _Cur_needle = _First2; - do { - const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); - _Step(_Data2, _Part_size_el, _Data1, _Size1); - _Advance_bytes(_Cur_needle, 16); - } while (_Cur_needle != _Last_needle); - - if (_Last_needle_length_el != 0) { - _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); - } - }; -#pragma warning(pop) + const auto _Test_whole_needle = [=](const __m128i _Data1, const int _Size1) noexcept { + const void* _Cur_needle = _First2; + do { + const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); + _Step(_Data2, _Part_size_el, _Data1, _Size1); + _Advance_bytes(_Cur_needle, 16); + } while (_Cur_needle != _Last_needle); - while (_First1 != _Stop_at) { - _Test_whole_needle(_mm_loadu_si128(static_cast(_First1)), _Part_size_el); + if (_Last_needle_length_el != 0) { + _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); + } + }; +#pragma warning(pop) - if (_Found_pos != _Not_found) { - _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); - return _First1; - } + while (_First1 != _Stop_at) { + _Test_whole_needle(_mm_loadu_si128(static_cast(_First1)), _Part_size_el); - _Advance_bytes(_First1, 16); + if (_Found_pos != _Not_found) { + _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); + return _First1; } - const size_t _Last_part_size = _Haystack_length & 0xF; - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + _Advance_bytes(_First1, 16); + } - alignas(16) uint8_t _Tmp1[16]; - memcpy(_Tmp1, _First1, _Last_part_size); - const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + const size_t _Last_part_size = _Haystack_length & 0xF; + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - _Found_pos = _Last_part_size_el; + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _First1, _Last_part_size); + const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); - _Test_whole_needle(_Data1, _Last_part_size_el); + _Found_pos = _Last_part_size_el; - _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); - return _First1; - } + _Test_whole_needle(_Data1, _Last_part_size_el); + + _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); + return _First1; } -#endif // !_M_ARM64EC - return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2); } +#endif // !_M_ARM64EC + + template + struct _Traits; - struct _Traits_4 : _Find_traits_4 { - using _Ty = uint32_t; + template <> + struct _Traits : _Find_traits_4 { #ifndef _M_ARM64EC template static __m256i _Spread_avx(__m256i _Val, const size_t _Needle_length_el) noexcept { @@ -3104,8 +3396,8 @@ namespace { #endif // !_M_ARM64EC }; - struct _Traits_8 : _Find_traits_8 { - using _Ty = uint64_t; + template <> + struct _Traits : _Find_traits_8 { #ifndef _M_ARM64EC template static __m256i _Spread_avx(const __m256i _Val, const size_t _Needle_length_el) noexcept { @@ -3169,18 +3461,16 @@ namespace { return _Eq; } - template - const void* _Shuffle_impl(const void* _First1, const void* const _Last1, const void* const _First2, + template + const void* _Shuffle_impl(const void* _First1, const size_t _Haystack_length, const void* const _First2, const void* const _Stop2, const size_t _Last2_length_el) noexcept { - using _Ty = _Traits::_Ty; + using _Traits = _Traits<_Ty>; constexpr size_t _Length_el = 32 / sizeof(_Ty); const __m256i _Last2val = _mm256_maskload_epi32( reinterpret_cast(_Stop2), _Avx2_tail_mask_32(_Last2_length_el * (sizeof(_Ty) / 4))); const __m256i _Last2s0 = _Traits::_Spread_avx<_Last2_length_el_magnitude>(_Last2val, _Last2_length_el); - const size_t _Haystack_length = _Byte_length(_First1, _Last1); - const void* _Stop1 = _First1; _Advance_bytes(_Stop1, _Haystack_length & ~size_t{0x1F}); @@ -3226,61 +3516,189 @@ namespace { return _First1; } - template - const void* _Shuffle_impl_dispatch_magnitude(const void* const _First1, const void* const _Last1, + template + const void* _Shuffle_impl_dispatch_magnitude(const void* const _First1, const size_t _Haystack_length, const void* const _First2, const void* const _Stop2, const size_t _Last2_length_el) noexcept { if (_Last2_length_el == 0) { - return _Shuffle_impl<_Traits, _Large, 0>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 0>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el == 1) { - return _Shuffle_impl<_Traits, _Large, 1>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 1>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el == 2) { - return _Shuffle_impl<_Traits, _Large, 2>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 2>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el <= 4) { - return _Shuffle_impl<_Traits, _Large, 4>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 4>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el <= 8) { - if constexpr (sizeof(_Traits::_Ty) == 4) { - return _Shuffle_impl<_Traits, _Large, 8>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + if constexpr (sizeof(_Ty) == 4) { + return _Shuffle_impl<_Ty, _Large, 8>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } } _STL_UNREACHABLE; } + template + const void* _Impl_4_8(const void* const _First1, const size_t _Haystack_length, const void* const _First2, + const size_t _Needle_length) noexcept { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const size_t _Last_needle_length = _Needle_length & 0x1F; + const size_t _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); + + if (const size_t _Needle_length_large = _Needle_length & ~size_t{0x1F}; _Needle_length_large != 0) { + const void* _Stop2 = _First2; + _Advance_bytes(_Stop2, _Needle_length_large); + return _Shuffle_impl_dispatch_magnitude<_Ty, true>( + _First1, _Haystack_length, _First2, _Stop2, _Last_needle_length_el); + } else { + return _Shuffle_impl_dispatch_magnitude<_Ty, false>( + _First1, _Haystack_length, _First2, _First2, _Last_needle_length_el); + } + } #endif // !_M_ARM64EC - template - const void* __stdcall _Impl_4_8(const void* const _First1, const void* const _Last1, const void* const _First2, - const void* const _Last2) noexcept { - using _Ty = _Traits::_Ty; + template + const void* _Dispatch_ptr( + const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) { #ifndef _M_ARM64EC - if (_Use_avx2()) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + if constexpr (sizeof(_Ty) <= 2) { + if (_Use_sse42()) { + return _Impl_pcmpestri<_Ty>( + _First1, _Byte_length(_First1, _Last1), _First2, _Byte_length(_First2, _Last2)); + } + } else { + if (_Use_avx2()) { + return _Impl_4_8<_Ty>( + _First1, _Byte_length(_First1, _Last1), _First2, _Byte_length(_First2, _Last2)); + } + } +#endif // !_M_ARM64EC - const size_t _Needle_length = _Byte_length(_First2, _Last2); - const size_t _Last_needle_length = _Needle_length & 0x1F; - const size_t _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); + return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2); + } - if (const size_t _Needle_length_large = _Needle_length & ~size_t{0x1F}; _Needle_length_large != 0) { - const void* _Stop2 = _First2; - _Advance_bytes(_Stop2, _Needle_length_large); - return _Shuffle_impl_dispatch_magnitude<_Traits, true>( - _First1, _Last1, _First2, _Stop2, _Last_needle_length_el); - } else { - return _Shuffle_impl_dispatch_magnitude<_Traits, false>( - _First1, _Last1, _First2, _First2, _Last_needle_length_el); + template + const size_t _Pos_from_ptr(const void* const _Result, const void* const _First1, const void* const _Last1) { + if (_Result != _Last1) { + return _Byte_length(_First1, _Result) / sizeof(_Ty); + } else { + return static_cast(-1); + } + } + +#ifndef _M_ARM64EC + template + const size_t _Dispatch_pos_sse_1_2( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + const _Strategy _Strat = _Pick_strategy_avx<_Ty>(_Count1, _Count2); + + if (_Strat == _Strategy::_Vector_bitmap && _Use_avx2()) { + if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else if (_Strat != _Strategy::_No_bitmap) { + if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + _Bitmap::_Scalar_table_t _Table = {}; + _Bitmap::_Build_scalar_table_no_check<_Ty>(_Table, _First2, _Count2); + return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + } + } + + const void* _Last1 = static_cast(_First1) + _Count1; + const size_t _Size_bytes_1 = _Count1 * sizeof(_Ty); + const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); + + return _Pos_from_ptr<_Ty>( + _Impl_pcmpestri<_Ty>(_First1, _Size_bytes_1, _First2, _Size_bytes_2), _First1, _Last1); + } + + template + const size_t _Dispatch_pos_avx_4_8( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + const _Strategy _Strat = _Pick_strategy_avx<_Ty>(_Count1, _Count2); + + if (_Strat == _Strategy::_Vector_bitmap) { + if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else if (_Strat != _Strategy::_No_bitmap) { + if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + _Bitmap::_Scalar_table_t _Table = {}; + _Bitmap::_Build_scalar_table_no_check<_Ty>(_Table, _First2, _Count2); + return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } } + + const void* _Last1 = static_cast(_First1) + _Count1; + const size_t _Size_bytes_1 = _Count1 * sizeof(_Ty); + const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); + + return _Pos_from_ptr<_Ty>(_Impl_4_8<_Ty>(_First1, _Size_bytes_1, _First2, _Size_bytes_2), _First1, _Last1); + } + #endif // !_M_ARM64EC - return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2); + + template + const size_t _Dispatch_pos_fallback( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + + _Bitmap::_Scalar_table_t _Table = {}; + if (_Bitmap::_Build_scalar_table<_Ty>(_Table, _First2, _Count2)) { + return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + } + + const void* _Last1 = static_cast(_First1) + _Count1; + const void* _Last2 = static_cast(_First2) + _Count2; + + return _Pos_from_ptr<_Ty>(_Fallback<_Ty>(_First1, _Last1, _First2, _Last2), _First1, _Last1); } - } // namespace __std_find_first_of - template - size_t __stdcall __std_find_last_of_pos_impl(const void* const _Haystack, const size_t _Haystack_length, - const void* const _Needle, const size_t _Needle_length) noexcept { + template + const size_t _Dispatch_pos( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { #ifndef _M_ARM64EC - const size_t _Haystack_length_bytes = _Haystack_length * sizeof(_Ty); - if (_Use_sse42() && _Haystack_length_bytes >= 16) { + if constexpr (sizeof(_Ty) <= 2) { + if (_Use_sse42()) { + return _Dispatch_pos_sse_1_2<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else { + if (_Use_avx2()) { + return _Dispatch_pos_avx_4_8<_Ty>(_First1, _Count1, _First2, _Count2); + } + } +#endif // !_M_ARM64EC + return _Dispatch_pos_fallback<_Ty>(_First1, _Count1, _First2, _Count2); + } + } // namespace __std_find_meow_of::_First + + namespace __std_find_meow_of::_Last { + + template + size_t __stdcall _Fallback(const void* const _Haystack, const size_t _Haystack_length, + const void* const _Needle, const size_t _Needle_length) noexcept { + + const auto _Ptr_haystack = static_cast(_Haystack); + size_t _Pos = _Haystack_length; + const auto _Needle_end = static_cast(_Needle) + _Needle_length; + + while (_Pos != 0) { + --_Pos; + + for (auto _Ptr = static_cast(_Needle); _Ptr != _Needle_end; ++_Ptr) { + if (_Ptr_haystack[_Pos] == *_Ptr) { + return _Pos; + } + } + } + + return static_cast(-1); + } + +#ifndef _M_ARM64EC + template + size_t _Impl(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, + const size_t _Needle_length) noexcept { + const size_t _Haystack_length_bytes = _Haystack_length * sizeof(_Ty); + constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT; constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; @@ -3378,22 +3796,40 @@ namespace { } } #endif // !_M_ARM64EC - const auto _Ptr_haystack = static_cast(_Haystack); - size_t _Pos = _Haystack_length; - const auto _Needle_end = static_cast(_Needle) + _Needle_length; - while (_Pos != 0) { - --_Pos; + template + size_t _Dispatch_pos(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, + const size_t _Needle_length) noexcept { +#ifndef _M_ARM64EC + if (_Use_sse42()) { + const _Strategy _Strat = _Pick_strategy_avx<_Ty>(_Haystack_length, _Needle_length); - for (auto _Ptr = static_cast(_Needle); _Ptr != _Needle_end; ++_Ptr) { - if (_Ptr_haystack[_Pos] == *_Ptr) { - return _Pos; + if (_Strat == _Strategy::_Vector_bitmap && _Use_avx2()) { + if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { + return _Bitmap::_Impl_last_avx<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); + } + } else if (_Strat != _Strategy::_No_bitmap) { + if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { + _Bitmap::_Scalar_table_t _Table = {}; + _Bitmap::_Build_scalar_table_no_check<_Ty>(_Table, _Needle, _Needle_length); + return _Bitmap::_Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); + } } + + return _Impl<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else +#endif // !_M_ARM64EC + { + _Bitmap::_Scalar_table_t _Table = {}; + if (_Bitmap::_Build_scalar_table<_Ty>(_Table, _Needle, _Needle_length)) { + return _Bitmap::_Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); + } + + return _Fallback<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); } } - return static_cast(-1); - } + } // namespace __std_find_meow_of::_Last template __declspec(noalias) size_t __stdcall __std_mismatch_impl( @@ -3704,32 +4140,52 @@ __declspec(noalias) size_t __stdcall __std_count_trivial_8( const void* __stdcall __std_find_first_of_trivial_1( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_pcmpestri(_First1, _Last1, _First2, _Last2); + return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_2( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_pcmpestri(_First1, _Last1, _First2, _Last2); + return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_4( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_4_8<__std_find_first_of::_Traits_4>(_First1, _Last1, _First2, _Last2); + return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_8( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_4_8<__std_find_first_of::_Traits_8>(_First1, _Last1, _First2, _Last2); + return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_last_of_pos_impl(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_meow_of::_Last::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_last_of_pos_impl(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_meow_of::_Last::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } const void* __stdcall __std_search_1( From 019bc07880d43a14d6dfad4975553aded21e681d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 14:04:08 +0300 Subject: [PATCH 02/47] big characters threshold --- stl/src/vector_algorithms.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 986287128c..a044921b44 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2948,17 +2948,24 @@ namespace { } else { return _Strategy::_Scalar_bitmap; } - } else { - if (_Count1 <= 8) { + } else if constexpr (sizeof(_Ty) == 4) { + if (_Count2 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 25)) { return _Strategy::_No_bitmap; - } - if (_Count1 > 400) { + } else if (_Count1 * 4ull > _Count2 * 5ull) { return _Strategy::_Vector_bitmap; + } else { + return _Strategy::_Scalar_bitmap; } - if (_Count2 * 2 > _Count1) { + } else if constexpr (sizeof(_Ty) == 8) { + if (_Count2 <= 3 || _Product_fits_threshold((_Count1 + 3) / 4, (_Count2 + 3) / 4, 25)) { + return _Strategy::_No_bitmap; + } else if (_Count1 > _Count2) { + return _Strategy::_Vector_bitmap; + } else { return _Strategy::_Scalar_bitmap; } - return _Strategy::_Vector_bitmap; + } else { + static_assert(false, "unexpected size"); } } } // namespace __std_find_meow_of From 7b8ebfd2ffa3e806483044e3d6486f5cb7187839 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 14:45:34 +0300 Subject: [PATCH 03/47] vectorize large needles too --- stl/inc/__msvc_string_view.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/inc/__msvc_string_view.hpp b/stl/inc/__msvc_string_view.hpp index 58f9d64e43..3280fa6161 100644 --- a/stl/inc/__msvc_string_view.hpp +++ b/stl/inc/__msvc_string_view.hpp @@ -793,7 +793,7 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t #if _USE_STD_VECTOR_ALGORITHMS if (!_STD _Is_constant_evaluated()) { const size_t _Remaining_size = _Hay_size - _Start_at; - if (_Remaining_size >= _Threshold_find_first_of) { + if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { size_t _Pos = _Find_first_of_pos_vectorized(_Hay_start, _Remaining_size, _Needle, _Needle_size); if (_Pos != static_cast(-1)) { _Pos += _Start_at; @@ -843,7 +843,7 @@ constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t< if constexpr (sizeof(_Elem) <= 2) { if (!_STD _Is_constant_evaluated()) { const size_t _Remaining_size = _Hay_start + 1; - if (_Remaining_size >= _Threshold_find_first_of) { + if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { return _Find_last_of_pos_vectorized(_Haystack, _Remaining_size, _Needle, _Needle_size); } } From 7f62323e167cb0f9e8b3b759f64e0f109aef03f1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 14:51:58 +0300 Subject: [PATCH 04/47] arm64ec --- stl/src/vector_algorithms.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index a044921b44..b2b038c201 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2916,6 +2916,7 @@ namespace { return _Result; } +#ifndef _M_ARM64EC namespace __std_find_meow_of { enum class _Strategy { _No_bitmap, _Scalar_bitmap, _Vector_bitmap }; @@ -2969,8 +2970,10 @@ namespace { } } } // namespace __std_find_meow_of +#endif // ! _M_ARM64EC namespace __std_find_meow_of::_Bitmap { +#ifndef _M_ARM64EC template bool _Can_fit_256_bits_sse(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { if constexpr (sizeof(_Ty) == 1) { @@ -3135,6 +3138,7 @@ namespace { return static_cast(-1); } +#endif // ! _M_ARM64EC using _Scalar_table_t = bool[256]; @@ -3158,6 +3162,7 @@ namespace { return true; } +#ifndef _M_ARM64EC template void _Build_scalar_table_no_check( bool* _Table, const void* const _Needle, const size_t _Needle_length) noexcept { @@ -3168,7 +3173,7 @@ namespace { _Table[*_Ptr] = true; } } - +#endif // ! _M_ARM64EC template size_t _Impl_first_scalar( From 6e08e9eb6332ef74520c387ee72f9e32dce78a08 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 14:52:17 +0300 Subject: [PATCH 05/47] missed noexcept --- stl/src/vector_algorithms.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index b2b038c201..d7aefd0508 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3569,8 +3569,8 @@ namespace { #endif // !_M_ARM64EC template - const void* _Dispatch_ptr( - const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) { + const void* _Dispatch_ptr(const void* const _First1, const void* const _Last1, const void* const _First2, + const void* const _Last2) noexcept { #ifndef _M_ARM64EC if constexpr (sizeof(_Ty) <= 2) { if (_Use_sse42()) { @@ -3589,7 +3589,8 @@ namespace { } template - const size_t _Pos_from_ptr(const void* const _Result, const void* const _First1, const void* const _Last1) { + const size_t _Pos_from_ptr( + const void* const _Result, const void* const _First1, const void* const _Last1) noexcept { if (_Result != _Last1) { return _Byte_length(_First1, _Result) / sizeof(_Ty); } else { From 102b85f3f0f45b96c27e95999695ad445685ca83 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 15:09:44 +0300 Subject: [PATCH 06/47] More interesting cases --- benchmarks/src/find_first_of.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index eeffb9e5df..1ef839977e 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -59,8 +59,9 @@ void bm(benchmark::State& state) { } void common_args(auto bm) { - bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2})->Args({75, 85})->Args({102, 4}); - bm->Args({325, 1})->Args({400, 50})->Args({1011, 11})->Args({1502, 23})->Args({3056, 7}); + bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2}); + bm->Args({75, 85})->Args({102, 4})->Args({200, 46})->Args({325, 1})->Args({400, 50}); + bm->Args({1011, 11})->Args({1280, 46})->Args({1502, 23})->Args({2203, 54})->Args({3056, 7}); } BENCHMARK(bm)->Apply(common_args); From 12f6112358aeb3f70fcc2afab6c00b22dcc7094a Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 17:48:32 +0300 Subject: [PATCH 07/47] Not that edge! --- stl/src/vector_algorithms.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index d7aefd0508..d5249ef441 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2934,7 +2934,7 @@ namespace { template _Strategy _Pick_strategy_avx(const size_t _Count1, const size_t _Count2) noexcept { if constexpr (sizeof(_Ty) == 1) { - if (_Count2 <= 15 || _Product_fits_threshold((_Count1 + 15) / 16, (_Count2 + 15) / 16, 60)) { + if (_Count1 <= 15 || _Product_fits_threshold((_Count1 + 15) / 16, (_Count2 + 15) / 16, 60)) { return _Strategy::_No_bitmap; } else if (_Count1 * 1ull > _Count2 * 5ull) { return _Strategy::_Vector_bitmap; @@ -2942,7 +2942,7 @@ namespace { return _Strategy::_Scalar_bitmap; } } else if constexpr (sizeof(_Ty) == 2) { - if (_Count2 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 60)) { + if (_Count1 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 60)) { return _Strategy::_No_bitmap; } else if (_Count1 * 2ull > _Count2 * 5ull) { return _Strategy::_Vector_bitmap; @@ -2950,7 +2950,7 @@ namespace { return _Strategy::_Scalar_bitmap; } } else if constexpr (sizeof(_Ty) == 4) { - if (_Count2 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 25)) { + if (_Count1 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 25)) { return _Strategy::_No_bitmap; } else if (_Count1 * 4ull > _Count2 * 5ull) { return _Strategy::_Vector_bitmap; @@ -2958,7 +2958,7 @@ namespace { return _Strategy::_Scalar_bitmap; } } else if constexpr (sizeof(_Ty) == 8) { - if (_Count2 <= 3 || _Product_fits_threshold((_Count1 + 3) / 4, (_Count2 + 3) / 4, 25)) { + if (_Count1 <= 3 || _Product_fits_threshold((_Count1 + 3) / 4, (_Count2 + 3) / 4, 25)) { return _Strategy::_No_bitmap; } else if (_Count1 > _Count2) { return _Strategy::_Vector_bitmap; From 6ee0450cc0bd0e67732e4905c6ca639347da4aa8 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 20 Oct 2024 18:00:13 +0300 Subject: [PATCH 08/47] +case --- benchmarks/src/find_first_of.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index 1ef839977e..889f154059 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -59,7 +59,7 @@ void bm(benchmark::State& state) { } void common_args(auto bm) { - bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2}); + bm->Args({2, 3})->Args({6, 81})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2}); bm->Args({75, 85})->Args({102, 4})->Args({200, 46})->Args({325, 1})->Args({400, 50}); bm->Args({1011, 11})->Args({1280, 46})->Args({1502, 23})->Args({2203, 54})->Args({3056, 7}); } From 60bbd798554550c8932336693e20c94e2f373657 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 27 Oct 2024 15:06:48 +0200 Subject: [PATCH 09/47] make bitmap small and large --- stl/src/vector_algorithms.cpp | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1c83e3745d..4c3896e4cc 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3068,7 +3068,7 @@ namespace { } template - __m256i _Make_bitmap(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + __m256i _Make_bitmap_small(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { __m256i _Bitmap = _mm256_setzero_si256(); const _Ty* const _Stop = _Needle_ptr + _Needle_length; @@ -3085,6 +3085,35 @@ namespace { return _Bitmap; } + template + __m256i _Make_bitmap_large(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + alignas(32) uint8_t _Table[256] = {}; + + const _Ty* const _Stop = _Needle_ptr + _Needle_length; + + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { + _Table[*_Needle_ptr] = 0xFF; + } + + auto _Table_as_avx = reinterpret_cast(_Table); + + return _mm256_setr_epi32( // + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 0)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 1)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 2)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 3)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 4)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 5)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 6)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 7))); + } + + template + __m256i _Make_bitmap(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + return _Needle_length <= 20 ? _Make_bitmap_small(_Needle_ptr, _Needle_length) + : _Make_bitmap_large(_Needle_ptr, _Needle_length); + } + template size_t _Impl_first_avx(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { From 8caa54708d5f463327c9361a6eaaca3e366508cc Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 27 Oct 2024 15:29:12 +0200 Subject: [PATCH 10/47] change strategy --- stl/src/vector_algorithms.cpp | 148 +++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 67 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 4c3896e4cc..29dd696a7d 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2930,9 +2930,64 @@ namespace { return _Result; } + namespace __std_find_meow_of::_Bitmap { #ifndef _M_ARM64EC - namespace __std_find_meow_of { - enum class _Strategy { _No_bitmap, _Scalar_bitmap, _Vector_bitmap }; + template + bool _Use_bitmap_avx(const size_t _Count1, const size_t _Count2) { + if constexpr (sizeof(_Ty) == 1) { + if (_Count2 <= 16) { + return _Count1 >= 1000; + } else if (_Count2 <= 48) { + return _Count1 >= 80; + } else if (_Count2 <= 240) { + return _Count1 >= 40; + } else if (_Count2 <= 1000) { + return _Count1 >= 32; + } else { + return _Count1 >= 16; + } + } else if constexpr (sizeof(_Ty) == 2) { + if (_Count2 <= 8) { + return _Count1 >= 128; + } else if (_Count2 <= 48) { + return _Count1 >= 128; + } else if (_Count2 <= 72) { + return _Count1 >= 23; + } else if (_Count2 <= 144) { + return _Count1 >= 15; + } else { + return _Count1 >= 7; + } + } else if constexpr (sizeof(_Ty) == 4) { + if (_Count2 <= 8) { + return _Count1 > 64; + } else if (_Count2 <= 24) { + return _Count1 > 40; + } else if (_Count2 <= 44) { + return _Count1 > 24; + } else if (_Count2 <= 112) { + return _Count1 > 16; + } else { + return _Count1 > 8; + } + } else if constexpr (sizeof(_Ty) == 8) { + if (_Count2 <= 8) { + return _Count1 > 40; + } else if (_Count2 <= 12) { + return _Count1 > 20; + } else if (_Count2 <= 48) { + return _Count1 > 16; + } else if (_Count2 <= 64) { + return _Count1 > 12; + } else if (_Count2 <= 192) { + return _Count1 > 8; + } else { + return _Count1 > 4; + } + } else { + static_assert(false, "unexpected size"); + } + } bool _Product_fits_threshold(const size_t _Px1, const size_t _Px2, const size_t _Tx) noexcept { #ifdef _WIN64 @@ -2946,48 +3001,16 @@ namespace { } template - _Strategy _Pick_strategy_avx(const size_t _Count1, const size_t _Count2) noexcept { + bool _Use_bitmap_sse(const size_t _Count1, const size_t _Count2) noexcept { if constexpr (sizeof(_Ty) == 1) { - if (_Count1 <= 15 || _Product_fits_threshold((_Count1 + 15) / 16, (_Count2 + 15) / 16, 60)) { - return _Strategy::_No_bitmap; - } else if (_Count1 * 1ull > _Count2 * 5ull) { - return _Strategy::_Vector_bitmap; - } else { - return _Strategy::_Scalar_bitmap; - } + return _Count1 >= 16 && !_Product_fits_threshold((_Count1 + 15) / 16, _Count2 / 16, 60); } else if constexpr (sizeof(_Ty) == 2) { - if (_Count1 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 60)) { - return _Strategy::_No_bitmap; - } else if (_Count1 * 2ull > _Count2 * 5ull) { - return _Strategy::_Vector_bitmap; - } else { - return _Strategy::_Scalar_bitmap; - } - } else if constexpr (sizeof(_Ty) == 4) { - if (_Count1 <= 7 || _Product_fits_threshold((_Count1 + 7) / 8, (_Count2 + 7) / 8, 25)) { - return _Strategy::_No_bitmap; - } else if (_Count1 * 4ull > _Count2 * 5ull) { - return _Strategy::_Vector_bitmap; - } else { - return _Strategy::_Scalar_bitmap; - } - } else if constexpr (sizeof(_Ty) == 8) { - if (_Count1 <= 3 || _Product_fits_threshold((_Count1 + 3) / 4, (_Count2 + 3) / 4, 25)) { - return _Strategy::_No_bitmap; - } else if (_Count1 > _Count2) { - return _Strategy::_Vector_bitmap; - } else { - return _Strategy::_Scalar_bitmap; - } + return _Count1 >= 8 && !_Product_fits_threshold((_Count1 + 7) / 8, _Count2 / 8, 60); } else { static_assert(false, "unexpected size"); } } - } // namespace __std_find_meow_of -#endif // ! _M_ARM64EC - namespace __std_find_meow_of::_Bitmap { -#ifndef _M_ARM64EC template bool _Can_fit_256_bits_sse(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { if constexpr (sizeof(_Ty) == 1) { @@ -3186,7 +3209,7 @@ namespace { using _Scalar_table_t = bool[256]; template - bool _Build_scalar_table(bool* _Table, const void* const _Needle, const size_t _Needle_length) noexcept { + bool _Build_scalar_table(const void* const _Needle, const size_t _Needle_length, bool* _Table) noexcept { auto _Ptr = static_cast(_Needle); const auto _End = _Ptr + _Needle_length; @@ -3208,7 +3231,7 @@ namespace { #ifndef _M_ARM64EC template void _Build_scalar_table_no_check( - bool* _Table, const void* const _Needle, const size_t _Needle_length) noexcept { + const void* const _Needle, const size_t _Needle_length, bool* _Table) noexcept { auto _Ptr = static_cast(_Needle); const auto _End = _Ptr + _Needle_length; @@ -3645,16 +3668,16 @@ namespace { template const size_t _Dispatch_pos_sse_1_2( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { - const _Strategy _Strat = _Pick_strategy_avx<_Ty>(_Count1, _Count2); - - if (_Strat == _Strategy::_Vector_bitmap && _Use_avx2()) { - if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + if (_Use_avx2()) { + if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) + && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); } - } else if (_Strat != _Strategy::_No_bitmap) { - if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + } else { + if (_Bitmap::_Use_bitmap_sse<_Ty>(_Count2, _Count1) + && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { _Bitmap::_Scalar_table_t _Table = {}; - _Bitmap::_Build_scalar_table_no_check<_Ty>(_Table, _First2, _Count2); + _Bitmap::_Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } } @@ -3670,18 +3693,9 @@ namespace { template const size_t _Dispatch_pos_avx_4_8( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { - const _Strategy _Strat = _Pick_strategy_avx<_Ty>(_Count1, _Count2); - - if (_Strat == _Strategy::_Vector_bitmap) { - if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); - } - } else if (_Strat != _Strategy::_No_bitmap) { - if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - _Bitmap::_Scalar_table_t _Table = {}; - _Bitmap::_Build_scalar_table_no_check<_Ty>(_Table, _First2, _Count2); - return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); - } + if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) + && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); } const void* _Last1 = static_cast(_First1) + _Count1; @@ -3698,7 +3712,7 @@ namespace { const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { _Bitmap::_Scalar_table_t _Table = {}; - if (_Bitmap::_Build_scalar_table<_Ty>(_Table, _First2, _Count2)) { + if (_Bitmap::_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } @@ -3858,16 +3872,16 @@ namespace { const size_t _Needle_length) noexcept { #ifndef _M_ARM64EC if (_Use_sse42()) { - const _Strategy _Strat = _Pick_strategy_avx<_Ty>(_Haystack_length, _Needle_length); - - if (_Strat == _Strategy::_Vector_bitmap && _Use_avx2()) { - if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { + if (_Use_avx2()) { + if (_Bitmap::_Use_bitmap_avx<_Ty>(_Haystack_length, _Needle_length) + && _Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { return _Bitmap::_Impl_last_avx<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); } - } else if (_Strat != _Strategy::_No_bitmap) { - if (_Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { + } else { + if (_Bitmap::_Use_bitmap_sse<_Ty>(_Haystack_length, _Needle_length) + && _Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { _Bitmap::_Scalar_table_t _Table = {}; - _Bitmap::_Build_scalar_table_no_check<_Ty>(_Table, _Needle, _Needle_length); + _Bitmap::_Build_scalar_table_no_check<_Ty>(_Needle, _Needle_length, _Table); return _Bitmap::_Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); } } @@ -3877,7 +3891,7 @@ namespace { #endif // !_M_ARM64EC { _Bitmap::_Scalar_table_t _Table = {}; - if (_Bitmap::_Build_scalar_table<_Ty>(_Table, _Needle, _Needle_length)) { + if (_Bitmap::_Build_scalar_table<_Ty>(_Needle, _Needle_length, _Table)) { return _Bitmap::_Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); } From 6f6f97b5da29f3556dc72b2c9c3c92303c94dcec Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 27 Oct 2024 15:51:45 +0200 Subject: [PATCH 11/47] ensure the same (mis)alignment --- benchmarks/src/find_first_of.cpp | 41 +++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index 889f154059..47f8933950 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -9,12 +9,50 @@ #include #include #include +#include #include #include #include using namespace std; +#if 1 // TRANSITION, GH-5043 + +template +struct skewed_allocator { + using value_type = T; + + T* allocate(size_t n) { + const auto p = static_cast(_aligned_malloc(n * sizeof(T) + Skew, Alignment)); + if (!p) { + throw std::bad_alloc{}; + } + return reinterpret_cast(p + Skew); + } + + void deallocate(T* p, size_t) { + if (p) { + _aligned_free(reinterpret_cast(p) - Skew); + } + } +}; + +// The purpose is to provide consistent behavior for benchmarks. +// 64 seems to be reasonable alignment for practical perf uses, +// as it is both cache line size and maximum vector instruction size (on x64). +// However to provide even more consistency, aligning to page, +// to make sure the same number of page boundaries is crossed each time. +constexpr size_t page_size = 4096; + +// A realistic skew relative to allocation granularity, when a variable is placed +// next to a pointer in a structure or on stack. Also corresponds to the default packing. +constexpr size_t skew = 8; + +template +struct unaligned_allocator : skewed_allocator {}; + +#endif // ^^^ TRANSITION, GH-5043 ^^^ + enum class AlgType { std_func, str_member_first, str_member_last }; template @@ -24,7 +62,8 @@ void bm(benchmark::State& state) { const size_t HSize = Pos * 2; const size_t Which = 0; - using container = conditional_t, basic_string>; + using container = conditional_t>, + basic_string, unaligned_allocator>>; constexpr T HaystackFiller{' '}; static_assert(HaystackFiller < Start, "The following iota() should not produce the haystack filler."); From 1c76e30759224f5065828a320a18efa9273fb082 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 27 Oct 2024 18:55:20 +0200 Subject: [PATCH 12/47] Brute force SSE 4.2 thresholds --- stl/src/vector_algorithms.cpp | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 29dd696a7d..9541110520 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2989,23 +2989,30 @@ namespace { } } - bool _Product_fits_threshold(const size_t _Px1, const size_t _Px2, const size_t _Tx) noexcept { -#ifdef _WIN64 - long long _Rx; - return _mul_overflow_i64(_Px1, _Px2, &_Rx) && static_cast(_Rx) < _Tx; -#else // ^^^ defined(_WIN64) / !defined(_WIN64) - unsigned int _Rx0; - unsigned int _Rx1; - return _mul_full_overflow_u32(_Px1, _Px2, &_Rx0, &_Rx1) && _Rx0 < _Tx; -#endif - } - template bool _Use_bitmap_sse(const size_t _Count1, const size_t _Count2) noexcept { if constexpr (sizeof(_Ty) == 1) { - return _Count1 >= 16 && !_Product_fits_threshold((_Count1 + 15) / 16, _Count2 / 16, 60); + if (_Count2 <= 32) { + return false; + } else if (_Count2 <= 48) { + return _Count1 >= 415; + } else if (_Count2 <= 64) { + return _Count1 >= 223; + } else if (_Count2 <= 80) { + return _Count1 >= 127; + } else if (_Count2 <= 540) { + return _Count1 >= 47; + } else { + return _Count1 >= 31; + } } else if constexpr (sizeof(_Ty) == 2) { - return _Count1 >= 8 && !_Product_fits_threshold((_Count1 + 7) / 8, _Count2 / 8, 60); + if (_Count2 <= 8) { + return false; + } else if (_Count2 <= 80) { + return _Count1 >= 15; + } else { + return _Count1 >= 7; + } } else { static_assert(false, "unexpected size"); } From 4560fdf13b7b8fad3f3e8115e35ed69be693d361 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 27 Oct 2024 19:12:59 +0200 Subject: [PATCH 13/47] update not aligned allocator --- benchmarks/src/find_first_of.cpp | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index 47f8933950..edcf9dba32 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -22,6 +22,23 @@ template struct skewed_allocator { using value_type = T; + static_assert( + Alignment % alignof(T) == 0 && Skew % alignof(T) == 0, "Chosen parameters will produce unaligned T objects"); + + template + struct rebind { + using type = skewed_allocator; + }; + + skewed_allocator() = default; + template + skewed_allocator(const skewed_allocator&) {} + + template + bool operator==(const skewed_allocator&) const { + return true; + } + T* allocate(size_t n) { const auto p = static_cast(_aligned_malloc(n * sizeof(T) + Skew, Alignment)); if (!p) { @@ -49,7 +66,7 @@ constexpr size_t page_size = 4096; constexpr size_t skew = 8; template -struct unaligned_allocator : skewed_allocator {}; +struct not_highly_aligned_allocator : skewed_allocator {}; #endif // ^^^ TRANSITION, GH-5043 ^^^ @@ -62,8 +79,8 @@ void bm(benchmark::State& state) { const size_t HSize = Pos * 2; const size_t Which = 0; - using container = conditional_t>, - basic_string, unaligned_allocator>>; + using container = conditional_t>, + basic_string, not_highly_aligned_allocator>>; constexpr T HaystackFiller{' '}; static_assert(HaystackFiller < Start, "The following iota() should not produce the haystack filler."); From 05deddbf299566d3bc8e3aa2f902d2128be88ab7 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 29 Oct 2024 20:46:18 +0200 Subject: [PATCH 14/47] not_highly_aligned_allocator update --- benchmarks/src/find_first_of.cpp | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index edcf9dba32..a66e5b79a0 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -8,8 +8,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -21,13 +21,12 @@ using namespace std; template struct skewed_allocator { using value_type = T; - - static_assert( - Alignment % alignof(T) == 0 && Skew % alignof(T) == 0, "Chosen parameters will produce unaligned T objects"); + static_assert(Alignment % alignof(T) == 0, "Chosen Alignment will produce unaligned T objects"); + static_assert(Skew % alignof(T) == 0, "Chosen Skew will produce unaligned T objects"); template struct rebind { - using type = skewed_allocator; + using other = skewed_allocator; }; skewed_allocator() = default; @@ -39,7 +38,7 @@ struct skewed_allocator { return true; } - T* allocate(size_t n) { + T* allocate(const size_t n) { const auto p = static_cast(_aligned_malloc(n * sizeof(T) + Skew, Alignment)); if (!p) { throw std::bad_alloc{}; @@ -47,7 +46,7 @@ struct skewed_allocator { return reinterpret_cast(p + Skew); } - void deallocate(T* p, size_t) { + void deallocate(T* const p, size_t) { if (p) { _aligned_free(reinterpret_cast(p) - Skew); } @@ -55,18 +54,18 @@ struct skewed_allocator { }; // The purpose is to provide consistent behavior for benchmarks. -// 64 seems to be reasonable alignment for practical perf uses, -// as it is both cache line size and maximum vector instruction size (on x64). -// However to provide even more consistency, aligning to page, -// to make sure the same number of page boundaries is crossed each time. -constexpr size_t page_size = 4096; +// 64 would be a reasonable alignment for practical perf uses, +// as it is both the cache line size and the maximum vector instruction size (on x64). +// However, aligning to the page size will provide even more consistency +// by ensuring that the same number of page boundaries is crossed each time. +inline constexpr size_t page_size = 4096; // A realistic skew relative to allocation granularity, when a variable is placed -// next to a pointer in a structure or on stack. Also corresponds to the default packing. -constexpr size_t skew = 8; +// next to a pointer in a structure or on the stack. Also corresponds to the default packing. +inline constexpr size_t realistic_skew = 8; template -struct not_highly_aligned_allocator : skewed_allocator {}; +using not_highly_aligned_allocator = skewed_allocator; #endif // ^^^ TRANSITION, GH-5043 ^^^ From 64e9df08ca756aaab86231c08c494b31bb092c77 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 29 Oct 2024 21:49:19 +0200 Subject: [PATCH 15/47] ASan fixes --- stl/src/vector_algorithms.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 9541110520..acff3e6785 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3809,7 +3809,15 @@ namespace { } const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + __m128i _Data1; + + if (_Haystack_length_bytes >= 16) { + _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + } else { + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); + _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + } if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { return _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); @@ -3866,7 +3874,16 @@ namespace { } const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + __m128i _Data1; + + if (_Haystack_length_bytes >= 16) { + _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + } else { + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); + _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + } + _Test_whole_needle(_Data1, _Last_part_size_el); return static_cast(_Found_pos); From ac00cf68245bba5bfaf740b8205d2024b780773e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 29 Oct 2024 21:59:57 +0200 Subject: [PATCH 16/47] Check last part size for zero Fewer iterations for exact multiplies of 16 haystacks --- stl/src/vector_algorithms.cpp | 120 ++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 56 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index acff3e6785..558997dd70 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2936,27 +2936,27 @@ namespace { bool _Use_bitmap_avx(const size_t _Count1, const size_t _Count2) { if constexpr (sizeof(_Ty) == 1) { if (_Count2 <= 16) { - return _Count1 >= 1000; + return _Count1 > 1000; } else if (_Count2 <= 48) { - return _Count1 >= 80; + return _Count1 > 80; } else if (_Count2 <= 240) { - return _Count1 >= 40; + return _Count1 > 40; } else if (_Count2 <= 1000) { - return _Count1 >= 32; + return _Count1 > 32; } else { - return _Count1 >= 16; + return _Count1 > 16; } } else if constexpr (sizeof(_Ty) == 2) { if (_Count2 <= 8) { - return _Count1 >= 128; + return _Count1 > 128; } else if (_Count2 <= 48) { - return _Count1 >= 128; + return _Count1 > 128; } else if (_Count2 <= 72) { - return _Count1 >= 23; + return _Count1 > 24; } else if (_Count2 <= 144) { - return _Count1 >= 15; + return _Count1 > 16; } else { - return _Count1 >= 7; + return _Count1 > 8; } } else if constexpr (sizeof(_Ty) == 4) { if (_Count2 <= 8) { @@ -2995,23 +2995,23 @@ namespace { if (_Count2 <= 32) { return false; } else if (_Count2 <= 48) { - return _Count1 >= 415; + return _Count1 > 416; } else if (_Count2 <= 64) { - return _Count1 >= 223; + return _Count1 > 224; } else if (_Count2 <= 80) { - return _Count1 >= 127; + return _Count1 > 128; } else if (_Count2 <= 540) { - return _Count1 >= 47; + return _Count1 > 48; } else { - return _Count1 >= 31; + return _Count1 > 32; } } else if constexpr (sizeof(_Ty) == 2) { if (_Count2 <= 8) { return false; } else if (_Count2 <= 80) { - return _Count1 >= 15; + return _Count1 > 16; } else { - return _Count1 >= 7; + return _Count1 > 8; } } else { static_assert(false, "unexpected size"); @@ -3348,20 +3348,22 @@ namespace { _Advance_bytes(_First1, 16); } - const size_t _Last_part_size = _Haystack_length & 0xF; - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + if (const size_t _Last_part_size = _Haystack_length & 0xF; _Last_part_size != 0) { + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - alignas(16) uint8_t _Tmp1[16]; - memcpy(_Tmp1, _First1, _Last_part_size); - const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _First1, _Last_part_size); + const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); - if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); - _Advance_bytes(_First1, _Pos * sizeof(_Ty)); - return _First1; + if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); + _Advance_bytes(_First1, _Pos * sizeof(_Ty)); + return _First1; + } + + _Advance_bytes(_First1, _Last_part_size); } - _Advance_bytes(_First1, _Last_part_size); return _First1; } else { const void* _Last_needle = _First2; @@ -3415,18 +3417,20 @@ namespace { _Advance_bytes(_First1, 16); } - const size_t _Last_part_size = _Haystack_length & 0xF; - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + if (const size_t _Last_part_size = _Haystack_length & 0xF; _Last_part_size != 0) { + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _First1, _Last_part_size); + const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); - alignas(16) uint8_t _Tmp1[16]; - memcpy(_Tmp1, _First1, _Last_part_size); - const __m128i _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + _Found_pos = _Last_part_size_el; - _Found_pos = _Last_part_size_el; + _Test_whole_needle(_Data1, _Last_part_size_el); - _Test_whole_needle(_Data1, _Last_part_size_el); + _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); + } - _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); return _First1; } } @@ -3808,19 +3812,21 @@ namespace { } } - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - __m128i _Data1; + if (_Last_part_size != 0) { + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + __m128i _Data1; - if (_Haystack_length_bytes >= 16) { - _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); - } else { - alignas(16) uint8_t _Tmp1[16]; - memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); - _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); - } + if (_Haystack_length_bytes >= 16) { + _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + } else { + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); + _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + } - if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { - return _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); + if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { + return _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); + } } return static_cast(-1); @@ -3873,18 +3879,20 @@ namespace { } } - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - __m128i _Data1; + if (_Last_part_size != 0) { + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + __m128i _Data1; - if (_Haystack_length_bytes >= 16) { - _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); - } else { - alignas(16) uint8_t _Tmp1[16]; - memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); - _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); - } + if (_Haystack_length_bytes >= 16) { + _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + } else { + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); + _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + } - _Test_whole_needle(_Data1, _Last_part_size_el); + _Test_whole_needle(_Data1, _Last_part_size_el); + } return static_cast(_Found_pos); } From 3132911f4621b1cabf62fdd09b096c4ba0f933c1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 29 Oct 2024 23:07:01 +0200 Subject: [PATCH 17/47] typo --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 558997dd70..7f1ddec53c 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2950,7 +2950,7 @@ namespace { if (_Count2 <= 8) { return _Count1 > 128; } else if (_Count2 <= 48) { - return _Count1 > 128; + return _Count1 > 32; } else if (_Count2 <= 72) { return _Count1 > 24; } else if (_Count2 <= 144) { From b34af28da353abd9bb19f51555265ccc26fff2f3 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 30 Oct 2024 20:31:08 +0200 Subject: [PATCH 18/47] GH-5043 usage --- benchmarks/src/find_first_of.cpp | 56 ++------------------------------ 1 file changed, 2 insertions(+), 54 deletions(-) diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index a66e5b79a0..41b2089e4c 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -8,66 +8,14 @@ #include #include #include -#include #include #include #include #include -using namespace std; - -#if 1 // TRANSITION, GH-5043 - -template -struct skewed_allocator { - using value_type = T; - static_assert(Alignment % alignof(T) == 0, "Chosen Alignment will produce unaligned T objects"); - static_assert(Skew % alignof(T) == 0, "Chosen Skew will produce unaligned T objects"); - - template - struct rebind { - using other = skewed_allocator; - }; - - skewed_allocator() = default; - template - skewed_allocator(const skewed_allocator&) {} +#include "skewed_allocator.hpp" - template - bool operator==(const skewed_allocator&) const { - return true; - } - - T* allocate(const size_t n) { - const auto p = static_cast(_aligned_malloc(n * sizeof(T) + Skew, Alignment)); - if (!p) { - throw std::bad_alloc{}; - } - return reinterpret_cast(p + Skew); - } - - void deallocate(T* const p, size_t) { - if (p) { - _aligned_free(reinterpret_cast(p) - Skew); - } - } -}; - -// The purpose is to provide consistent behavior for benchmarks. -// 64 would be a reasonable alignment for practical perf uses, -// as it is both the cache line size and the maximum vector instruction size (on x64). -// However, aligning to the page size will provide even more consistency -// by ensuring that the same number of page boundaries is crossed each time. -inline constexpr size_t page_size = 4096; - -// A realistic skew relative to allocation granularity, when a variable is placed -// next to a pointer in a structure or on the stack. Also corresponds to the default packing. -inline constexpr size_t realistic_skew = 8; - -template -using not_highly_aligned_allocator = skewed_allocator; - -#endif // ^^^ TRANSITION, GH-5043 ^^^ +using namespace std; enum class AlgType { std_func, str_member_first, str_member_last }; From ee48d32aa8805582c3e1be94e0e70e2967bab7e9 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 8 Nov 2024 10:23:30 -0800 Subject: [PATCH 19/47] Use `static_cast` instead of a functional-style cast. --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index bb2a80dd34..b1ca612b74 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3106,7 +3106,7 @@ namespace { for (; _Needle_ptr != _Stop; ++_Needle_ptr) { const _Ty _Val = *_Needle_ptr; const __m256i _Count_low = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(_Val & 0x3F)); - const uint32_t _One_1_high = 1u << uint32_t((_Val >> 3) & 0x18); + const uint32_t _One_1_high = 1u << static_cast((_Val >> 3) & 0x18); const __m256i _One_1_high_unp = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(_One_1_high)); const __m256i _One_1 = _mm256_sllv_epi64(_One_1_high_unp, _Count_low); _Bitmap = _mm256_or_si256(_Bitmap, _One_1); From 5323a0bd150c2c07d899c4e790d964c586237eb3 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 8 Nov 2024 10:26:57 -0800 Subject: [PATCH 20/47] `unsigned` => `unsigned int` --- stl/src/vector_algorithms.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index b1ca612b74..0d274e1952 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3154,9 +3154,9 @@ namespace { const size_t _Haystack_length_vec = _Haystack_length & ~size_t{7}; for (size_t _Ix = 0; _Ix != _Haystack_length_vec; _Ix += 8) { - const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); - const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); if (_Bingo != 0) { return _Ix + _tzcnt_u32(_Bingo); } @@ -3164,12 +3164,12 @@ namespace { const size_t _Haystack_length_tail = _Haystack_length & 7; if (_Haystack_length_tail != 0) { - const unsigned _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); + const unsigned int _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail * sizeof(_Ty)); - const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); - const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + const __m256i _Data = _Load_avx_256_8(_Buf); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return _Haystack_length_vec + _tzcnt_u32(_Bingo); } @@ -3188,9 +3188,9 @@ namespace { while (_Haystack_length >= 8) { _Haystack_length -= 8; - const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); - const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); if (_Bingo != 0) { return _Haystack_length + 31 - _lzcnt_u32(_Bingo); } @@ -3198,12 +3198,12 @@ namespace { const size_t _Haystack_length_tail = _Haystack_length & 7; if (_Haystack_length_tail != 0) { - const unsigned _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); + const unsigned int _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr, _Haystack_length_tail * sizeof(_Ty)); - const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); - const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + const __m256i _Data = _Load_avx_256_8(_Buf); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return 31 - _lzcnt_u32(_Bingo); } From 35379a9a39a71a0cdd43ac0507adc539f7a353f8 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 8 Nov 2024 10:49:30 -0800 Subject: [PATCH 21/47] Drop unnecessary parens. --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 0d274e1952..3f0199d196 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3164,7 +3164,7 @@ namespace { const size_t _Haystack_length_tail = _Haystack_length & 7; if (_Haystack_length_tail != 0) { - const unsigned int _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); + const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail * sizeof(_Ty)); const __m256i _Data = _Load_avx_256_8(_Buf); @@ -3198,7 +3198,7 @@ namespace { const size_t _Haystack_length_tail = _Haystack_length & 7; if (_Haystack_length_tail != 0) { - const unsigned int _Tail_bingo_mask = ((1 << _Haystack_length_tail) - 1); + const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr, _Haystack_length_tail * sizeof(_Ty)); const __m256i _Data = _Load_avx_256_8(_Buf); From 6dcf3ba6f53a587d7487394192f76488c30a598f Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 8 Nov 2024 10:50:29 -0800 Subject: [PATCH 22/47] Conditional operator => if-else --- stl/src/vector_algorithms.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 3f0199d196..c2a54325d7 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3140,8 +3140,11 @@ namespace { template __m256i _Make_bitmap(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { - return _Needle_length <= 20 ? _Make_bitmap_small(_Needle_ptr, _Needle_length) - : _Make_bitmap_large(_Needle_ptr, _Needle_length); + if (_Needle_length <= 20) { + return _Make_bitmap_small(_Needle_ptr, _Needle_length); + } else { + return _Make_bitmap_large(_Needle_ptr, _Needle_length); + } } template From f6f95dd362286235459b26e623cfa82b211df62b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 8 Nov 2024 10:51:18 -0800 Subject: [PATCH 23/47] Add const. --- stl/src/vector_algorithms.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index c2a54325d7..ccb4670849 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3061,10 +3061,10 @@ namespace { } __m256i __vectorcall _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { - __m256i _Data_high = _mm256_srli_epi32(_Data, 5); - __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); - __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); - __m256i _Mask = _mm256_sllv_epi32(_Bitmap_parts, _Data_low_inv); + const __m256i _Data_high = _mm256_srli_epi32(_Data, 5); + const __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); + const __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); + const __m256i _Mask = _mm256_sllv_epi32(_Bitmap_parts, _Data_low_inv); return _Mask; } @@ -3125,7 +3125,7 @@ namespace { _Table[*_Needle_ptr] = 0xFF; } - auto _Table_as_avx = reinterpret_cast(_Table); + const auto _Table_as_avx = reinterpret_cast(_Table); return _mm256_setr_epi32( // _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 0)), @@ -3139,7 +3139,7 @@ namespace { } template - __m256i _Make_bitmap(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + __m256i _Make_bitmap(const _Ty* const _Needle_ptr, const size_t _Needle_length) noexcept { if (_Needle_length <= 20) { return _Make_bitmap_small(_Needle_ptr, _Needle_length); } else { @@ -3696,7 +3696,7 @@ namespace { } } - const void* _Last1 = static_cast(_First1) + _Count1; + const void* const _Last1 = static_cast(_First1) + _Count1; const size_t _Size_bytes_1 = _Count1 * sizeof(_Ty); const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); @@ -3712,7 +3712,7 @@ namespace { return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); } - const void* _Last1 = static_cast(_First1) + _Count1; + const void* const _Last1 = static_cast(_First1) + _Count1; const size_t _Size_bytes_1 = _Count1 * sizeof(_Ty); const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); @@ -3730,8 +3730,8 @@ namespace { return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } - const void* _Last1 = static_cast(_First1) + _Count1; - const void* _Last2 = static_cast(_First2) + _Count2; + const void* const _Last1 = static_cast(_First1) + _Count1; + const void* const _Last2 = static_cast(_First2) + _Count2; return _Pos_from_ptr<_Ty>(_Fallback<_Ty>(_First1, _Last1, _First2, _Last2), _First1, _Last1); } From f54b728c11c818b487cc167ff1c198ec7ee80d74 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 8 Nov 2024 10:53:49 -0800 Subject: [PATCH 24/47] Don't return `const size_t`. --- stl/src/vector_algorithms.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ccb4670849..1a66be6a53 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3669,8 +3669,7 @@ namespace { } template - const size_t _Pos_from_ptr( - const void* const _Result, const void* const _First1, const void* const _Last1) noexcept { + size_t _Pos_from_ptr(const void* const _Result, const void* const _First1, const void* const _Last1) noexcept { if (_Result != _Last1) { return _Byte_length(_First1, _Result) / sizeof(_Ty); } else { @@ -3680,7 +3679,7 @@ namespace { #ifndef _M_ARM64EC template - const size_t _Dispatch_pos_sse_1_2( + size_t _Dispatch_pos_sse_1_2( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { if (_Use_avx2()) { if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) @@ -3705,7 +3704,7 @@ namespace { } template - const size_t _Dispatch_pos_avx_4_8( + size_t _Dispatch_pos_avx_4_8( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { @@ -3722,7 +3721,7 @@ namespace { #endif // !_M_ARM64EC template - const size_t _Dispatch_pos_fallback( + size_t _Dispatch_pos_fallback( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { _Bitmap::_Scalar_table_t _Table = {}; @@ -3737,7 +3736,7 @@ namespace { } template - const size_t _Dispatch_pos( + size_t _Dispatch_pos( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { #ifndef _M_ARM64EC if constexpr (sizeof(_Ty) <= 2) { From 267a6799b7171bfecf1dcfab4bb69952b0068bdd Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 8 Nov 2024 10:58:37 -0800 Subject: [PATCH 25/47] Add `noexcept`. --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1a66be6a53..e1d0738e68 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2933,7 +2933,7 @@ namespace { namespace __std_find_meow_of::_Bitmap { #ifndef _M_ARM64EC template - bool _Use_bitmap_avx(const size_t _Count1, const size_t _Count2) { + bool _Use_bitmap_avx(const size_t _Count1, const size_t _Count2) noexcept { if constexpr (sizeof(_Ty) == 1) { if (_Count2 <= 16) { return _Count1 > 1000; From 3768748bb66062d330da43952ed155f1ae58f311 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 9 Nov 2024 08:46:50 +0200 Subject: [PATCH 26/47] separate steps for small bitmap differently --- stl/src/vector_algorithms.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e1d0738e68..d2c284c14b 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3104,12 +3104,12 @@ namespace { const _Ty* const _Stop = _Needle_ptr + _Needle_length; for (; _Needle_ptr != _Stop; ++_Needle_ptr) { - const _Ty _Val = *_Needle_ptr; - const __m256i _Count_low = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(_Val & 0x3F)); - const uint32_t _One_1_high = 1u << static_cast((_Val >> 3) & 0x18); - const __m256i _One_1_high_unp = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(_One_1_high)); - const __m256i _One_1 = _mm256_sllv_epi64(_One_1_high_unp, _Count_low); - _Bitmap = _mm256_or_si256(_Bitmap, _One_1); + const _Ty _Val = *_Needle_ptr; + const __m128i _Count_low = _mm_cvtsi32_si128(_Val & 0x3F); + const auto _Count_high = static_cast((_Val >> 3) & 0x18); + const __m256i _One_1_high = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(1u << _Count_high)); + const __m256i _One_1 = _mm256_sllv_epi64(_One_1_high, _mm256_broadcastq_epi64(_Count_low)); + _Bitmap = _mm256_or_si256(_Bitmap, _One_1); } return _Bitmap; From 9964a161ce6ab1a3aef9fc2c49baac9848cb9f9c Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 9 Nov 2024 08:55:51 +0200 Subject: [PATCH 27/47] yet better name --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index d2c284c14b..c1dc21b8e8 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3106,8 +3106,8 @@ namespace { for (; _Needle_ptr != _Stop; ++_Needle_ptr) { const _Ty _Val = *_Needle_ptr; const __m128i _Count_low = _mm_cvtsi32_si128(_Val & 0x3F); - const auto _Count_high = static_cast((_Val >> 3) & 0x18); - const __m256i _One_1_high = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(1u << _Count_high)); + const auto _Count_high_x8 = static_cast((_Val >> 3) & 0x18); + const __m256i _One_1_high = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(1u << _Count_high_x8)); const __m256i _One_1 = _mm256_sllv_epi64(_One_1_high, _mm256_broadcastq_epi64(_Count_low)); _Bitmap = _mm256_or_si256(_Bitmap, _One_1); } From 7a08a13795b4b91180c33bd7a9b5dc694718e649 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 9 Nov 2024 09:20:50 +0200 Subject: [PATCH 28/47] We should avoid broadcast Even though sll is potentially more expensive than sllv, the broadcast is still more expensive --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index c1dc21b8e8..362ebd236f 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3108,7 +3108,7 @@ namespace { const __m128i _Count_low = _mm_cvtsi32_si128(_Val & 0x3F); const auto _Count_high_x8 = static_cast((_Val >> 3) & 0x18); const __m256i _One_1_high = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(1u << _Count_high_x8)); - const __m256i _One_1 = _mm256_sllv_epi64(_One_1_high, _mm256_broadcastq_epi64(_Count_low)); + const __m256i _One_1 = _mm256_sll_epi64(_One_1_high, _Count_low); _Bitmap = _mm256_or_si256(_Bitmap, _One_1); } From abd5ee95cbfb09ad07e8b6534602262ba021b381 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 9 Nov 2024 09:21:24 +0200 Subject: [PATCH 29/47] Should inline actually --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 362ebd236f..f0c763173f 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3060,7 +3060,7 @@ namespace { } } - __m256i __vectorcall _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { + __m256i _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { const __m256i _Data_high = _mm256_srli_epi32(_Data, 5); const __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); const __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); From bc0decf1b2384e0908a89a3c8a85c1db399b038e Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 05:35:51 -0800 Subject: [PATCH 30/47] Use reference-to-array parameters. --- stl/src/vector_algorithms.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index f0c763173f..6fcde007d0 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3219,7 +3219,8 @@ namespace { using _Scalar_table_t = bool[256]; template - bool _Build_scalar_table(const void* const _Needle, const size_t _Needle_length, bool* _Table) noexcept { + bool _Build_scalar_table( + const void* const _Needle, const size_t _Needle_length, _Scalar_table_t& _Table) noexcept { auto _Ptr = static_cast(_Needle); const auto _End = _Ptr + _Needle_length; @@ -3241,7 +3242,7 @@ namespace { #ifndef _M_ARM64EC template void _Build_scalar_table_no_check( - const void* const _Needle, const size_t _Needle_length, bool* _Table) noexcept { + const void* const _Needle, const size_t _Needle_length, _Scalar_table_t& _Table) noexcept { auto _Ptr = static_cast(_Needle); const auto _End = _Ptr + _Needle_length; @@ -3253,7 +3254,7 @@ namespace { template size_t _Impl_first_scalar( - const void* const _Haystack, const size_t _Haystack_length, const bool* const _Table) noexcept { + const void* const _Haystack, const size_t _Haystack_length, const _Scalar_table_t& _Table) noexcept { const auto _Haystack_ptr = static_cast(_Haystack); for (size_t _Ix = 0; _Ix != _Haystack_length; ++_Ix) { @@ -3275,7 +3276,7 @@ namespace { template size_t _Impl_last_scalar( - const void* const _Haystack, size_t _Haystack_length, const bool* const _Table) noexcept { + const void* const _Haystack, size_t _Haystack_length, const _Scalar_table_t& _Table) noexcept { const auto _Haystack_ptr = static_cast(_Haystack); while (_Haystack_length != 0) { From 58fa2b4fe31cce24bed83a5bc9c5144c2caf7cd6 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 05:40:44 -0800 Subject: [PATCH 31/47] Mark `_Build_scalar_table` as `[[nodiscard]]`. --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 6fcde007d0..7457280d15 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3219,7 +3219,7 @@ namespace { using _Scalar_table_t = bool[256]; template - bool _Build_scalar_table( + [[nodiscard]] bool _Build_scalar_table( const void* const _Needle, const size_t _Needle_length, _Scalar_table_t& _Table) noexcept { auto _Ptr = static_cast(_Needle); const auto _End = _Ptr + _Needle_length; From 73e704d7c79440a782f4daafae6b7a25b53d817b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 06:07:56 -0800 Subject: [PATCH 32/47] Avoid `_First` and `_Last` sub-namespaces. --- stl/src/vector_algorithms.cpp | 37 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 7457280d15..b9ef3ad1ff 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3299,8 +3299,7 @@ namespace { } } // namespace __std_find_meow_of::_Bitmap - namespace __std_find_meow_of::_First { - + namespace __std_find_first_of { template const void* _Fallback(const void* _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { @@ -3682,6 +3681,8 @@ namespace { template size_t _Dispatch_pos_sse_1_2( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + namespace _Bitmap = __std_find_meow_of::_Bitmap; + if (_Use_avx2()) { if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { @@ -3707,6 +3708,8 @@ namespace { template size_t _Dispatch_pos_avx_4_8( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + namespace _Bitmap = __std_find_meow_of::_Bitmap; + if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); @@ -3724,6 +3727,7 @@ namespace { template size_t _Dispatch_pos_fallback( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + namespace _Bitmap = __std_find_meow_of::_Bitmap; _Bitmap::_Scalar_table_t _Table = {}; if (_Bitmap::_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { @@ -3752,10 +3756,9 @@ namespace { #endif // !_M_ARM64EC return _Dispatch_pos_fallback<_Ty>(_First1, _Count1, _First2, _Count2); } - } // namespace __std_find_meow_of::_First - - namespace __std_find_meow_of::_Last { + } // namespace __std_find_first_of + namespace __std_find_last_of { template size_t __stdcall _Fallback(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { @@ -3905,6 +3908,8 @@ namespace { template size_t _Dispatch_pos(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { + namespace _Bitmap = __std_find_meow_of::_Bitmap; + #ifndef _M_ARM64EC if (_Use_sse42()) { if (_Use_avx2()) { @@ -3934,7 +3939,7 @@ namespace { } } - } // namespace __std_find_meow_of::_Last + } // namespace __std_find_last_of template __declspec(noalias) size_t __stdcall __std_mismatch_impl( @@ -4492,52 +4497,52 @@ __declspec(noalias) size_t __stdcall __std_count_trivial_8( const void* __stdcall __std_find_first_of_trivial_1( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_2( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_4( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_8( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_meow_of::_First::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { - return __std_find_meow_of::_First::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_meow_of::_Last::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_last_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_meow_of::_Last::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_last_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } const void* __stdcall __std_search_1( From 67b93194fa6677fed45bb71fe17230f96264d90a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 06:14:34 -0800 Subject: [PATCH 33/47] Pure code movement: Move bitmap details before "public" machinery. --- stl/src/vector_algorithms.cpp | 174 +++++++++++++++++----------------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index b9ef3ad1ff..c7d2d3c1f4 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2932,6 +2932,93 @@ namespace { namespace __std_find_meow_of::_Bitmap { #ifndef _M_ARM64EC + __m256i _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { + const __m256i _Data_high = _mm256_srli_epi32(_Data, 5); + const __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); + const __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); + const __m256i _Mask = _mm256_sllv_epi32(_Bitmap_parts, _Data_low_inv); + return _Mask; + } + + template + __m256i _Load_avx_256_8(const _Ty* const _Src) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return _mm256_cvtepu8_epi32(_mm_loadu_si64(_Src)); + } else if constexpr (sizeof(_Ty) == 2) { + return _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast(_Src))); + } else if constexpr (sizeof(_Ty) == 4) { + return _mm256_loadu_si256(reinterpret_cast(_Src)); + } else if constexpr (sizeof(_Ty) == 8) { + const __m256i _Low = _mm256_loadu_si256(reinterpret_cast(_Src)); + const __m256i _High = _mm256_loadu_si256(reinterpret_cast(_Src) + 1); + const __m256i _Pack = _mm256_packs_epi32(_Low, _High); + return _mm256_permutex_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); + } else { + static_assert(false, "Unexpected size"); + } + } + + template + __m256i _Mask_out_oveflow(const __m256i _Mask, const __m256i _Data) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return _Mask; + } else { + const __m256i _Data_high = _mm256_and_si256(_Data, _mm256_set1_epi32(static_cast(0xFFFF'FF00))); + const __m256i _Fit_mask = _mm256_cmpeq_epi32(_Data_high, _mm256_setzero_si256()); + return _mm256_and_si256(_Mask, _Fit_mask); + } + } + + template + __m256i _Make_bitmap_small(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + __m256i _Bitmap = _mm256_setzero_si256(); + + const _Ty* const _Stop = _Needle_ptr + _Needle_length; + + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { + const _Ty _Val = *_Needle_ptr; + const __m128i _Count_low = _mm_cvtsi32_si128(_Val & 0x3F); + const auto _Count_high_x8 = static_cast((_Val >> 3) & 0x18); + const __m256i _One_1_high = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(1u << _Count_high_x8)); + const __m256i _One_1 = _mm256_sll_epi64(_One_1_high, _Count_low); + _Bitmap = _mm256_or_si256(_Bitmap, _One_1); + } + + return _Bitmap; + } + + template + __m256i _Make_bitmap_large(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + alignas(32) uint8_t _Table[256] = {}; + + const _Ty* const _Stop = _Needle_ptr + _Needle_length; + + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { + _Table[*_Needle_ptr] = 0xFF; + } + + const auto _Table_as_avx = reinterpret_cast(_Table); + + return _mm256_setr_epi32( // + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 0)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 1)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 2)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 3)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 4)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 5)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 6)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 7))); + } + + template + __m256i _Make_bitmap(const _Ty* const _Needle_ptr, const size_t _Needle_length) noexcept { + if (_Needle_length <= 20) { + return _Make_bitmap_small(_Needle_ptr, _Needle_length); + } else { + return _Make_bitmap_large(_Needle_ptr, _Needle_length); + } + } + template bool _Use_bitmap_avx(const size_t _Count1, const size_t _Count2) noexcept { if constexpr (sizeof(_Ty) == 1) { @@ -3060,93 +3147,6 @@ namespace { } } - __m256i _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { - const __m256i _Data_high = _mm256_srli_epi32(_Data, 5); - const __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); - const __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); - const __m256i _Mask = _mm256_sllv_epi32(_Bitmap_parts, _Data_low_inv); - return _Mask; - } - - template - __m256i _Load_avx_256_8(const _Ty* const _Src) noexcept { - if constexpr (sizeof(_Ty) == 1) { - return _mm256_cvtepu8_epi32(_mm_loadu_si64(_Src)); - } else if constexpr (sizeof(_Ty) == 2) { - return _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast(_Src))); - } else if constexpr (sizeof(_Ty) == 4) { - return _mm256_loadu_si256(reinterpret_cast(_Src)); - } else if constexpr (sizeof(_Ty) == 8) { - const __m256i _Low = _mm256_loadu_si256(reinterpret_cast(_Src)); - const __m256i _High = _mm256_loadu_si256(reinterpret_cast(_Src) + 1); - const __m256i _Pack = _mm256_packs_epi32(_Low, _High); - return _mm256_permutex_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); - } else { - static_assert(false, "Unexpected size"); - } - } - - template - __m256i _Mask_out_oveflow(const __m256i _Mask, const __m256i _Data) noexcept { - if constexpr (sizeof(_Ty) == 1) { - return _Mask; - } else { - const __m256i _Data_high = _mm256_and_si256(_Data, _mm256_set1_epi32(static_cast(0xFFFF'FF00))); - const __m256i _Fit_mask = _mm256_cmpeq_epi32(_Data_high, _mm256_setzero_si256()); - return _mm256_and_si256(_Mask, _Fit_mask); - } - } - - template - __m256i _Make_bitmap_small(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { - __m256i _Bitmap = _mm256_setzero_si256(); - - const _Ty* const _Stop = _Needle_ptr + _Needle_length; - - for (; _Needle_ptr != _Stop; ++_Needle_ptr) { - const _Ty _Val = *_Needle_ptr; - const __m128i _Count_low = _mm_cvtsi32_si128(_Val & 0x3F); - const auto _Count_high_x8 = static_cast((_Val >> 3) & 0x18); - const __m256i _One_1_high = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(1u << _Count_high_x8)); - const __m256i _One_1 = _mm256_sll_epi64(_One_1_high, _Count_low); - _Bitmap = _mm256_or_si256(_Bitmap, _One_1); - } - - return _Bitmap; - } - - template - __m256i _Make_bitmap_large(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { - alignas(32) uint8_t _Table[256] = {}; - - const _Ty* const _Stop = _Needle_ptr + _Needle_length; - - for (; _Needle_ptr != _Stop; ++_Needle_ptr) { - _Table[*_Needle_ptr] = 0xFF; - } - - const auto _Table_as_avx = reinterpret_cast(_Table); - - return _mm256_setr_epi32( // - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 0)), - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 1)), - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 2)), - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 3)), - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 4)), - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 5)), - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 6)), - _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 7))); - } - - template - __m256i _Make_bitmap(const _Ty* const _Needle_ptr, const size_t _Needle_length) noexcept { - if (_Needle_length <= 20) { - return _Make_bitmap_small(_Needle_ptr, _Needle_length); - } else { - return _Make_bitmap_large(_Needle_ptr, _Needle_length); - } - } - template size_t _Impl_first_avx(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { From 016ee5cd7d5f58d901b83a3ca26d75cd4525fbe4 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 06:32:38 -0800 Subject: [PATCH 34/47] Avoid `_Bitmap` sub-namespace, extract details. --- stl/src/vector_algorithms.cpp | 70 +++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index c7d2d3c1f4..123e652f30 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2930,8 +2930,8 @@ namespace { return _Result; } - namespace __std_find_meow_of::_Bitmap { #ifndef _M_ARM64EC + namespace __std_find_meow_of_bitmap_details { __m256i _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { const __m256i _Data_high = _mm256_srli_epi32(_Data, 5); const __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); @@ -3018,7 +3018,11 @@ namespace { return _Make_bitmap_large(_Needle_ptr, _Needle_length); } } + } // namespace __std_find_meow_of_bitmap_details +#endif // !_M_ARM64EC + namespace __std_find_meow_of_bitmap { +#ifndef _M_ARM64EC template bool _Use_bitmap_avx(const size_t _Count1, const size_t _Count2) noexcept { if constexpr (sizeof(_Ty) == 1) { @@ -3150,6 +3154,8 @@ namespace { template size_t _Impl_first_avx(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { + using namespace __std_find_meow_of_bitmap_details; + const auto _Haystack_ptr = static_cast(_Haystack); const auto _Needle_ptr = static_cast(_Needle); @@ -3184,6 +3190,8 @@ namespace { template size_t _Impl_last_avx(const void* const _Haystack, size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { + using namespace __std_find_meow_of_bitmap_details; + const auto _Haystack_ptr = static_cast(_Haystack); const auto _Needle_ptr = static_cast(_Needle); @@ -3297,7 +3305,7 @@ namespace { return static_cast(-1); } - } // namespace __std_find_meow_of::_Bitmap + } // namespace __std_find_meow_of_bitmap namespace __std_find_first_of { template @@ -3681,19 +3689,19 @@ namespace { template size_t _Dispatch_pos_sse_1_2( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { - namespace _Bitmap = __std_find_meow_of::_Bitmap; + using namespace __std_find_meow_of_bitmap; if (_Use_avx2()) { - if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) - && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + if (_Use_bitmap_avx<_Ty>(_Count2, _Count1) + && _Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); } } else { - if (_Bitmap::_Use_bitmap_sse<_Ty>(_Count2, _Count1) - && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - _Bitmap::_Scalar_table_t _Table = {}; - _Bitmap::_Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); - return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + if (_Use_bitmap_sse<_Ty>(_Count2, _Count1) + && _Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + _Scalar_table_t _Table = {}; + _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); + return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } } @@ -3708,11 +3716,11 @@ namespace { template size_t _Dispatch_pos_avx_4_8( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { - namespace _Bitmap = __std_find_meow_of::_Bitmap; + using namespace __std_find_meow_of_bitmap; - if (_Bitmap::_Use_bitmap_avx<_Ty>(_Count2, _Count1) - && _Bitmap::_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - return _Bitmap::_Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + if (_Use_bitmap_avx<_Ty>(_Count2, _Count1) + && _Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); } const void* const _Last1 = static_cast(_First1) + _Count1; @@ -3727,11 +3735,11 @@ namespace { template size_t _Dispatch_pos_fallback( const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { - namespace _Bitmap = __std_find_meow_of::_Bitmap; + using namespace __std_find_meow_of_bitmap; - _Bitmap::_Scalar_table_t _Table = {}; - if (_Bitmap::_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { - return _Bitmap::_Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + _Scalar_table_t _Table = {}; + if (_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { + return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } const void* const _Last1 = static_cast(_First1) + _Count1; @@ -3908,21 +3916,21 @@ namespace { template size_t _Dispatch_pos(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - namespace _Bitmap = __std_find_meow_of::_Bitmap; + using namespace __std_find_meow_of_bitmap; #ifndef _M_ARM64EC if (_Use_sse42()) { if (_Use_avx2()) { - if (_Bitmap::_Use_bitmap_avx<_Ty>(_Haystack_length, _Needle_length) - && _Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { - return _Bitmap::_Impl_last_avx<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); + if (_Use_bitmap_avx<_Ty>(_Haystack_length, _Needle_length) + && _Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { + return _Impl_last_avx<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); } } else { - if (_Bitmap::_Use_bitmap_sse<_Ty>(_Haystack_length, _Needle_length) - && _Bitmap::_Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { - _Bitmap::_Scalar_table_t _Table = {}; - _Bitmap::_Build_scalar_table_no_check<_Ty>(_Needle, _Needle_length, _Table); - return _Bitmap::_Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); + if (_Use_bitmap_sse<_Ty>(_Haystack_length, _Needle_length) + && _Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { + _Scalar_table_t _Table = {}; + _Build_scalar_table_no_check<_Ty>(_Needle, _Needle_length, _Table); + return _Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); } } @@ -3930,9 +3938,9 @@ namespace { } else #endif // !_M_ARM64EC { - _Bitmap::_Scalar_table_t _Table = {}; - if (_Bitmap::_Build_scalar_table<_Ty>(_Needle, _Needle_length, _Table)) { - return _Bitmap::_Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); + _Scalar_table_t _Table = {}; + if (_Build_scalar_table<_Ty>(_Needle, _Needle_length, _Table)) { + return _Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); } return _Fallback<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); From 60637a1e43e97aa52c0d813e46af2e7991872a7b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 06:37:11 -0800 Subject: [PATCH 35/47] Rename to `_Bitmap_step`. --- stl/src/vector_algorithms.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 123e652f30..d52bffe4c3 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2932,7 +2932,7 @@ namespace { #ifndef _M_ARM64EC namespace __std_find_meow_of_bitmap_details { - __m256i _Step(const __m256i _Bitmap, const __m256i _Data) noexcept { + __m256i _Bitmap_step(const __m256i _Bitmap, const __m256i _Data) noexcept { const __m256i _Data_high = _mm256_srli_epi32(_Data, 5); const __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); const __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); @@ -3164,7 +3164,7 @@ namespace { const size_t _Haystack_length_vec = _Haystack_length & ~size_t{7}; for (size_t _Ix = 0; _Ix != _Haystack_length_vec; _Ix += 8) { const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); if (_Bingo != 0) { return _Ix + _tzcnt_u32(_Bingo); @@ -3177,7 +3177,7 @@ namespace { _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail * sizeof(_Ty)); const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return _Haystack_length_vec + _tzcnt_u32(_Bingo); @@ -3200,7 +3200,7 @@ namespace { while (_Haystack_length >= 8) { _Haystack_length -= 8; const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); if (_Bingo != 0) { return _Haystack_length + 31 - _lzcnt_u32(_Bingo); @@ -3213,7 +3213,7 @@ namespace { _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr, _Haystack_length_tail * sizeof(_Ty)); const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return 31 - _lzcnt_u32(_Bingo); From 07b04b7ec97e78946cafd115430ed51ccb0c9bb4 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 06:38:02 -0800 Subject: [PATCH 36/47] Fix typo: `_Mask_out_oveflow` => `_Mask_out_overflow` --- stl/src/vector_algorithms.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index d52bffe4c3..1cfcc9d77b 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2959,7 +2959,7 @@ namespace { } template - __m256i _Mask_out_oveflow(const __m256i _Mask, const __m256i _Data) noexcept { + __m256i _Mask_out_overflow(const __m256i _Mask, const __m256i _Data) noexcept { if constexpr (sizeof(_Ty) == 1) { return _Mask; } else { @@ -3164,7 +3164,7 @@ namespace { const size_t _Haystack_length_vec = _Haystack_length & ~size_t{7}; for (size_t _Ix = 0; _Ix != _Haystack_length_vec; _Ix += 8) { const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); if (_Bingo != 0) { return _Ix + _tzcnt_u32(_Bingo); @@ -3177,7 +3177,7 @@ namespace { _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail * sizeof(_Ty)); const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return _Haystack_length_vec + _tzcnt_u32(_Bingo); @@ -3200,7 +3200,7 @@ namespace { while (_Haystack_length >= 8) { _Haystack_length -= 8; const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); if (_Bingo != 0) { return _Haystack_length + 31 - _lzcnt_u32(_Bingo); @@ -3213,7 +3213,7 @@ namespace { _Ty _Buf[8]; memcpy(_Buf, _Haystack_ptr, _Haystack_length_tail * sizeof(_Ty)); const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_oveflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return 31 - _lzcnt_u32(_Bingo); From 8b0380da6b24171ffbb9cb6d6904180b999c4d0d Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 9 Nov 2024 06:39:04 -0800 Subject: [PATCH 37/47] Drop spurious spaces in preprocessor comments. --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1cfcc9d77b..ecfc8ed7b3 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3222,7 +3222,7 @@ namespace { return static_cast(-1); } -#endif // ! _M_ARM64EC +#endif // !_M_ARM64EC using _Scalar_table_t = bool[256]; @@ -3258,7 +3258,7 @@ namespace { _Table[*_Ptr] = true; } } -#endif // ! _M_ARM64EC +#endif // !_M_ARM64EC template size_t _Impl_first_scalar( From cb766bd503f5aedda753e02a28af96cf495a4c3d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 9 Nov 2024 17:54:48 +0200 Subject: [PATCH 38/47] AVX2 vpermq form --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ecfc8ed7b3..1e26b0c003 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2952,7 +2952,7 @@ namespace { const __m256i _Low = _mm256_loadu_si256(reinterpret_cast(_Src)); const __m256i _High = _mm256_loadu_si256(reinterpret_cast(_Src) + 1); const __m256i _Pack = _mm256_packs_epi32(_Low, _High); - return _mm256_permutex_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); + return _mm256_permute4x64_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); } else { static_assert(false, "Unexpected size"); } From 3fc4cbd6cfa46cff0704623ac381885a71ec4ee9 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 9 Nov 2024 17:59:26 +0200 Subject: [PATCH 39/47] vzeroupper guards --- stl/src/vector_algorithms.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1e26b0c003..5483f74471 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3156,6 +3156,8 @@ namespace { const size_t _Needle_length) noexcept { using namespace __std_find_meow_of_bitmap_details; + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + const auto _Haystack_ptr = static_cast(_Haystack); const auto _Needle_ptr = static_cast(_Needle); @@ -3192,6 +3194,8 @@ namespace { const size_t _Needle_length) noexcept { using namespace __std_find_meow_of_bitmap_details; + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + const auto _Haystack_ptr = static_cast(_Haystack); const auto _Needle_ptr = static_cast(_Needle); From 9d37e25cc1171479f87dcf437de8dc7145a4977d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 10 Nov 2024 09:48:19 +0200 Subject: [PATCH 40/47] AVX2 masks for bitmap algorithm --- stl/src/vector_algorithms.cpp | 38 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 5483f74471..ec84e3a552 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2958,6 +2958,30 @@ namespace { } } + template + __m256i _Load_avx_256_8_last(const _Ty* const _Src, const size_t _Count) noexcept { + if constexpr (sizeof(_Ty) == 1) { + uint8_t _Buf[8]; + memcpy(_Buf, _Src, _Count); + return _mm256_cvtepu8_epi32(_mm_loadu_si64(_Buf)); + } else if constexpr (sizeof(_Ty) == 2) { + uint8_t _Buf[16]; + memcpy(_Buf, _Src, _Count * 2); + return _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast(_Buf))); + } else if constexpr (sizeof(_Ty) == 4) { + return _mm256_maskload_epi32(reinterpret_cast(_Src), _Avx2_tail_mask_32(_Count)); + } else if constexpr (sizeof(_Ty) == 8) { + const __m256i _Mask_low = _Avx2_tail_mask_32(((_Count > 4) ? 4 : _Count) << 1); + const __m256i _Low = _mm256_maskload_epi32(reinterpret_cast(_Src) + 0, _Mask_low); + const __m256i _Mask_high = _Avx2_tail_mask_32(((_Count > 4) ? _Count - 4 : 0) << 1); + const __m256i _High = _mm256_maskload_epi32(reinterpret_cast(_Src) + 8, _Mask_high); + const __m256i _Pack = _mm256_packs_epi32(_Low, _High); + return _mm256_permute4x64_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); + } else { + static_assert(false, "Unexpected size"); + } + } + template __m256i _Mask_out_overflow(const __m256i _Mask, const __m256i _Data) noexcept { if constexpr (sizeof(_Ty) == 1) { @@ -3176,10 +3200,8 @@ namespace { const size_t _Haystack_length_tail = _Haystack_length & 7; if (_Haystack_length_tail != 0) { const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; - _Ty _Buf[8]; - memcpy(_Buf, _Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail * sizeof(_Ty)); - const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const __m256i _Data = _Load_avx_256_8_last(_Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return _Haystack_length_vec + _tzcnt_u32(_Bingo); @@ -3214,11 +3236,9 @@ namespace { const size_t _Haystack_length_tail = _Haystack_length & 7; if (_Haystack_length_tail != 0) { const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; - _Ty _Buf[8]; - memcpy(_Buf, _Haystack_ptr, _Haystack_length_tail * sizeof(_Ty)); - const __m256i _Data = _Load_avx_256_8(_Buf); - const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); - const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + const __m256i _Data = _Load_avx_256_8_last(_Haystack_ptr, _Haystack_length_tail); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; if (_Bingo != 0) { return 31 - _lzcnt_u32(_Bingo); } From 4961641e20316d4ef8ba5c00bfd71c8bb4695d98 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 23 Nov 2024 15:19:24 +0200 Subject: [PATCH 41/47] restore strategy. set avx bitmap threshold also align scalar bitmap for faster `memset` --- stl/src/vector_algorithms.cpp | 95 ++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 29 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ec84e3a552..62051814e1 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3105,7 +3105,7 @@ namespace { } template - bool _Use_bitmap_sse(const size_t _Count1, const size_t _Count2) noexcept { + bool _Use_bitmap_scalar(const size_t _Count1, const size_t _Count2) noexcept { if constexpr (sizeof(_Ty) == 1) { if (_Count2 <= 32) { return false; @@ -3128,11 +3128,40 @@ namespace { } else { return _Count1 > 8; } + } else if constexpr (sizeof(_Ty) == 4) { + if (_Count2 <= 32) { + return false; + } else if (_Count2 <= 112) { + return _Count1 > 16; + } else { + return _Count1 > 8; + } + } else if constexpr (sizeof(_Ty) == 8) { + if (_Count2 <= 16) { + return false; + } else if (_Count2 <= 32) { + return _Count1 > 16; + } else if (_Count2 <= 112) { + return _Count1 > 8; + } else { + return _Count1 > 4; + } } else { static_assert(false, "unexpected size"); } } + enum class _Strategy { _No_bitmap, _Scalar_bitmap, _Vector_bitmap }; + + template + _Strategy _Pick_strategy(const size_t _Count1, const size_t _Count2, const bool _Use_avx2_) noexcept { + if (_Use_avx2_ && _Count1 > 48) { + return _Use_bitmap_avx<_Ty>(_Count1, _Count2) ? _Strategy::_Vector_bitmap : _Strategy::_No_bitmap; + } else { + return _Use_bitmap_scalar<_Ty>(_Count1, _Count2) ? _Strategy::_Scalar_bitmap : _Strategy::_No_bitmap; + } + } + template bool _Can_fit_256_bits_sse(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { if constexpr (sizeof(_Ty) == 1) { @@ -3715,15 +3744,15 @@ namespace { const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { using namespace __std_find_meow_of_bitmap; - if (_Use_avx2()) { - if (_Use_bitmap_avx<_Ty>(_Count2, _Count1) - && _Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + const _Strategy _Strat = _Pick_strategy<_Ty>(_Count1, _Count2, _Use_avx2()); + + if (_Strat == _Strategy::_Vector_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); } - } else { - if (_Use_bitmap_sse<_Ty>(_Count2, _Count1) - && _Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - _Scalar_table_t _Table = {}; + } else if (_Strat == _Strategy::_Scalar_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + alignas(32) _Scalar_table_t _Table = {}; _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } @@ -3742,9 +3771,18 @@ namespace { const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { using namespace __std_find_meow_of_bitmap; - if (_Use_bitmap_avx<_Ty>(_Count2, _Count1) - && _Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { - return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + const auto _Strat = _Pick_strategy<_Ty>(_Count1, _Count2, true); + + if (_Strat == _Strategy::_Vector_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else if (_Strat == _Strategy::_Scalar_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + alignas(32) _Scalar_table_t _Table = {}; + _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); + return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + } } const void* const _Last1 = static_cast(_First1) + _Count1; @@ -3938,39 +3976,38 @@ namespace { #endif // !_M_ARM64EC template - size_t _Dispatch_pos(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, - const size_t _Needle_length) noexcept { + size_t _Dispatch_pos( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { using namespace __std_find_meow_of_bitmap; #ifndef _M_ARM64EC if (_Use_sse42()) { - if (_Use_avx2()) { - if (_Use_bitmap_avx<_Ty>(_Haystack_length, _Needle_length) - && _Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { - return _Impl_last_avx<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); + const auto _Strat = _Pick_strategy<_Ty>(_Count1, _Count2, _Use_avx2()); + + if (_Strat == _Strategy::_Vector_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Impl_last_avx<_Ty>(_First1, _Count1, _First2, _Count2); } - } else { - if (_Use_bitmap_sse<_Ty>(_Haystack_length, _Needle_length) - && _Can_fit_256_bits_sse(static_cast(_Needle), _Needle_length)) { - _Scalar_table_t _Table = {}; - _Build_scalar_table_no_check<_Ty>(_Needle, _Needle_length, _Table); - return _Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); + } else if (_Strat == _Strategy::_Scalar_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + alignas(32) _Scalar_table_t _Table = {}; + _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); + return _Impl_last_scalar<_Ty>(_First1, _Count1, _Table); } } - return _Impl<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); + return _Impl<_Ty>(_First1, _Count1, _First2, _Count2); } else #endif // !_M_ARM64EC { - _Scalar_table_t _Table = {}; - if (_Build_scalar_table<_Ty>(_Needle, _Needle_length, _Table)) { - return _Impl_last_scalar<_Ty>(_Haystack, _Haystack_length, _Table); + alignas(32) _Scalar_table_t _Table = {}; + if (_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { + return _Impl_last_scalar<_Ty>(_First1, _Count1, _Table); } - return _Fallback<_Ty>(_Haystack, _Haystack_length, _Needle, _Needle_length); + return _Fallback<_Ty>(_First1, _Count1, _First2, _Count2); } } - } // namespace __std_find_last_of template From ce583a56229245be97109aacd1569c132caf3080 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 2 Dec 2024 10:53:00 -0800 Subject: [PATCH 42/47] Comment `_Threshold_find_first_of` usage in `_Traits_find_last_of` --- stl/inc/__msvc_string_view.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/__msvc_string_view.hpp b/stl/inc/__msvc_string_view.hpp index 53ea1fcb59..5708127c34 100644 --- a/stl/inc/__msvc_string_view.hpp +++ b/stl/inc/__msvc_string_view.hpp @@ -896,7 +896,7 @@ constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t< if constexpr (sizeof(_Elem) <= 2) { if (!_STD _Is_constant_evaluated()) { const size_t _Remaining_size = _Hay_start + 1; - if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { + if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { // same threshold for first/last return _Find_last_of_pos_vectorized(_Haystack, _Remaining_size, _Needle, _Needle_size); } } From 4021da4465cd1c59c05f7ed223a0d3af70743e8b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 2 Dec 2024 11:08:12 -0800 Subject: [PATCH 43/47] Drop unnecessary parens. --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 62051814e1..1d99d24ea8 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2971,9 +2971,9 @@ namespace { } else if constexpr (sizeof(_Ty) == 4) { return _mm256_maskload_epi32(reinterpret_cast(_Src), _Avx2_tail_mask_32(_Count)); } else if constexpr (sizeof(_Ty) == 8) { - const __m256i _Mask_low = _Avx2_tail_mask_32(((_Count > 4) ? 4 : _Count) << 1); + const __m256i _Mask_low = _Avx2_tail_mask_32((_Count > 4 ? 4 : _Count) << 1); const __m256i _Low = _mm256_maskload_epi32(reinterpret_cast(_Src) + 0, _Mask_low); - const __m256i _Mask_high = _Avx2_tail_mask_32(((_Count > 4) ? _Count - 4 : 0) << 1); + const __m256i _Mask_high = _Avx2_tail_mask_32((_Count > 4 ? _Count - 4 : 0) << 1); const __m256i _High = _mm256_maskload_epi32(reinterpret_cast(_Src) + 8, _Mask_high); const __m256i _Pack = _mm256_packs_epi32(_Low, _High); return _mm256_permute4x64_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); From 6e186505b1b5df4c401acc2974fd00bba98b5644 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 2 Dec 2024 11:23:42 -0800 Subject: [PATCH 44/47] Use for-loops for `_Needle_ptr` iteration. --- stl/src/vector_algorithms.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1d99d24ea8..22ab7ecde0 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3182,22 +3182,18 @@ namespace { const void* _Stop = _Needle_ptr; _Advance_bytes(_Stop, _Byte_size & ~size_t{0x1F}); - while (_Needle_ptr != _Stop) { + for (; _Needle_ptr != _Stop; _Needle_ptr += 32 / sizeof(_Ty)) { const __m128i _Data = _mm_loadu_si128(reinterpret_cast(_Needle_ptr)); if (!_mm_testz_si128(_Mask, _Data)) { return false; } - - _Needle_ptr += 32 / sizeof(_Ty); } _Advance_bytes(_Stop, _Byte_size & 0x1E); - while (_Needle_ptr != _Stop) { + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { if ((*_Needle_ptr & ~_Ty{0xFF}) != 0) { return false; } - - ++_Needle_ptr; } return true; From 6ad0f530a7fc268bc4da41a2b39c462e9c2c49de Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 2 Dec 2024 11:35:11 -0800 Subject: [PATCH 45/47] Drop unnecessary newline. --- stl/src/vector_algorithms.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 22ab7ecde0..1400adc7ed 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3787,7 +3787,6 @@ namespace { return _Pos_from_ptr<_Ty>(_Impl_4_8<_Ty>(_First1, _Size_bytes_1, _First2, _Size_bytes_2), _First1, _Last1); } - #endif // !_M_ARM64EC template From 06612c27944e040b16f6df20351fe70676a0b53a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 2 Dec 2024 11:51:26 -0800 Subject: [PATCH 46/47] `_Traits` => `_Find_first_of_traits` to avoid monstrous shadowing --- stl/src/vector_algorithms.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1400adc7ed..de8eec5a04 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3497,10 +3497,10 @@ namespace { #endif // !_M_ARM64EC template - struct _Traits; + struct _Find_first_of_traits; template <> - struct _Traits : _Find_traits_4 { + struct _Find_first_of_traits : _Find_traits_4 { #ifndef _M_ARM64EC template static __m256i _Spread_avx(__m256i _Val, const size_t _Needle_length_el) noexcept { @@ -3546,7 +3546,7 @@ namespace { }; template <> - struct _Traits : _Find_traits_8 { + struct _Find_first_of_traits : _Find_traits_8 { #ifndef _M_ARM64EC template static __m256i _Spread_avx(const __m256i _Val, const size_t _Needle_length_el) noexcept { @@ -3613,7 +3613,7 @@ namespace { template const void* _Shuffle_impl(const void* _First1, const size_t _Haystack_length, const void* const _First2, const void* const _Stop2, const size_t _Last2_length_el) noexcept { - using _Traits = _Traits<_Ty>; + using _Traits = _Find_first_of_traits<_Ty>; constexpr size_t _Length_el = 32 / sizeof(_Ty); const __m256i _Last2val = _mm256_maskload_epi32( From a2ee6967b79d98712cecb97c2db080e3c0309c5f Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 5 Dec 2024 07:18:17 +0200 Subject: [PATCH 47/47] lift --- stl/inc/algorithm | 30 ++++++++++++++++++++++++++++++ stl/inc/xutility | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 5ea4e3db6f..ef3fa2e1f5 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -38,6 +38,15 @@ extern "C" { // functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the // compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to // unanalyzable routines may modify those arrays. +const void* __stdcall __std_find_first_of_trivial_1( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; +const void* __stdcall __std_find_first_of_trivial_2( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; +const void* __stdcall __std_find_first_of_trivial_4( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; +const void* __stdcall __std_find_first_of_trivial_8( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; + __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_1( const void* _First, const void* _Last, void* _Dest) noexcept; __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_2( @@ -73,6 +82,27 @@ __declspec(noalias) void __stdcall __std_replace_8( } // extern "C" _STD_BEGIN +template +_Ty1* _Find_first_of_vectorized( + _Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, _Ty2* const _Last2) noexcept { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); + if constexpr (sizeof(_Ty1) == 1) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_1(_First1, _Last1, _First2, _Last2))); + } else if constexpr (sizeof(_Ty1) == 2) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_2(_First1, _Last1, _First2, _Last2))); + } else if constexpr (sizeof(_Ty1) == 4) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_4(_First1, _Last1, _First2, _Last2))); + } else if constexpr (sizeof(_Ty1) == 8) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_8(_First1, _Last1, _First2, _Last2))); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size + } +} + template __declspec(noalias) void _Reverse_copy_vectorized(const void* _First, const void* _Last, void* _Dest) noexcept { if constexpr (_Nx == 1) { diff --git a/stl/inc/xutility b/stl/inc/xutility index 77c8619911..f1f33652ce 100644 --- a/stl/inc/xutility +++ b/stl/inc/xutility @@ -98,15 +98,6 @@ const void* __stdcall __std_find_last_trivial_2(const void* _First, const void* const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept; const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept; -const void* __stdcall __std_find_first_of_trivial_1( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; -const void* __stdcall __std_find_first_of_trivial_2( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; -const void* __stdcall __std_find_first_of_trivial_4( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; -const void* __stdcall __std_find_first_of_trivial_8( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; - const void* __stdcall __std_search_1( const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept; const void* __stdcall __std_search_2( @@ -252,27 +243,6 @@ _Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val // find_first_of vectorization is likely to be a win after this size (in elements) _INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16; -template -_Ty1* _Find_first_of_vectorized( - _Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, _Ty2* const _Last2) noexcept { - _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); - if constexpr (sizeof(_Ty1) == 1) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_1(_First1, _Last1, _First2, _Last2))); - } else if constexpr (sizeof(_Ty1) == 2) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_2(_First1, _Last1, _First2, _Last2))); - } else if constexpr (sizeof(_Ty1) == 4) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_4(_First1, _Last1, _First2, _Last2))); - } else if constexpr (sizeof(_Ty1) == 8) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_8(_First1, _Last1, _First2, _Last2))); - } else { - _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size - } -} - template _Ty1* _Search_vectorized(_Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, const size_t _Count2) noexcept { _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2));