diff --git a/benchmarks/src/find_first_of.cpp b/benchmarks/src/find_first_of.cpp index b81e94f6ed..41b2089e4c 100644 --- a/benchmarks/src/find_first_of.cpp +++ b/benchmarks/src/find_first_of.cpp @@ -13,6 +13,8 @@ #include #include +#include "skewed_allocator.hpp" + using namespace std; enum class AlgType { std_func, str_member_first, str_member_last }; @@ -24,7 +26,8 @@ void bm(benchmark::State& state) { const size_t HSize = Pos * 2; const size_t Which = 0; - using container = conditional_t, basic_string>; + using container = conditional_t>, + basic_string, not_highly_aligned_allocator>>; constexpr T HaystackFiller{' '}; static_assert(HaystackFiller < Start, "The following iota() should not produce the haystack filler."); @@ -59,8 +62,9 @@ void bm(benchmark::State& state) { } void common_args(auto bm) { - bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2})->Args({102, 4}); - bm->Args({325, 1})->Args({400, 50})->Args({1011, 11})->Args({1502, 23})->Args({3056, 7}); + bm->Args({2, 3})->Args({6, 81})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2}); + bm->Args({75, 85})->Args({102, 4})->Args({200, 46})->Args({325, 1})->Args({400, 50}); + bm->Args({1011, 11})->Args({1280, 46})->Args({1502, 23})->Args({2203, 54})->Args({3056, 7}); } BENCHMARK(bm)->Apply(common_args); diff --git a/stl/inc/__msvc_string_view.hpp b/stl/inc/__msvc_string_view.hpp index aeae9bd2c0..5708127c34 100644 --- a/stl/inc/__msvc_string_view.hpp +++ b/stl/inc/__msvc_string_view.hpp @@ -29,6 +29,15 @@ extern "C" { // compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to // unanalyzable routines may modify those arrays. +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; + __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1( const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept; __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2( @@ -38,6 +47,23 @@ __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2( _STD_BEGIN +template +size_t _Find_first_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length, + const _Ty2* const _Needle, const size_t _Needle_length) noexcept { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); + if constexpr (sizeof(_Ty1) == 1) { + return ::__std_find_first_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 2) { + return ::__std_find_first_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 4) { + return ::__std_find_first_of_trivial_pos_4(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else if constexpr (sizeof(_Ty1) == 8) { + return ::__std_find_first_of_trivial_pos_8(_Haystack, _Haystack_length, _Needle, _Needle_length); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size + } +} + template size_t _Find_last_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length, const _Ty2* const _Needle, const size_t _Needle_length) noexcept { @@ -817,48 +843,31 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t const auto _Hay_end = _Haystack + _Hay_size; if constexpr (_Is_implementation_handled_char_traits<_Traits>) { - if (!_STD _Is_constant_evaluated()) { - using _Elem = typename _Traits::char_type; - #if _USE_STD_VECTOR_ALGORITHMS - const bool _Try_vectorize = _Hay_size - _Start_at > _Threshold_find_first_of; - - // Additional condition for when the vectorization outperforms the table lookup - constexpr size_t _Find_first_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : sizeof(_Elem) == 8 ? 8 : 16; - - const bool _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_first_of_bitmap_threshold; -#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv - const bool _Use_bitmap = true; -#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^ - - if (_Use_bitmap) { - _String_bitmap<_Elem> _Matches; - - if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { - for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) { - if (_Matches._Match(*_Match_try)) { - return static_cast(_Match_try - _Haystack); // found a match - } - } - return static_cast(-1); // no match + if (!_STD _Is_constant_evaluated()) { + const size_t _Remaining_size = _Hay_size - _Start_at; + if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { + size_t _Pos = _Find_first_of_pos_vectorized(_Hay_start, _Remaining_size, _Needle, _Needle_size); + if (_Pos != static_cast(-1)) { + _Pos += _Start_at; } - - // couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms + return _Pos; } + } +#endif // _USE_STD_VECTOR_ALGORITHMS -#if _USE_STD_VECTOR_ALGORITHMS - if (_Try_vectorize) { - const _Traits_ptr_t<_Traits> _Found = - _STD _Find_first_of_vectorized(_Hay_start, _Hay_end, _Needle, _Needle + _Needle_size); - - if (_Found != _Hay_end) { - return static_cast(_Found - _Haystack); // found a match - } else { - return static_cast(-1); // no match + _String_bitmap _Matches; + + if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { + for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) { + if (_Matches._Match(*_Match_try)) { + return static_cast(_Match_try - _Haystack); // found a match } } -#endif // _USE_STD_VECTOR_ALGORITHMS + return static_cast(-1); // no match } + + // couldn't put one of the characters into the bitmap, fall back to serial algorithm } for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) { @@ -882,47 +891,32 @@ constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t< const auto _Hay_start = (_STD min)(_Start_at, _Hay_size - 1); if constexpr (_Is_implementation_handled_char_traits<_Traits>) { - if (!_STD _Is_constant_evaluated()) { - using _Elem = typename _Traits::char_type; - - bool _Use_bitmap = true; + using _Elem = typename _Traits::char_type; #if _USE_STD_VECTOR_ALGORITHMS - bool _Try_vectorize = false; - - if constexpr (sizeof(_Elem) <= 2) { - _Try_vectorize = _Hay_start + 1 > _Threshold_find_first_of; - // Additional condition for when the vectorization outperforms the table lookup - constexpr size_t _Find_last_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : 8; - - _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_last_of_bitmap_threshold; + if constexpr (sizeof(_Elem) <= 2) { + if (!_STD _Is_constant_evaluated()) { + const size_t _Remaining_size = _Hay_start + 1; + if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) { // same threshold for first/last + return _Find_last_of_pos_vectorized(_Haystack, _Remaining_size, _Needle, _Needle_size); + } } + } #endif // _USE_STD_VECTOR_ALGORITHMS - if (_Use_bitmap) { - _String_bitmap<_Elem> _Matches; - if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { - for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) { - if (_Matches._Match(*_Match_try)) { - return static_cast(_Match_try - _Haystack); // found a match - } - - if (_Match_try == _Haystack) { - return static_cast(-1); // at beginning, no more chance for match - } - } + _String_bitmap<_Elem> _Matches; + if (_Matches._Mark(_Needle, _Needle + _Needle_size)) { + for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) { + if (_Matches._Match(*_Match_try)) { + return static_cast(_Match_try - _Haystack); // found a match } - // couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms - } - -#if _USE_STD_VECTOR_ALGORITHMS - if constexpr (sizeof(_Elem) <= 2) { - if (_Try_vectorize) { - return _STD _Find_last_of_pos_vectorized(_Haystack, _Hay_start + 1, _Needle, _Needle_size); + if (_Match_try == _Haystack) { + return static_cast(-1); // at beginning, no more chance for match } } -#endif // _USE_STD_VECTOR_ALGORITHMS } + + // couldn't put one of the characters into the bitmap, fall back to serial algorithm } for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) { diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 5ea4e3db6f..ef3fa2e1f5 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -38,6 +38,15 @@ extern "C" { // functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the // compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to // unanalyzable routines may modify those arrays. +const void* __stdcall __std_find_first_of_trivial_1( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; +const void* __stdcall __std_find_first_of_trivial_2( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; +const void* __stdcall __std_find_first_of_trivial_4( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; +const void* __stdcall __std_find_first_of_trivial_8( + const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; + __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_1( const void* _First, const void* _Last, void* _Dest) noexcept; __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_2( @@ -73,6 +82,27 @@ __declspec(noalias) void __stdcall __std_replace_8( } // extern "C" _STD_BEGIN +template +_Ty1* _Find_first_of_vectorized( + _Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, _Ty2* const _Last2) noexcept { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); + if constexpr (sizeof(_Ty1) == 1) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_1(_First1, _Last1, _First2, _Last2))); + } else if constexpr (sizeof(_Ty1) == 2) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_2(_First1, _Last1, _First2, _Last2))); + } else if constexpr (sizeof(_Ty1) == 4) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_4(_First1, _Last1, _First2, _Last2))); + } else if constexpr (sizeof(_Ty1) == 8) { + return const_cast<_Ty1*>( + static_cast(::__std_find_first_of_trivial_8(_First1, _Last1, _First2, _Last2))); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size + } +} + template __declspec(noalias) void _Reverse_copy_vectorized(const void* _First, const void* _Last, void* _Dest) noexcept { if constexpr (_Nx == 1) { diff --git a/stl/inc/xutility b/stl/inc/xutility index 77c8619911..f1f33652ce 100644 --- a/stl/inc/xutility +++ b/stl/inc/xutility @@ -98,15 +98,6 @@ const void* __stdcall __std_find_last_trivial_2(const void* _First, const void* const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept; const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept; -const void* __stdcall __std_find_first_of_trivial_1( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; -const void* __stdcall __std_find_first_of_trivial_2( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; -const void* __stdcall __std_find_first_of_trivial_4( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; -const void* __stdcall __std_find_first_of_trivial_8( - const void* _First1, const void* _Last1, const void* _First2, const void* _Last2) noexcept; - const void* __stdcall __std_search_1( const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept; const void* __stdcall __std_search_2( @@ -252,27 +243,6 @@ _Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val // find_first_of vectorization is likely to be a win after this size (in elements) _INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16; -template -_Ty1* _Find_first_of_vectorized( - _Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, _Ty2* const _Last2) noexcept { - _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); - if constexpr (sizeof(_Ty1) == 1) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_1(_First1, _Last1, _First2, _Last2))); - } else if constexpr (sizeof(_Ty1) == 2) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_2(_First1, _Last1, _First2, _Last2))); - } else if constexpr (sizeof(_Ty1) == 4) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_4(_First1, _Last1, _First2, _Last2))); - } else if constexpr (sizeof(_Ty1) == 8) { - return const_cast<_Ty1*>( - static_cast(::__std_find_first_of_trivial_8(_First1, _Last1, _First2, _Last2))); - } else { - _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size - } -} - template _Ty1* _Search_vectorized(_Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, const size_t _Count2) noexcept { _STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2)); diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e89a0fba91..de8eec5a04 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -2930,10 +2930,435 @@ namespace { return _Result; } - namespace __std_find_first_of { +#ifndef _M_ARM64EC + namespace __std_find_meow_of_bitmap_details { + __m256i _Bitmap_step(const __m256i _Bitmap, const __m256i _Data) noexcept { + const __m256i _Data_high = _mm256_srli_epi32(_Data, 5); + const __m256i _Bitmap_parts = _mm256_permutevar8x32_epi32(_Bitmap, _Data_high); + const __m256i _Data_low_inv = _mm256_andnot_si256(_Data, _mm256_set1_epi32(0x1F)); + const __m256i _Mask = _mm256_sllv_epi32(_Bitmap_parts, _Data_low_inv); + return _Mask; + } + + template + __m256i _Load_avx_256_8(const _Ty* const _Src) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return _mm256_cvtepu8_epi32(_mm_loadu_si64(_Src)); + } else if constexpr (sizeof(_Ty) == 2) { + return _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast(_Src))); + } else if constexpr (sizeof(_Ty) == 4) { + return _mm256_loadu_si256(reinterpret_cast(_Src)); + } else if constexpr (sizeof(_Ty) == 8) { + const __m256i _Low = _mm256_loadu_si256(reinterpret_cast(_Src)); + const __m256i _High = _mm256_loadu_si256(reinterpret_cast(_Src) + 1); + const __m256i _Pack = _mm256_packs_epi32(_Low, _High); + return _mm256_permute4x64_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); + } else { + static_assert(false, "Unexpected size"); + } + } + + template + __m256i _Load_avx_256_8_last(const _Ty* const _Src, const size_t _Count) noexcept { + if constexpr (sizeof(_Ty) == 1) { + uint8_t _Buf[8]; + memcpy(_Buf, _Src, _Count); + return _mm256_cvtepu8_epi32(_mm_loadu_si64(_Buf)); + } else if constexpr (sizeof(_Ty) == 2) { + uint8_t _Buf[16]; + memcpy(_Buf, _Src, _Count * 2); + return _mm256_cvtepu16_epi32(_mm_loadu_si128(reinterpret_cast(_Buf))); + } else if constexpr (sizeof(_Ty) == 4) { + return _mm256_maskload_epi32(reinterpret_cast(_Src), _Avx2_tail_mask_32(_Count)); + } else if constexpr (sizeof(_Ty) == 8) { + const __m256i _Mask_low = _Avx2_tail_mask_32((_Count > 4 ? 4 : _Count) << 1); + const __m256i _Low = _mm256_maskload_epi32(reinterpret_cast(_Src) + 0, _Mask_low); + const __m256i _Mask_high = _Avx2_tail_mask_32((_Count > 4 ? _Count - 4 : 0) << 1); + const __m256i _High = _mm256_maskload_epi32(reinterpret_cast(_Src) + 8, _Mask_high); + const __m256i _Pack = _mm256_packs_epi32(_Low, _High); + return _mm256_permute4x64_epi64(_Pack, _MM_SHUFFLE(3, 1, 2, 0)); + } else { + static_assert(false, "Unexpected size"); + } + } + + template + __m256i _Mask_out_overflow(const __m256i _Mask, const __m256i _Data) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return _Mask; + } else { + const __m256i _Data_high = _mm256_and_si256(_Data, _mm256_set1_epi32(static_cast(0xFFFF'FF00))); + const __m256i _Fit_mask = _mm256_cmpeq_epi32(_Data_high, _mm256_setzero_si256()); + return _mm256_and_si256(_Mask, _Fit_mask); + } + } + + template + __m256i _Make_bitmap_small(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + __m256i _Bitmap = _mm256_setzero_si256(); + + const _Ty* const _Stop = _Needle_ptr + _Needle_length; + + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { + const _Ty _Val = *_Needle_ptr; + const __m128i _Count_low = _mm_cvtsi32_si128(_Val & 0x3F); + const auto _Count_high_x8 = static_cast((_Val >> 3) & 0x18); + const __m256i _One_1_high = _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(1u << _Count_high_x8)); + const __m256i _One_1 = _mm256_sll_epi64(_One_1_high, _Count_low); + _Bitmap = _mm256_or_si256(_Bitmap, _One_1); + } + + return _Bitmap; + } + + template + __m256i _Make_bitmap_large(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + alignas(32) uint8_t _Table[256] = {}; + + const _Ty* const _Stop = _Needle_ptr + _Needle_length; + + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { + _Table[*_Needle_ptr] = 0xFF; + } + + const auto _Table_as_avx = reinterpret_cast(_Table); + + return _mm256_setr_epi32( // + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 0)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 1)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 2)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 3)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 4)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 5)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 6)), + _mm256_movemask_epi8(_mm256_load_si256(_Table_as_avx + 7))); + } + + template + __m256i _Make_bitmap(const _Ty* const _Needle_ptr, const size_t _Needle_length) noexcept { + if (_Needle_length <= 20) { + return _Make_bitmap_small(_Needle_ptr, _Needle_length); + } else { + return _Make_bitmap_large(_Needle_ptr, _Needle_length); + } + } + } // namespace __std_find_meow_of_bitmap_details +#endif // !_M_ARM64EC + + namespace __std_find_meow_of_bitmap { +#ifndef _M_ARM64EC + template + bool _Use_bitmap_avx(const size_t _Count1, const size_t _Count2) noexcept { + if constexpr (sizeof(_Ty) == 1) { + if (_Count2 <= 16) { + return _Count1 > 1000; + } else if (_Count2 <= 48) { + return _Count1 > 80; + } else if (_Count2 <= 240) { + return _Count1 > 40; + } else if (_Count2 <= 1000) { + return _Count1 > 32; + } else { + return _Count1 > 16; + } + } else if constexpr (sizeof(_Ty) == 2) { + if (_Count2 <= 8) { + return _Count1 > 128; + } else if (_Count2 <= 48) { + return _Count1 > 32; + } else if (_Count2 <= 72) { + return _Count1 > 24; + } else if (_Count2 <= 144) { + return _Count1 > 16; + } else { + return _Count1 > 8; + } + } else if constexpr (sizeof(_Ty) == 4) { + if (_Count2 <= 8) { + return _Count1 > 64; + } else if (_Count2 <= 24) { + return _Count1 > 40; + } else if (_Count2 <= 44) { + return _Count1 > 24; + } else if (_Count2 <= 112) { + return _Count1 > 16; + } else { + return _Count1 > 8; + } + } else if constexpr (sizeof(_Ty) == 8) { + if (_Count2 <= 8) { + return _Count1 > 40; + } else if (_Count2 <= 12) { + return _Count1 > 20; + } else if (_Count2 <= 48) { + return _Count1 > 16; + } else if (_Count2 <= 64) { + return _Count1 > 12; + } else if (_Count2 <= 192) { + return _Count1 > 8; + } else { + return _Count1 > 4; + } + } else { + static_assert(false, "unexpected size"); + } + } + + template + bool _Use_bitmap_scalar(const size_t _Count1, const size_t _Count2) noexcept { + if constexpr (sizeof(_Ty) == 1) { + if (_Count2 <= 32) { + return false; + } else if (_Count2 <= 48) { + return _Count1 > 416; + } else if (_Count2 <= 64) { + return _Count1 > 224; + } else if (_Count2 <= 80) { + return _Count1 > 128; + } else if (_Count2 <= 540) { + return _Count1 > 48; + } else { + return _Count1 > 32; + } + } else if constexpr (sizeof(_Ty) == 2) { + if (_Count2 <= 8) { + return false; + } else if (_Count2 <= 80) { + return _Count1 > 16; + } else { + return _Count1 > 8; + } + } else if constexpr (sizeof(_Ty) == 4) { + if (_Count2 <= 32) { + return false; + } else if (_Count2 <= 112) { + return _Count1 > 16; + } else { + return _Count1 > 8; + } + } else if constexpr (sizeof(_Ty) == 8) { + if (_Count2 <= 16) { + return false; + } else if (_Count2 <= 32) { + return _Count1 > 16; + } else if (_Count2 <= 112) { + return _Count1 > 8; + } else { + return _Count1 > 4; + } + } else { + static_assert(false, "unexpected size"); + } + } + + enum class _Strategy { _No_bitmap, _Scalar_bitmap, _Vector_bitmap }; + + template + _Strategy _Pick_strategy(const size_t _Count1, const size_t _Count2, const bool _Use_avx2_) noexcept { + if (_Use_avx2_ && _Count1 > 48) { + return _Use_bitmap_avx<_Ty>(_Count1, _Count2) ? _Strategy::_Vector_bitmap : _Strategy::_No_bitmap; + } else { + return _Use_bitmap_scalar<_Ty>(_Count1, _Count2) ? _Strategy::_Scalar_bitmap : _Strategy::_No_bitmap; + } + } + + template + bool _Can_fit_256_bits_sse(const _Ty* _Needle_ptr, const size_t _Needle_length) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return true; + } else { + __m128i _Mask = _mm_undefined_si128(); + if constexpr (sizeof(_Ty) == 2) { + _Mask = _mm_set1_epi16(static_cast(0xFF00)); + } else if constexpr (sizeof(_Ty) == 4) { + _Mask = _mm_set1_epi32(static_cast(0xFFFF'FF00)); + } else if constexpr (sizeof(_Ty) == 8) { + _Mask = _mm_set1_epi64x(static_cast(0xFFFF'FFFF'FFFF'FF00)); + } else { + static_assert(false, "Unexpected size"); + } + + const size_t _Byte_size = _Needle_length * sizeof(_Ty); + + const void* _Stop = _Needle_ptr; + _Advance_bytes(_Stop, _Byte_size & ~size_t{0x1F}); + for (; _Needle_ptr != _Stop; _Needle_ptr += 32 / sizeof(_Ty)) { + const __m128i _Data = _mm_loadu_si128(reinterpret_cast(_Needle_ptr)); + if (!_mm_testz_si128(_Mask, _Data)) { + return false; + } + } + + _Advance_bytes(_Stop, _Byte_size & 0x1E); + for (; _Needle_ptr != _Stop; ++_Needle_ptr) { + if ((*_Needle_ptr & ~_Ty{0xFF}) != 0) { + return false; + } + } + + return true; + } + } + + template + size_t _Impl_first_avx(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, + const size_t _Needle_length) noexcept { + using namespace __std_find_meow_of_bitmap_details; + + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const auto _Haystack_ptr = static_cast(_Haystack); + const auto _Needle_ptr = static_cast(_Needle); + + const __m256i _Bitmap = _Make_bitmap(_Needle_ptr, _Needle_length); + + const size_t _Haystack_length_vec = _Haystack_length & ~size_t{7}; + for (size_t _Ix = 0; _Ix != _Haystack_length_vec; _Ix += 8) { + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Ix); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + if (_Bingo != 0) { + return _Ix + _tzcnt_u32(_Bingo); + } + } + + const size_t _Haystack_length_tail = _Haystack_length & 7; + if (_Haystack_length_tail != 0) { + const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; + const __m256i _Data = _Load_avx_256_8_last(_Haystack_ptr + _Haystack_length_vec, _Haystack_length_tail); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + if (_Bingo != 0) { + return _Haystack_length_vec + _tzcnt_u32(_Bingo); + } + } + return static_cast(-1); + } + + template + size_t _Impl_last_avx(const void* const _Haystack, size_t _Haystack_length, const void* const _Needle, + const size_t _Needle_length) noexcept { + using namespace __std_find_meow_of_bitmap_details; + + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const auto _Haystack_ptr = static_cast(_Haystack); + const auto _Needle_ptr = static_cast(_Needle); + + const __m256i _Bitmap = _Make_bitmap(_Needle_ptr, _Needle_length); + + while (_Haystack_length >= 8) { + _Haystack_length -= 8; + const __m256i _Data = _Load_avx_256_8(_Haystack_ptr + _Haystack_length); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + if (_Bingo != 0) { + return _Haystack_length + 31 - _lzcnt_u32(_Bingo); + } + } + + const size_t _Haystack_length_tail = _Haystack_length & 7; + if (_Haystack_length_tail != 0) { + const unsigned int _Tail_bingo_mask = (1 << _Haystack_length_tail) - 1; + const __m256i _Data = _Load_avx_256_8_last(_Haystack_ptr, _Haystack_length_tail); + const __m256i _Mask = _Mask_out_overflow<_Ty>(_Bitmap_step(_Bitmap, _Data), _Data); + const unsigned int _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)) & _Tail_bingo_mask; + if (_Bingo != 0) { + return 31 - _lzcnt_u32(_Bingo); + } + } + + return static_cast(-1); + } +#endif // !_M_ARM64EC + + using _Scalar_table_t = bool[256]; + + template + [[nodiscard]] bool _Build_scalar_table( + const void* const _Needle, const size_t _Needle_length, _Scalar_table_t& _Table) noexcept { + auto _Ptr = static_cast(_Needle); + const auto _End = _Ptr + _Needle_length; + + for (; _Ptr != _End; ++_Ptr) { + const _Ty _Val = *_Ptr; + + if constexpr (sizeof(_Val) > 1) { + if (_Val >= 256) { + return false; + } + } + + _Table[_Val] = true; + } + + return true; + } + +#ifndef _M_ARM64EC + template + void _Build_scalar_table_no_check( + const void* const _Needle, const size_t _Needle_length, _Scalar_table_t& _Table) noexcept { + auto _Ptr = static_cast(_Needle); + const auto _End = _Ptr + _Needle_length; + + for (; _Ptr != _End; ++_Ptr) { + _Table[*_Ptr] = true; + } + } +#endif // !_M_ARM64EC + + template + size_t _Impl_first_scalar( + const void* const _Haystack, const size_t _Haystack_length, const _Scalar_table_t& _Table) noexcept { + const auto _Haystack_ptr = static_cast(_Haystack); + + for (size_t _Ix = 0; _Ix != _Haystack_length; ++_Ix) { + const _Ty _Val = _Haystack_ptr[_Ix]; + + if constexpr (sizeof(_Val) > 1) { + if (_Val >= 256) { + continue; + } + } + + if (_Table[_Val]) { + return _Ix; + } + } + + return static_cast(-1); + } + + template + size_t _Impl_last_scalar( + const void* const _Haystack, size_t _Haystack_length, const _Scalar_table_t& _Table) noexcept { + const auto _Haystack_ptr = static_cast(_Haystack); + + while (_Haystack_length != 0) { + --_Haystack_length; + + const _Ty _Val = _Haystack_ptr[_Haystack_length]; + + if constexpr (sizeof(_Val) > 1) { + if (_Val >= 256) { + continue; + } + } + + if (_Table[_Val]) { + return _Haystack_length; + } + } + + return static_cast(-1); + } + } // namespace __std_find_meow_of_bitmap + + namespace __std_find_first_of { template - const void* __stdcall _Fallback(const void* _First1, const void* const _Last1, const void* const _First2, + const void* _Fallback(const void* _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { auto _Ptr_haystack = static_cast(_First1); const auto _Ptr_haystack_end = static_cast(_Last1); @@ -2951,42 +3376,39 @@ namespace { return _Ptr_haystack; } - template - const void* __stdcall _Impl_pcmpestri(const void* _First1, const void* const _Last1, const void* const _First2, - const void* const _Last2) noexcept { #ifndef _M_ARM64EC - if (_Use_sse42()) { - constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY - | _SIDD_LEAST_SIGNIFICANT; - constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; - const size_t _Needle_length = _Byte_length(_First2, _Last2); - - const size_t _Haystack_length = _Byte_length(_First1, _Last1); - const void* _Stop_at = _First1; - _Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF}); + template + const void* _Impl_pcmpestri(const void* _First1, const size_t _Haystack_length, const void* const _First2, + const size_t _Needle_length) noexcept { + constexpr int _Op = + (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; - if (_Needle_length <= 16) { - // Special handling of small needle - // The generic branch could also be modified to handle it, but with slightly worse performance + const void* _Stop_at = _First1; + _Advance_bytes(_Stop_at, _Haystack_length & ~size_t{0xF}); - const int _Needle_length_el = static_cast(_Needle_length / sizeof(_Ty)); + if (_Needle_length <= 16) { + // Special handling of small needle + // The generic branch could also be modified to handle it, but with slightly worse performance - alignas(16) uint8_t _Tmp2[16]; - memcpy(_Tmp2, _First2, _Needle_length); - const __m128i _Data2 = _mm_load_si128(reinterpret_cast(_Tmp2)); + const int _Needle_length_el = static_cast(_Needle_length / sizeof(_Ty)); - while (_First1 != _Stop_at) { - const __m128i _Data1 = _mm_loadu_si128(static_cast(_First1)); - if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op); - _Advance_bytes(_First1, _Pos * sizeof(_Ty)); - return _First1; - } + alignas(16) uint8_t _Tmp2[16]; + memcpy(_Tmp2, _First2, _Needle_length); + const __m128i _Data2 = _mm_load_si128(reinterpret_cast(_Tmp2)); - _Advance_bytes(_First1, 16); + while (_First1 != _Stop_at) { + const __m128i _Data1 = _mm_loadu_si128(static_cast(_First1)); + if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Part_size_el, _Op); + _Advance_bytes(_First1, _Pos * sizeof(_Ty)); + return _First1; } - const size_t _Last_part_size = _Haystack_length & 0xF; + _Advance_bytes(_First1, 16); + } + + if (const size_t _Last_part_size = _Haystack_length & 0xF; _Last_part_size != 0) { const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); alignas(16) uint8_t _Tmp1[16]; @@ -3000,60 +3422,62 @@ namespace { } _Advance_bytes(_First1, _Last_part_size); - return _First1; - } else { - const void* _Last_needle = _First2; - _Advance_bytes(_Last_needle, _Needle_length & ~size_t{0xF}); + } - const int _Last_needle_length = static_cast(_Needle_length & 0xF); + return _First1; + } else { + const void* _Last_needle = _First2; + _Advance_bytes(_Last_needle, _Needle_length & ~size_t{0xF}); - alignas(16) uint8_t _Tmp2[16]; - memcpy(_Tmp2, _Last_needle, _Last_needle_length); - const __m128i _Last_needle_val = _mm_load_si128(reinterpret_cast(_Tmp2)); - const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); + const int _Last_needle_length = static_cast(_Needle_length & 0xF); - constexpr int _Not_found = 16; // arbitrary value greater than any found value + alignas(16) uint8_t _Tmp2[16]; + memcpy(_Tmp2, _Last_needle, _Last_needle_length); + const __m128i _Last_needle_val = _mm_load_si128(reinterpret_cast(_Tmp2)); + const int _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); - int _Found_pos = _Not_found; + constexpr int _Not_found = 16; // arbitrary value greater than any found value - const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, - const int _Size1) noexcept { - if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { - const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); - if (_Pos < _Found_pos) { - _Found_pos = _Pos; - } + int _Found_pos = _Not_found; + + const auto _Step = [&_Found_pos](const __m128i _Data2, const int _Size2, const __m128i _Data1, + const int _Size1) noexcept { + if (_mm_cmpestrc(_Data2, _Size2, _Data1, _Size1, _Op)) { + const int _Pos = _mm_cmpestri(_Data2, _Size2, _Data1, _Size1, _Op); + if (_Pos < _Found_pos) { + _Found_pos = _Pos; } - }; + } + }; #pragma warning(push) #pragma warning(disable : 4324) // structure was padded due to alignment specifier - const auto _Test_whole_needle = [=](const __m128i _Data1, const int _Size1) noexcept { - const void* _Cur_needle = _First2; - do { - const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); - _Step(_Data2, _Part_size_el, _Data1, _Size1); - _Advance_bytes(_Cur_needle, 16); - } while (_Cur_needle != _Last_needle); - - if (_Last_needle_length_el != 0) { - _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); - } - }; -#pragma warning(pop) + const auto _Test_whole_needle = [=](const __m128i _Data1, const int _Size1) noexcept { + const void* _Cur_needle = _First2; + do { + const __m128i _Data2 = _mm_loadu_si128(static_cast(_Cur_needle)); + _Step(_Data2, _Part_size_el, _Data1, _Size1); + _Advance_bytes(_Cur_needle, 16); + } while (_Cur_needle != _Last_needle); - while (_First1 != _Stop_at) { - _Test_whole_needle(_mm_loadu_si128(static_cast(_First1)), _Part_size_el); + if (_Last_needle_length_el != 0) { + _Step(_Last_needle_val, _Last_needle_length_el, _Data1, _Size1); + } + }; +#pragma warning(pop) - if (_Found_pos != _Not_found) { - _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); - return _First1; - } + while (_First1 != _Stop_at) { + _Test_whole_needle(_mm_loadu_si128(static_cast(_First1)), _Part_size_el); - _Advance_bytes(_First1, 16); + if (_Found_pos != _Not_found) { + _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); + return _First1; } - const size_t _Last_part_size = _Haystack_length & 0xF; + _Advance_bytes(_First1, 16); + } + + if (const size_t _Last_part_size = _Haystack_length & 0xF; _Last_part_size != 0) { const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); alignas(16) uint8_t _Tmp1[16]; @@ -3065,15 +3489,18 @@ namespace { _Test_whole_needle(_Data1, _Last_part_size_el); _Advance_bytes(_First1, _Found_pos * sizeof(_Ty)); - return _First1; } + + return _First1; } -#endif // !_M_ARM64EC - return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2); } +#endif // !_M_ARM64EC + + template + struct _Find_first_of_traits; - struct _Traits_4 : _Find_traits_4 { - using _Ty = uint32_t; + template <> + struct _Find_first_of_traits : _Find_traits_4 { #ifndef _M_ARM64EC template static __m256i _Spread_avx(__m256i _Val, const size_t _Needle_length_el) noexcept { @@ -3118,8 +3545,8 @@ namespace { #endif // !_M_ARM64EC }; - struct _Traits_8 : _Find_traits_8 { - using _Ty = uint64_t; + template <> + struct _Find_first_of_traits : _Find_traits_8 { #ifndef _M_ARM64EC template static __m256i _Spread_avx(const __m256i _Val, const size_t _Needle_length_el) noexcept { @@ -3183,18 +3610,16 @@ namespace { return _Eq; } - template - const void* _Shuffle_impl(const void* _First1, const void* const _Last1, const void* const _First2, + template + const void* _Shuffle_impl(const void* _First1, const size_t _Haystack_length, const void* const _First2, const void* const _Stop2, const size_t _Last2_length_el) noexcept { - using _Ty = _Traits::_Ty; + using _Traits = _Find_first_of_traits<_Ty>; constexpr size_t _Length_el = 32 / sizeof(_Ty); const __m256i _Last2val = _mm256_maskload_epi32( reinterpret_cast(_Stop2), _Avx2_tail_mask_32(_Last2_length_el * (sizeof(_Ty) / 4))); const __m256i _Last2s0 = _Traits::_Spread_avx<_Last2_length_el_magnitude>(_Last2val, _Last2_length_el); - const size_t _Haystack_length = _Byte_length(_First1, _Last1); - const void* _Stop1 = _First1; _Advance_bytes(_Stop1, _Haystack_length & ~size_t{0x1F}); @@ -3240,61 +3665,192 @@ namespace { return _First1; } - template - const void* _Shuffle_impl_dispatch_magnitude(const void* const _First1, const void* const _Last1, + template + const void* _Shuffle_impl_dispatch_magnitude(const void* const _First1, const size_t _Haystack_length, const void* const _First2, const void* const _Stop2, const size_t _Last2_length_el) noexcept { if (_Last2_length_el == 0) { - return _Shuffle_impl<_Traits, _Large, 0>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 0>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el == 1) { - return _Shuffle_impl<_Traits, _Large, 1>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 1>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el == 2) { - return _Shuffle_impl<_Traits, _Large, 2>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 2>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el <= 4) { - return _Shuffle_impl<_Traits, _Large, 4>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + return _Shuffle_impl<_Ty, _Large, 4>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } else if (_Last2_length_el <= 8) { - if constexpr (sizeof(_Traits::_Ty) == 4) { - return _Shuffle_impl<_Traits, _Large, 8>(_First1, _Last1, _First2, _Stop2, _Last2_length_el); + if constexpr (sizeof(_Ty) == 4) { + return _Shuffle_impl<_Ty, _Large, 8>(_First1, _Haystack_length, _First2, _Stop2, _Last2_length_el); } } _STL_UNREACHABLE; } + template + const void* _Impl_4_8(const void* const _First1, const size_t _Haystack_length, const void* const _First2, + const size_t _Needle_length) noexcept { + _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + + const size_t _Last_needle_length = _Needle_length & 0x1F; + const size_t _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); + + if (const size_t _Needle_length_large = _Needle_length & ~size_t{0x1F}; _Needle_length_large != 0) { + const void* _Stop2 = _First2; + _Advance_bytes(_Stop2, _Needle_length_large); + return _Shuffle_impl_dispatch_magnitude<_Ty, true>( + _First1, _Haystack_length, _First2, _Stop2, _Last_needle_length_el); + } else { + return _Shuffle_impl_dispatch_magnitude<_Ty, false>( + _First1, _Haystack_length, _First2, _First2, _Last_needle_length_el); + } + } #endif // !_M_ARM64EC - template - const void* __stdcall _Impl_4_8(const void* const _First1, const void* const _Last1, const void* const _First2, + template + const void* _Dispatch_ptr(const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - using _Ty = _Traits::_Ty; #ifndef _M_ARM64EC - if (_Use_avx2()) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + if constexpr (sizeof(_Ty) <= 2) { + if (_Use_sse42()) { + return _Impl_pcmpestri<_Ty>( + _First1, _Byte_length(_First1, _Last1), _First2, _Byte_length(_First2, _Last2)); + } + } else { + if (_Use_avx2()) { + return _Impl_4_8<_Ty>( + _First1, _Byte_length(_First1, _Last1), _First2, _Byte_length(_First2, _Last2)); + } + } +#endif // !_M_ARM64EC - const size_t _Needle_length = _Byte_length(_First2, _Last2); - const size_t _Last_needle_length = _Needle_length & 0x1F; - const size_t _Last_needle_length_el = _Last_needle_length / sizeof(_Ty); + return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2); + } - if (const size_t _Needle_length_large = _Needle_length & ~size_t{0x1F}; _Needle_length_large != 0) { - const void* _Stop2 = _First2; - _Advance_bytes(_Stop2, _Needle_length_large); - return _Shuffle_impl_dispatch_magnitude<_Traits, true>( - _First1, _Last1, _First2, _Stop2, _Last_needle_length_el); - } else { - return _Shuffle_impl_dispatch_magnitude<_Traits, false>( - _First1, _Last1, _First2, _First2, _Last_needle_length_el); + template + size_t _Pos_from_ptr(const void* const _Result, const void* const _First1, const void* const _Last1) noexcept { + if (_Result != _Last1) { + return _Byte_length(_First1, _Result) / sizeof(_Ty); + } else { + return static_cast(-1); + } + } + +#ifndef _M_ARM64EC + template + size_t _Dispatch_pos_sse_1_2( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + using namespace __std_find_meow_of_bitmap; + + const _Strategy _Strat = _Pick_strategy<_Ty>(_Count1, _Count2, _Use_avx2()); + + if (_Strat == _Strategy::_Vector_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else if (_Strat == _Strategy::_Scalar_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + alignas(32) _Scalar_table_t _Table = {}; + _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); + return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + } + } + + const void* const _Last1 = static_cast(_First1) + _Count1; + const size_t _Size_bytes_1 = _Count1 * sizeof(_Ty); + const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); + + return _Pos_from_ptr<_Ty>( + _Impl_pcmpestri<_Ty>(_First1, _Size_bytes_1, _First2, _Size_bytes_2), _First1, _Last1); + } + + template + size_t _Dispatch_pos_avx_4_8( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + using namespace __std_find_meow_of_bitmap; + + const auto _Strat = _Pick_strategy<_Ty>(_Count1, _Count2, true); + + if (_Strat == _Strategy::_Vector_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Impl_first_avx<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else if (_Strat == _Strategy::_Scalar_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + alignas(32) _Scalar_table_t _Table = {}; + _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); + return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); } } + + const void* const _Last1 = static_cast(_First1) + _Count1; + const size_t _Size_bytes_1 = _Count1 * sizeof(_Ty); + const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); + + return _Pos_from_ptr<_Ty>(_Impl_4_8<_Ty>(_First1, _Size_bytes_1, _First2, _Size_bytes_2), _First1, _Last1); + } #endif // !_M_ARM64EC - return _Fallback<_Ty>(_First1, _Last1, _First2, _Last2); + + template + size_t _Dispatch_pos_fallback( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + using namespace __std_find_meow_of_bitmap; + + _Scalar_table_t _Table = {}; + if (_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { + return _Impl_first_scalar<_Ty>(_First1, _Count1, _Table); + } + + const void* const _Last1 = static_cast(_First1) + _Count1; + const void* const _Last2 = static_cast(_First2) + _Count2; + + return _Pos_from_ptr<_Ty>(_Fallback<_Ty>(_First1, _Last1, _First2, _Last2), _First1, _Last1); + } + + template + size_t _Dispatch_pos( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { +#ifndef _M_ARM64EC + if constexpr (sizeof(_Ty) <= 2) { + if (_Use_sse42()) { + return _Dispatch_pos_sse_1_2<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else { + if (_Use_avx2()) { + return _Dispatch_pos_avx_4_8<_Ty>(_First1, _Count1, _First2, _Count2); + } + } +#endif // !_M_ARM64EC + return _Dispatch_pos_fallback<_Ty>(_First1, _Count1, _First2, _Count2); } } // namespace __std_find_first_of - template - size_t __stdcall __std_find_last_of_pos_impl(const void* const _Haystack, const size_t _Haystack_length, - const void* const _Needle, const size_t _Needle_length) noexcept { + namespace __std_find_last_of { + template + size_t __stdcall _Fallback(const void* const _Haystack, const size_t _Haystack_length, + const void* const _Needle, const size_t _Needle_length) noexcept { + + const auto _Ptr_haystack = static_cast(_Haystack); + size_t _Pos = _Haystack_length; + const auto _Needle_end = static_cast(_Needle) + _Needle_length; + + while (_Pos != 0) { + --_Pos; + + for (auto _Ptr = static_cast(_Needle); _Ptr != _Needle_end; ++_Ptr) { + if (_Ptr_haystack[_Pos] == *_Ptr) { + return _Pos; + } + } + } + + return static_cast(-1); + } + #ifndef _M_ARM64EC - const size_t _Haystack_length_bytes = _Haystack_length * sizeof(_Ty); - if (_Use_sse42() && _Haystack_length_bytes >= 16) { + template + size_t _Impl(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, + const size_t _Needle_length) noexcept { + const size_t _Haystack_length_bytes = _Haystack_length * sizeof(_Ty); + constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT; constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8; @@ -3327,11 +3883,21 @@ namespace { } } - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + if (_Last_part_size != 0) { + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + __m128i _Data1; + + if (_Haystack_length_bytes >= 16) { + _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + } else { + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); + _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + } - if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { - return _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); + if (_mm_cmpestrc(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op)) { + return _mm_cmpestri(_Data2, _Needle_length_el, _Data1, _Last_part_size_el, _Op); + } } return static_cast(-1); @@ -3384,30 +3950,60 @@ namespace { } } - const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); - const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); - _Test_whole_needle(_Data1, _Last_part_size_el); + if (_Last_part_size != 0) { + const int _Last_part_size_el = static_cast(_Last_part_size / sizeof(_Ty)); + __m128i _Data1; + + if (_Haystack_length_bytes >= 16) { + _Data1 = _mm_loadu_si128(reinterpret_cast(_Haystack)); + } else { + alignas(16) uint8_t _Tmp1[16]; + memcpy(_Tmp1, _Haystack, _Haystack_length_bytes); + _Data1 = _mm_load_si128(reinterpret_cast(_Tmp1)); + } + + _Test_whole_needle(_Data1, _Last_part_size_el); + } return static_cast(_Found_pos); } } #endif // !_M_ARM64EC - const auto _Ptr_haystack = static_cast(_Haystack); - size_t _Pos = _Haystack_length; - const auto _Needle_end = static_cast(_Needle) + _Needle_length; - while (_Pos != 0) { - --_Pos; + template + size_t _Dispatch_pos( + const void* const _First1, const size_t _Count1, const void* const _First2, const size_t _Count2) noexcept { + using namespace __std_find_meow_of_bitmap; + +#ifndef _M_ARM64EC + if (_Use_sse42()) { + const auto _Strat = _Pick_strategy<_Ty>(_Count1, _Count2, _Use_avx2()); - for (auto _Ptr = static_cast(_Needle); _Ptr != _Needle_end; ++_Ptr) { - if (_Ptr_haystack[_Pos] == *_Ptr) { - return _Pos; + if (_Strat == _Strategy::_Vector_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + return _Impl_last_avx<_Ty>(_First1, _Count1, _First2, _Count2); + } + } else if (_Strat == _Strategy::_Scalar_bitmap) { + if (_Can_fit_256_bits_sse(static_cast(_First2), _Count2)) { + alignas(32) _Scalar_table_t _Table = {}; + _Build_scalar_table_no_check<_Ty>(_First2, _Count2, _Table); + return _Impl_last_scalar<_Ty>(_First1, _Count1, _Table); + } } + + return _Impl<_Ty>(_First1, _Count1, _First2, _Count2); + } else +#endif // !_M_ARM64EC + { + alignas(32) _Scalar_table_t _Table = {}; + if (_Build_scalar_table<_Ty>(_First2, _Count2, _Table)) { + return _Impl_last_scalar<_Ty>(_First1, _Count1, _Table); + } + + return _Fallback<_Ty>(_First1, _Count1, _First2, _Count2); } } - - return static_cast(-1); - } + } // namespace __std_find_last_of template __declspec(noalias) size_t __stdcall __std_mismatch_impl( @@ -3965,32 +4561,52 @@ __declspec(noalias) size_t __stdcall __std_count_trivial_8( const void* __stdcall __std_find_first_of_trivial_1( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_pcmpestri(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_2( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_pcmpestri(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_4( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_4_8<__std_find_first_of::_Traits_4>(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); } const void* __stdcall __std_find_first_of_trivial_8( const void* const _First1, const void* const _Last1, const void* const _First2, const void* const _Last2) noexcept { - return __std_find_first_of::_Impl_4_8<__std_find_first_of::_Traits_8>(_First1, _Last1, _First2, _Last2); + return __std_find_first_of::_Dispatch_ptr(_First1, _Last1, _First2, _Last2); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); +} + +__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8( + const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept { + return __std_find_first_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_last_of_pos_impl(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_last_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(const void* const _Haystack, const size_t _Haystack_length, const void* const _Needle, const size_t _Needle_length) noexcept { - return __std_find_last_of_pos_impl(_Haystack, _Haystack_length, _Needle, _Needle_length); + return __std_find_last_of::_Dispatch_pos(_Haystack, _Haystack_length, _Needle, _Needle_length); } const void* __stdcall __std_search_1(