From c2c21e10ed9d8adadfc832d51326b09978472dcf Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 27 Sep 2024 21:11:44 +0300 Subject: [PATCH 01/34] remove vectorization --- benchmarks/CMakeLists.txt | 1 + benchmarks/inc/lorem.hpp | 35 +++ benchmarks/src/remove.cpp | 39 +++ benchmarks/src/replace.cpp | 43 +-- benchmarks/src/search.cpp | 40 +-- benchmarks/src/sv_equal.cpp | 23 +- stl/inc/algorithm | 24 ++ stl/inc/xmemory | 58 ++++ stl/src/vector_algorithms.cpp | 273 ++++++++++++++++++ .../VSO_0000000_vector_algorithms/test.cpp | 66 +++++ 10 files changed, 511 insertions(+), 91 deletions(-) create mode 100644 benchmarks/inc/lorem.hpp create mode 100644 benchmarks/src/remove.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 31572a968f..3df5417aa7 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -117,6 +117,7 @@ add_benchmark(mismatch src/mismatch.cpp) add_benchmark(path_lexically_normal src/path_lexically_normal.cpp) add_benchmark(priority_queue_push_range src/priority_queue_push_range.cpp) add_benchmark(random_integer_generation src/random_integer_generation.cpp) +add_benchmark(remove src/remove.cpp) add_benchmark(replace src/replace.cpp) add_benchmark(search src/search.cpp) add_benchmark(std_copy src/std_copy.cpp) diff --git a/benchmarks/inc/lorem.hpp b/benchmarks/inc/lorem.hpp new file mode 100644 index 0000000000..e7e70aa20f --- /dev/null +++ b/benchmarks/inc/lorem.hpp @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +const char lorem_ipsum[] = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam mollis imperdiet massa, at dapibus elit interdum " + "ac. In eget sollicitudin mi. Nam at tellus at sapien tincidunt sollicitudin vel non eros. Pellentesque nunc nunc, " + "ullamcorper eu accumsan at, pulvinar non turpis. Quisque vel mauris pulvinar, pretium purus vel, ultricies erat. " + "Curabitur a magna in ligula tristique ornare. Quisque commodo, massa viverra laoreet luctus, sem nisi aliquam " + "velit, fermentum pulvinar velit leo eget justo. Suspendisse vel erat efficitur, pulvinar eros volutpat, vulputate " + "ex. Phasellus non purus vel velit tristique tristique id at ligula. Quisque mollis sodales magna. Mauris et quam " + "eu quam viverra tempus. Nullam tempus maximus porta. Nunc mattis eleifend fermentum. Nullam aliquam libero " + "accumsan velit elementum, eu laoreet metus convallis. Donec pellentesque lacus ut iaculis iaculis. Curabitur orci " + "elit, bibendum sit amet feugiat at, iaculis sit amet massa. Maecenas imperdiet lacus at vehicula iaculis. Donec " + "volutpat nunc sit amet accumsan tempor. Quisque pretium vestibulum ultricies. Suspendisse potenti. Aenean at diam " + "iaculis, condimentum felis venenatis, condimentum erat. Nam quis elit dui. Duis quis odio vitae metus hendrerit " + "rhoncus ut et magna. Cras ac augue quis nibh pharetra sagittis. Donec ullamcorper vel eros semper pretium. Proin " + "vel sollicitudin eros. Nulla sollicitudin mattis turpis id suscipit. Aliquam sed risus velit. Aliquam iaculis nec " + "nibh ac egestas. Duis finibus semper est sed consequat. Sed in sapien quis nibh dignissim mattis. Vestibulum nec " + "metus sodales, euismod mauris ac, sollicitudin libero. Maecenas non arcu ac velit ullamcorper fringilla et quis " + "nulla. Curabitur posuere leo eget ipsum tincidunt dignissim. Cras ultricies suscipit neque, quis suscipit tortor " + "venenatis non. Cras nisl mi, bibendum in vulputate quis, vestibulum ornare enim. Nunc hendrerit placerat dui, " + "aliquam mollis sem convallis et. Integer vitae urna diam. Phasellus et imperdiet est. Maecenas auctor facilisis " + "nibh non commodo. Suspendisse iaculis quam id bibendum feugiat. Pellentesque felis erat, egestas a libero ac, " + "laoreet consectetur elit. Cras ut suscipit ex. Etiam gravida sem quis ex porta, eu lacinia tortor fermentum. " + "Nulla consequat odio enim, sed condimentum est sagittis a. Quisque nec commodo tellus. Phasellus elementum " + "feugiat dolor et feugiat. Praesent sed mattis tortor. In vitae sodales purus. Morbi accumsan, ligula et interdum " + "lacinia, leo risus suscipit urna, non luctus mi justo eu ipsum. Curabitur venenatis pretium orci id porttitor. " + "Quisque dapibus nisl sit amet elit lobortis sagittis. Orci varius natoque penatibus et magnis dis parturient " + "montes, nascetur ridiculus mus. Mauris varius dui sit amet tortor facilisis vestibulum. Curabitur condimentum " + "justo nec orci mattis auctor. Quisque aliquet condimentum arcu ac sollicitudin. Maecenas elit elit, condimentum " + "vitae auctor a, cursus et sem. Cras vehicula ante in consequat fermentum. Praesent at massa nisi. Mauris pretium " + "euismod eros, ut posuere ligula ullamcorper id. Nullam aliquet malesuada est at dignissim. Pellentesque finibus " + "sagittis libero nec bibendum. Phasellus dolor ipsum, finibus quis turpis quis, mollis interdum felis."; diff --git a/benchmarks/src/remove.cpp b/benchmarks/src/remove.cpp new file mode 100644 index 0000000000..2e47fbcc20 --- /dev/null +++ b/benchmarks/src/remove.cpp @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +#include "lorem.hpp" + +enum class alg_type { std_fn, rng }; + +template +void r(benchmark::State& state) { + std::vector src(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + std::vector v; + v.reserve(std::size(lorem_ipsum)); + for (auto _ : state) { + v = src; + benchmark::DoNotOptimize(v); + if constexpr (Type == alg_type::std_fn) { + benchmark::DoNotOptimize(std::remove(v.begin(), v.end(), T{'l'})); + } else { + benchmark::DoNotOptimize(std::ranges::remove(v, T{'l'})); + } + } +} + +BENCHMARK(r); +BENCHMARK(r); +BENCHMARK(r); +BENCHMARK(r); + +BENCHMARK(r); +BENCHMARK(r); +BENCHMARK(r); +BENCHMARK(r); + +BENCHMARK_MAIN(); diff --git a/benchmarks/src/replace.cpp b/benchmarks/src/replace.cpp index 5740edaaab..ea83dc8d4e 100644 --- a/benchmarks/src/replace.cpp +++ b/benchmarks/src/replace.cpp @@ -6,41 +6,12 @@ #include #include -const char src[] = - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam mollis imperdiet massa, at dapibus elit interdum " - "ac. In eget sollicitudin mi. Nam at tellus at sapien tincidunt sollicitudin vel non eros. Pellentesque nunc nunc, " - "ullamcorper eu accumsan at, pulvinar non turpis. Quisque vel mauris pulvinar, pretium purus vel, ultricies erat. " - "Curabitur a magna in ligula tristique ornare. Quisque commodo, massa viverra laoreet luctus, sem nisi aliquam " - "velit, fermentum pulvinar velit leo eget justo. Suspendisse vel erat efficitur, pulvinar eros volutpat, vulputate " - "ex. Phasellus non purus vel velit tristique tristique id at ligula. Quisque mollis sodales magna. Mauris et quam " - "eu quam viverra tempus. Nullam tempus maximus porta. Nunc mattis eleifend fermentum. Nullam aliquam libero " - "accumsan velit elementum, eu laoreet metus convallis. Donec pellentesque lacus ut iaculis iaculis. Curabitur orci " - "elit, bibendum sit amet feugiat at, iaculis sit amet massa. Maecenas imperdiet lacus at vehicula iaculis. Donec " - "volutpat nunc sit amet accumsan tempor. Quisque pretium vestibulum ultricies. Suspendisse potenti. Aenean at diam " - "iaculis, condimentum felis venenatis, condimentum erat. Nam quis elit dui. Duis quis odio vitae metus hendrerit " - "rhoncus ut et magna. Cras ac augue quis nibh pharetra sagittis. Donec ullamcorper vel eros semper pretium. Proin " - "vel sollicitudin eros. Nulla sollicitudin mattis turpis id suscipit. Aliquam sed risus velit. Aliquam iaculis nec " - "nibh ac egestas. Duis finibus semper est sed consequat. Sed in sapien quis nibh dignissim mattis. Vestibulum nec " - "metus sodales, euismod mauris ac, sollicitudin libero. Maecenas non arcu ac velit ullamcorper fringilla et quis " - "nulla. Curabitur posuere leo eget ipsum tincidunt dignissim. Cras ultricies suscipit neque, quis suscipit tortor " - "venenatis non. Cras nisl mi, bibendum in vulputate quis, vestibulum ornare enim. Nunc hendrerit placerat dui, " - "aliquam mollis sem convallis et. Integer vitae urna diam. Phasellus et imperdiet est. Maecenas auctor facilisis " - "nibh non commodo. Suspendisse iaculis quam id bibendum feugiat. Pellentesque felis erat, egestas a libero ac, " - "laoreet consectetur elit. Cras ut suscipit ex. Etiam gravida sem quis ex porta, eu lacinia tortor fermentum. " - "Nulla consequat odio enim, sed condimentum est sagittis a. Quisque nec commodo tellus. Phasellus elementum " - "feugiat dolor et feugiat. Praesent sed mattis tortor. In vitae sodales purus. Morbi accumsan, ligula et interdum " - "lacinia, leo risus suscipit urna, non luctus mi justo eu ipsum. Curabitur venenatis pretium orci id porttitor. " - "Quisque dapibus nisl sit amet elit lobortis sagittis. Orci varius natoque penatibus et magnis dis parturient " - "montes, nascetur ridiculus mus. Mauris varius dui sit amet tortor facilisis vestibulum. Curabitur condimentum " - "justo nec orci mattis auctor. Quisque aliquet condimentum arcu ac sollicitudin. Maecenas elit elit, condimentum " - "vitae auctor a, cursus et sem. Cras vehicula ante in consequat fermentum. Praesent at massa nisi. Mauris pretium " - "euismod eros, ut posuere ligula ullamcorper id. Nullam aliquet malesuada est at dignissim. Pellentesque finibus " - "sagittis libero nec bibendum. Phasellus dolor ipsum, finibus quis turpis quis, mollis interdum felis."; +#include "lorem.hpp" template void r(benchmark::State& state) { - const std::vector a(std::begin(src), std::end(src)); - std::vector b(std::size(src)); + const std::vector a(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + std::vector b(std::size(lorem_ipsum)); for (auto _ : state) { b = a; @@ -50,8 +21,8 @@ void r(benchmark::State& state) { template void rc(benchmark::State& state) { - const std::vector a(std::begin(src), std::end(src)); - std::vector b(std::size(src)); + const std::vector a(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + std::vector b(std::size(lorem_ipsum)); for (auto _ : state) { std::replace_copy(std::begin(a), std::end(a), std::begin(b), T{'m'}, T{'w'}); @@ -60,8 +31,8 @@ void rc(benchmark::State& state) { template void rc_if(benchmark::State& state) { - const std::vector a(std::begin(src), std::end(src)); - std::vector b(std::size(src)); + const std::vector a(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + std::vector b(std::size(lorem_ipsum)); for (auto _ : state) { (void) std::replace_copy_if( diff --git a/benchmarks/src/search.cpp b/benchmarks/src/search.cpp index c6fad3d4fc..2d5265e0f1 100644 --- a/benchmarks/src/search.cpp +++ b/benchmarks/src/search.cpp @@ -10,38 +10,10 @@ #include #include #include + +#include "lorem.hpp" using namespace std::string_view_literals; -const char src_haystack[] = - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam mollis imperdiet massa, at dapibus elit interdum " - "ac. In eget sollicitudin mi. Nam at tellus at sapien tincidunt sollicitudin vel non eros. Pellentesque nunc nunc, " - "ullamcorper eu accumsan at, pulvinar non turpis. Quisque vel mauris pulvinar, pretium purus vel, ultricies erat. " - "Curabitur a magna in ligula tristique ornare. Quisque commodo, massa viverra laoreet luctus, sem nisi aliquam " - "velit, fermentum pulvinar velit leo eget justo. Suspendisse vel erat efficitur, pulvinar eros volutpat, vulputate " - "ex. Phasellus non purus vel velit tristique tristique id at ligula. Quisque mollis sodales magna. Mauris et quam " - "eu quam viverra tempus. Nullam tempus maximus porta. Nunc mattis eleifend fermentum. Nullam aliquam libero " - "accumsan velit elementum, eu laoreet metus convallis. Donec pellentesque lacus ut iaculis iaculis. Curabitur orci " - "elit, bibendum sit amet feugiat at, iaculis sit amet massa. Maecenas imperdiet lacus at vehicula iaculis. Donec " - "volutpat nunc sit amet accumsan tempor. Quisque pretium vestibulum ultricies. Suspendisse potenti. Aenean at diam " - "iaculis, condimentum felis venenatis, condimentum erat. Nam quis elit dui. Duis quis odio vitae metus hendrerit " - "rhoncus ut et magna. Cras ac augue quis nibh pharetra sagittis. Donec ullamcorper vel eros semper pretium. Proin " - "vel sollicitudin eros. Nulla sollicitudin mattis turpis id suscipit. Aliquam sed risus velit. Aliquam iaculis nec " - "nibh ac egestas. Duis finibus semper est sed consequat. Sed in sapien quis nibh dignissim mattis. Vestibulum nec " - "metus sodales, euismod mauris ac, sollicitudin libero. Maecenas non arcu ac velit ullamcorper fringilla et quis " - "nulla. Curabitur posuere leo eget ipsum tincidunt dignissim. Cras ultricies suscipit neque, quis suscipit tortor " - "venenatis non. Cras nisl mi, bibendum in vulputate quis, vestibulum ornare enim. Nunc hendrerit placerat dui, " - "aliquam mollis sem convallis et. Integer vitae urna diam. Phasellus et imperdiet est. Maecenas auctor facilisis " - "nibh non commodo. Suspendisse iaculis quam id bibendum feugiat. Pellentesque felis erat, egestas a libero ac, " - "laoreet consectetur elit. Cras ut suscipit ex. Etiam gravida sem quis ex porta, eu lacinia tortor fermentum. " - "Nulla consequat odio enim, sed condimentum est sagittis a. Quisque nec commodo tellus. Phasellus elementum " - "feugiat dolor et feugiat. Praesent sed mattis tortor. In vitae sodales purus. Morbi accumsan, ligula et interdum " - "lacinia, leo risus suscipit urna, non luctus mi justo eu ipsum. Curabitur venenatis pretium orci id porttitor. " - "Quisque dapibus nisl sit amet elit lobortis sagittis. Orci varius natoque penatibus et magnis dis parturient " - "montes, nascetur ridiculus mus. Mauris varius dui sit amet tortor facilisis vestibulum. Curabitur condimentum " - "justo nec orci mattis auctor. Quisque aliquet condimentum arcu ac sollicitudin. Maecenas elit elit, condimentum " - "vitae auctor a, cursus et sem. Cras vehicula ante in consequat fermentum. Praesent at massa nisi. Mauris pretium " - "euismod eros, ut posuere ligula ullamcorper id. Nullam aliquet malesuada est at dignissim. Pellentesque finibus " - "sagittis libero nec bibendum. Phasellus dolor ipsum, finibus quis turpis quis, mollis interdum felis."; constexpr std::array patterns = { "aliquet"sv, @@ -51,7 +23,7 @@ constexpr std::array patterns = { void c_strstr(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::string haystack(std::begin(src_haystack), std::end(src_haystack)); + const std::string haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); const std::string needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { @@ -66,7 +38,7 @@ template void classic_search(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::vector haystack(std::begin(src_haystack), std::end(src_haystack)); + const std::vector haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); const std::vector needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { @@ -81,7 +53,7 @@ template void ranges_search(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::vector haystack(std::begin(src_haystack), std::end(src_haystack)); + const std::vector haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); const std::vector needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { @@ -96,7 +68,7 @@ template void search_default_searcher(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::vector haystack(std::begin(src_haystack), std::end(src_haystack)); + const std::vector haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); const std::vector needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { diff --git a/benchmarks/src/sv_equal.cpp b/benchmarks/src/sv_equal.cpp index ee47ab63c8..b0f95da6f2 100644 --- a/benchmarks/src/sv_equal.cpp +++ b/benchmarks/src/sv_equal.cpp @@ -8,28 +8,9 @@ #include #include -using namespace std::string_view_literals; +#include "lorem.hpp" -constexpr auto haystack = - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam mollis imperdiet massa, at dapibus elit " - "interdumac. In eget sollicitudin mi. Nam at tellus at sapien tincidunt sollicitudin vel non eros. Pellentesque " - "nunc nunc,ullamcorper eu accumsan at, pulvinar non turpis. Quisque vel mauris pulvinar, pretium purus vel, " - "ultricies erat.Curabitur a magna in ligula tristique ornare. Quisque commodo, massa viverra laoreet luctus, sem " - "nisi aliquamvelit, fermentum pulvinar velit leo eget justo. Suspendisse vel erat efficitur, pulvinar eros " - "volutpat, vulputateex. Phasellus non purus vel velit tristique tristique id at ligula. Quisque mollis sodales " - "magna. Mauris et quameu quam viverra tempus. Nullam tempus maximus porta. Nunc mattis eleifend fermentum. Nullam " - "aliquam liberoaccumsan velit elementum, eu laoreet metus convallis. Donec pellentesque lacus ut iaculis iaculis. " - "Curabitur orcielit, bibendum sit amet feugiat at, iaculis sit amet massa. Maecenas imperdiet lacus at vehicula " - "iaculis. Donecvolutpat nunc sit amet accumsan tempor. Quisque pretium vestibulum ultricies. Suspendisse potenti. " - "Aenean at diamiaculis, condimentum felis venenatis, condimentum erat. Nam quis elit dui. Duis quis odio vitae " - "metus hendreritrhoncus ut et magna. Cras ac augue quis nibh pharetra sagittis. Donec ullamcorper vel eros semper " - "pretium. Proinvel sollicitudin eros. Nulla sollicitudin mattis turpis id suscipit. Aliquam sed risus velit. " - "Aliquam iaculis necnibh ac egestas. Duis finibus semper est sed consequat. Sed in sapien quis nibh dignissim " - "mattis. Vestibulum necmetus sodales, euismod mauris ac, sollicitudin libero. Maecenas non arcu ac velit " - "ullamcorper fringilla et quisnulla. Curabitur posuere leo eget ipsum tincidunt dignissim. Cras ultricies suscipit " - "neque, quis suscipit tortorvenenatis non. Cras nisl mi, bibendum in vulputate quis, vestibulum ornare enim. Nunc " - "hendrerit placerat dui,aliquam mollis sem convallis et. Integer vitae urna diam. Phasellus et imperdiet est. " - "Maecenas auctor facilisisnibh non commodo. Suspendisse iaculis quam "sv; +constexpr std::string_view haystack(lorem_ipsum, lorem_ipsum + 2048); constexpr std::size_t Count = 8u; diff --git a/stl/inc/algorithm b/stl/inc/algorithm index c16afa7d04..d7899f7763 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -4515,6 +4515,30 @@ namespace ranges { _STL_INTERNAL_STATIC_ASSERT(sentinel_for<_Se, _It>); _STL_INTERNAL_STATIC_ASSERT(indirect_binary_predicate, const _Ty*>); +#if _USE_STD_VECTOR_ALGORITHMS +#if _HAS_CXX20 + if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty> && sized_sentinel_for<_Se, _It> + && is_same_v<_Pj, identity>) { + if (!_STD is_constant_evaluated()) +#endif // _HAS_CXX20 + { + const auto _Size = _Last - _First; + if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) { + return {_First + _Size, _First + _Size}; + } + const auto _First_ptr = _To_address(_First); + const auto _Last_ptr = _First_ptr + static_cast(_Size); + const auto _Result = _STD _Remove_vectorized(_First_ptr, _Last_ptr, _Val); + + if constexpr (is_pointer_v<_It>) { + return {static_cast<_It>(_Result), static_cast<_It>(_Last_ptr)}; + } else { + return {_First + (_Result - _First_ptr), _First + _Size}; + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + _First = _RANGES _Find_unchecked(_STD move(_First), _Last, _Val, _Proj); auto _Next = _First; if (_First == _Last) { diff --git a/stl/inc/xmemory b/stl/inc/xmemory index 3aa11f488c..0cb3be686d 100644 --- a/stl/inc/xmemory +++ b/stl/inc/xmemory @@ -25,6 +25,39 @@ _STL_DISABLE_CLANG_WARNINGS #pragma push_macro("new") #undef new +#if _USE_STD_VECTOR_ALGORITHMS +extern "C" { +void* __stdcall __std_remove_1(void* _First, void* _Last, uint8_t _Val) noexcept; +void* __stdcall __std_remove_2(void* _First, void* _Last, uint16_t _Val) noexcept; +void* __stdcall __std_remove_4(void* _First, void* _Last, uint32_t _Val) noexcept; +void* __stdcall __std_remove_8(void* _First, void* _Last, uint64_t _Val) noexcept; +} // extern "C" + +_STD_BEGIN +template +_Ty* _Remove_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val) { + if constexpr (is_pointer_v<_Ty>) { +#ifdef _WIN64 + return reinterpret_cast<_Ty*>(::__std_remove_8(_First, _Last, reinterpret_cast(_Val))); +#else + return reinterpret_cast<_Ty*>(::__std_remove_4(_First, _Last, reinterpret_cast(_Val))); +#endif + } else if constexpr (sizeof(_Ty) == 1) { + return reinterpret_cast<_Ty*>(::__std_remove_1(_First, _Last, static_cast(_Val))); + } else if constexpr (sizeof(_Ty) == 2) { + return reinterpret_cast<_Ty*>(::__std_remove_2(_First, _Last, static_cast(_Val))); + } else if constexpr (sizeof(_Ty) == 4) { + return reinterpret_cast<_Ty*>(::__std_remove_4(_First, _Last, static_cast(_Val))); + } else if constexpr (sizeof(_Ty) == 8) { + return reinterpret_cast<_Ty*>(::__std_remove_8(_First, _Last, static_cast(_Val))); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size + } +} +_STD_END + +#endif // _USE_STD_VECTOR_ALGORITHMS + _STD_BEGIN template _NODISCARD constexpr auto _Unfancy(_Ptrty _Ptr) noexcept { // converts from a fancy pointer to a plain pointer @@ -2194,6 +2227,31 @@ _NODISCARD_REMOVE_ALG _CONSTEXPR20 _FwdIt remove(_FwdIt _First, const _FwdIt _La _UFirst = _STD _Find_unchecked(_UFirst, _ULast, _Val); auto _UNext = _UFirst; if (_UFirst != _ULast) { +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Vector_alg_in_find_is_safe) { +#if _HAS_CXX20 + if (!_STD is_constant_evaluated()) +#endif // _HAS_CXX20 + { + if (!_STD _Could_compare_equal_to_value_type(_Val)) { + return _Last; + } + + const auto _First_ptr = _To_address(_UFirst); + const auto _Result = _STD _Remove_vectorized(_First_ptr, _To_address(_ULast), _Val); + + if constexpr (is_pointer_v) { + _UNext = static_cast(_Result); + } else { + _UNext += _Result - _First_ptr; + } + + _STD _Seek_wrapped(_First, _UNext); + return _First; + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + while (++_UFirst != _ULast) { if (!(*_UFirst == _Val)) { *_UNext = _STD move(*_UFirst); diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 657feab547..a393c14fd7 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3626,6 +3626,279 @@ __declspec(noalias) void __stdcall __std_replace_8( } // extern "C" +namespace { + template + void* _Remove_fallback(void* const _First, void* const _Last, void* const _Out, const _Ty _Val) noexcept { + _Ty* _Src = reinterpret_cast<_Ty*>(_First); + _Ty* _Dest = reinterpret_cast<_Ty*>(_Out); + + while (_Src != _Last) { + if (*_Src != _Val) { + *_Dest = *_Src; + ++_Dest; + } + + ++_Src; + } + + return _Dest; + } + + struct _Remove_patterns_1_2_t { + using _Byte_shuffle = uint8_t[16]; + + _Byte_shuffle _Data[256]; + uint8_t _Count[256]; + }; + + constexpr _Remove_patterns_1_2_t _Make_remove_patterns_1() { + _Remove_patterns_1_2_t _Result; + + for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { + unsigned _Nx = 0; + + // Compact the source according to bitmap + for (unsigned _Hx = 0; _Hx != 8; ++_Hx) { + if ((_Vx & (1 << _Hx)) == 0) { + _Result._Data[_Vx][_Nx] = static_cast(_Hx); + ++_Nx; + } + } + + _Result._Count[_Vx] = static_cast(_Nx); + + // Fill the remaining as if not touched + for (; _Nx != 8; ++_Nx) { + _Result._Data[_Vx][_Nx] = static_cast(_Nx); + } + + // Unused high part, fill with 0xFF for cernainity + for (; _Nx != 16; ++_Nx) { + _Result._Data[_Vx][_Nx] = 0xFF; + } + } + + return _Result; + } + + constexpr auto _Remove_patterns_1 = _Make_remove_patterns_1(); + + constexpr _Remove_patterns_1_2_t _Make_remove_patterns_2() { + _Remove_patterns_1_2_t _Result; + + for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { + unsigned _Nx = 0; + + // Compact the source according to bitmap + for (unsigned _Hx = 0; _Hx != 8; ++_Hx) { + if ((_Vx & (1 << _Hx)) == 0) { + _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Hx * 2 + 0); + _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Hx * 2 + 1); + ++_Nx; + } + } + + _Result._Count[_Vx] = static_cast(_Nx * 2); + + // Fill the remaining as if not touched + for (; _Nx != 8; ++_Nx) { + _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Nx * 2 + 0); + _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Nx * 2 + 1); + } + } + + return _Result; + } + + constexpr auto _Remove_patterns_2 = _Make_remove_patterns_2(); + + struct _Remove_patterns_4_t { + using _Int_shuffle = uint32_t[8]; + + _Int_shuffle _Data[256]; + uint8_t _Count[256]; + }; + + constexpr _Remove_patterns_4_t _Make_remove_patterns_4() { + _Remove_patterns_4_t _Result; + + for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { + unsigned _Nx = 0; + + // Compact the source according to bitmap + for (unsigned _Hx = 0; _Hx != 8; ++_Hx) { + if ((_Vx & (1 << _Hx)) == 0) { + _Result._Data[_Vx][_Nx] = _Hx; + ++_Nx; + } + } + + _Result._Count[_Vx] = static_cast(_Nx * 4); + + // Fill the remaining as if not touched + for (; _Nx != 8; ++_Nx) { + _Result._Data[_Vx][_Nx] = _Nx; + } + } + + return _Result; + } + + constexpr auto _Remove_patterns_4 = _Make_remove_patterns_4(); + + struct _Remove_patterns_8_t { + using _Int_shuffle = uint32_t[8]; + + _Int_shuffle _Data[16]; + uint8_t _Count[16]; + }; + + constexpr _Remove_patterns_8_t _Make_remove_patterns_8() { + _Remove_patterns_8_t _Result; + + for (unsigned _Vx = 0; _Vx != 16; ++_Vx) { + unsigned _Nx = 0; + + // Compact the source according to bitmap + for (unsigned _Hx = 0; _Hx != 4; ++_Hx) { + if ((_Vx & (1 << _Hx)) == 0) { + _Result._Data[_Vx][_Nx * 2 + 0] = _Hx * 2 + 0; + _Result._Data[_Vx][_Nx * 2 + 1] = _Hx * 2 + 1; + ++_Nx; + } + } + + _Result._Count[_Vx] = static_cast(_Nx * 8); + + // Fill the remaining as if not touched + for (; _Nx != 4; ++_Nx) { + _Result._Data[_Vx][_Nx * 2 + 0] = _Nx * 2 + 0; + _Result._Data[_Vx][_Nx * 2 + 1] = _Nx * 2 + 1; + } + } + + return _Result; + } + + constexpr auto _Remove_patterns_8 = _Make_remove_patterns_8(); + + +} // unnamed namespace + +extern "C" { + +void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _Val) noexcept { + _First = const_cast(__std_find_trivial_1(_First, _Last, _Val)); + void* _Out = _First; + + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes > 8) { + const __m128i _Match = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); + + void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{7}); + do { + const __m128i _Src = _mm_loadu_si64(_First); + const unsigned _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_patterns_1._Data[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si64(_Out, _Dest); + _Advance_bytes(_Out, _Remove_patterns_1._Count[_Bingo]); + _Advance_bytes(_First, 8); + } while (_First != _Stop); + } + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _Val) noexcept { + _First = const_cast(__std_find_trivial_2(_First, _Last, _Val)); + void* _Out = _First; + + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes > 16) { + const __m128i _Match = _mm_set1_epi16(_Val); + const __m128i _Dense_shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0); + + void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + do { + const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); + const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); + const unsigned _Bingo = _mm_movemask_epi8(_mm_shuffle_epi8(_Mask, _Dense_shuf)); + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_patterns_2._Data[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_patterns_2._Count[_Bingo]); + _Advance_bytes(_First, 16); + } while (_First != _Stop); + } + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _Val) noexcept { + _First = const_cast(__std_find_trivial_4(_First, _Last, _Val)); + void* _Out = _First; + + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) { + const __m256i _Match = _mm256_set1_epi32(_Val); + const __m256i _Dense_shuf = _mm256_set_epi8( // + 12, 8, 4, 0, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, 12, 8, 4, 0); + + void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + do { + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); + const unsigned _Bingo_swapped = _rotl(_mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)), 4); + const unsigned _Bingo = _rotl8(static_cast(_Bingo_swapped), 4); + const __m256i _Shuf = + _mm256_loadu_si256(reinterpret_cast(_Remove_patterns_4._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]); + _Advance_bytes(_First, 32); + } while (_First != _Stop); + } + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _Val) noexcept { + _First = const_cast(__std_find_trivial_8(_First, _Last, _Val)); + void* _Out = _First; + + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) { + const __m256i _Match = _mm256_set1_epi64x(_Val); + const __m256i _Dense_shuf = _mm256_set_epi8( // + 8, 0, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, -1, -1, // + -1, -1, -1, -1, -1, -1, 8, 0); + + void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + do { + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); + const unsigned _Bingo_swapped = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); + const unsigned _Bingo = (_Bingo_swapped | (_Bingo_swapped >> 28)) & 0xF; + const __m256i _Shuf = + _mm256_loadu_si256(reinterpret_cast(_Remove_patterns_8._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]); + _Advance_bytes(_First, 32); + } while (_First != _Stop); + } + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +} // extern "C" + #ifndef _M_ARM64EC namespace { __m256i __forceinline _Bitset_to_string_1_step_avx(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) { diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 28f5c92d37..5b45fcb92e 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -775,6 +775,62 @@ void test_swap_ranges(mt19937_64& gen) { } } +template +FwdIt last_known_good_remove(FwdIt first, FwdIt last, T val) { + FwdIt dest = first; + + while (first != last) { + if (*first != val) { + *dest = *first; + ++dest; + } + + ++first; + } + + return dest; +} + +template +void test_case_remove(vector& in_out_expected, vector& in_out_actual, vector& in_out_actual_r, const T val) { + auto rem_expected = last_known_good_remove(in_out_expected.begin(), in_out_expected.end(), val); + auto rem_actual = remove(in_out_actual.begin(), in_out_actual.end(), val); + assert(equal(in_out_expected.begin(), rem_expected, in_out_actual.begin(), rem_actual)); + +#if _HAS_CXX20 + auto rem_actual_r = ranges::remove(in_out_actual_r, val); + assert(equal(in_out_expected.begin(), rem_expected, begin(in_out_actual_r), begin(rem_actual_r))); +#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv + (void) in_out_actual_r; +#endif // ^^^ !_HAS_CXX20 ^^^ +} + +template +void test_remove(mt19937_64& gen) { + using TD = conditional_t; + binomial_distribution dis(10); + + vector source; + vector in_out_expected; + vector in_out_actual; + vector in_out_actual_r; + + for (auto v : {&source, &in_out_expected, &in_out_actual, &in_out_actual_r}) { + v->reserve(dataCount); + } + + test_case_remove(in_out_actual, in_out_expected, in_out_actual_r, static_cast(dis(gen))); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + source.push_back(static_cast(dis(gen))); + + for (auto v : {&in_out_expected, &in_out_actual, &in_out_actual_r}) { + *v = source; + } + + test_case_remove(in_out_expected, in_out_actual, in_out_actual_r, static_cast(dis(gen))); + } +} + void test_vector_algorithms(mt19937_64& gen) { test_count(gen); test_count(gen); @@ -921,6 +977,16 @@ void test_vector_algorithms(mt19937_64& gen) { test_reverse_copy(gen); test_reverse_copy(gen); + test_remove(gen); + test_remove(gen); + test_remove(gen); + test_remove(gen); + test_remove(gen); + test_remove(gen); + test_remove(gen); + test_remove(gen); + test_remove(gen); + test_swap_ranges(gen); test_swap_ranges(gen); test_swap_ranges(gen); From d0f2e68e04e9fe06ad45ce9d034d594e9b13128e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 18:07:02 +0300 Subject: [PATCH 02/34] ADL --- stl/inc/algorithm | 2 +- stl/inc/xmemory | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index d7899f7763..2e7aea79cc 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -4526,7 +4526,7 @@ namespace ranges { if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) { return {_First + _Size, _First + _Size}; } - const auto _First_ptr = _To_address(_First); + const auto _First_ptr = _STD _To_address(_First); const auto _Last_ptr = _First_ptr + static_cast(_Size); const auto _Result = _STD _Remove_vectorized(_First_ptr, _Last_ptr, _Val); diff --git a/stl/inc/xmemory b/stl/inc/xmemory index 0cb3be686d..81316bcf7e 100644 --- a/stl/inc/xmemory +++ b/stl/inc/xmemory @@ -2237,8 +2237,8 @@ _NODISCARD_REMOVE_ALG _CONSTEXPR20 _FwdIt remove(_FwdIt _First, const _FwdIt _La return _Last; } - const auto _First_ptr = _To_address(_UFirst); - const auto _Result = _STD _Remove_vectorized(_First_ptr, _To_address(_ULast), _Val); + const auto _First_ptr = _STD _To_address(_UFirst); + const auto _Result = _STD _Remove_vectorized(_First_ptr, _STD _To_address(_ULast), _Val); if constexpr (is_pointer_v) { _UNext = static_cast(_Result); From d6b27afee431fa7fcb84e9fd571d4090409499ed Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 20:57:58 +0300 Subject: [PATCH 03/34] compress 1-byte data --- stl/src/vector_algorithms.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index a393c14fd7..b50d8e2835 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3644,15 +3644,15 @@ namespace { return _Dest; } - struct _Remove_patterns_1_2_t { - using _Byte_shuffle = uint8_t[16]; + struct _Remove_patterns_1_t { + using _Byte_shuffle = uint8_t[8]; _Byte_shuffle _Data[256]; uint8_t _Count[256]; }; - constexpr _Remove_patterns_1_2_t _Make_remove_patterns_1() { - _Remove_patterns_1_2_t _Result; + constexpr _Remove_patterns_1_t _Make_remove_patterns_1() { + _Remove_patterns_1_t _Result; for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { unsigned _Nx = 0; @@ -3671,11 +3671,6 @@ namespace { for (; _Nx != 8; ++_Nx) { _Result._Data[_Vx][_Nx] = static_cast(_Nx); } - - // Unused high part, fill with 0xFF for cernainity - for (; _Nx != 16; ++_Nx) { - _Result._Data[_Vx][_Nx] = 0xFF; - } } return _Result; @@ -3683,8 +3678,15 @@ namespace { constexpr auto _Remove_patterns_1 = _Make_remove_patterns_1(); - constexpr _Remove_patterns_1_2_t _Make_remove_patterns_2() { - _Remove_patterns_1_2_t _Result; + struct _Remove_patterns_2_t { + using _Byte_shuffle = uint8_t[16]; + + _Byte_shuffle _Data[256]; + uint8_t _Count[256]; + }; + + constexpr _Remove_patterns_2_t _Make_remove_patterns_2() { + _Remove_patterns_2_t _Result; for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { unsigned _Nx = 0; @@ -3799,7 +3801,7 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V do { const __m128i _Src = _mm_loadu_si64(_First); const unsigned _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_patterns_1._Data[_Bingo])); + const __m128i _Shuf = _mm_loadu_si64(_Remove_patterns_1._Data[_Bingo]); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si64(_Out, _Dest); _Advance_bytes(_Out, _Remove_patterns_1._Count[_Bingo]); From 44a9278576779238e61e2698a3abf57aa1d17c4a Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 21:05:55 +0300 Subject: [PATCH 04/34] compact also 4 and 8 tables --- stl/src/vector_algorithms.cpp | 38 +++++++++++++---------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index b50d8e2835..90123b45b9 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3645,9 +3645,7 @@ namespace { } struct _Remove_patterns_1_t { - using _Byte_shuffle = uint8_t[8]; - - _Byte_shuffle _Data[256]; + uint8_t _Data[256][8]; uint8_t _Count[256]; }; @@ -3679,9 +3677,7 @@ namespace { constexpr auto _Remove_patterns_1 = _Make_remove_patterns_1(); struct _Remove_patterns_2_t { - using _Byte_shuffle = uint8_t[16]; - - _Byte_shuffle _Data[256]; + uint8_t _Data[256][16]; uint8_t _Count[256]; }; @@ -3715,9 +3711,7 @@ namespace { constexpr auto _Remove_patterns_2 = _Make_remove_patterns_2(); struct _Remove_patterns_4_t { - using _Int_shuffle = uint32_t[8]; - - _Int_shuffle _Data[256]; + uint8_t _Data[256][8]; uint8_t _Count[256]; }; @@ -3730,7 +3724,7 @@ namespace { // Compact the source according to bitmap for (unsigned _Hx = 0; _Hx != 8; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { - _Result._Data[_Vx][_Nx] = _Hx; + _Result._Data[_Vx][_Nx] = static_cast(_Hx); ++_Nx; } } @@ -3739,7 +3733,7 @@ namespace { // Fill the remaining as if not touched for (; _Nx != 8; ++_Nx) { - _Result._Data[_Vx][_Nx] = _Nx; + _Result._Data[_Vx][_Nx] = static_cast(_Nx); } } @@ -3749,9 +3743,7 @@ namespace { constexpr auto _Remove_patterns_4 = _Make_remove_patterns_4(); struct _Remove_patterns_8_t { - using _Int_shuffle = uint32_t[8]; - - _Int_shuffle _Data[16]; + uint8_t _Data[16][8]; uint8_t _Count[16]; }; @@ -3764,8 +3756,8 @@ namespace { // Compact the source according to bitmap for (unsigned _Hx = 0; _Hx != 4; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { - _Result._Data[_Vx][_Nx * 2 + 0] = _Hx * 2 + 0; - _Result._Data[_Vx][_Nx * 2 + 1] = _Hx * 2 + 1; + _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Hx * 2 + 0); + _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Hx * 2 + 1); ++_Nx; } } @@ -3774,8 +3766,8 @@ namespace { // Fill the remaining as if not touched for (; _Nx != 4; ++_Nx) { - _Result._Data[_Vx][_Nx * 2 + 0] = _Nx * 2 + 0; - _Result._Data[_Vx][_Nx * 2 + 1] = _Nx * 2 + 1; + _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Nx * 2 + 0); + _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Nx * 2 + 1); } } @@ -3856,9 +3848,8 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); const unsigned _Bingo_swapped = _rotl(_mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)), 4); const unsigned _Bingo = _rotl8(static_cast(_Bingo_swapped), 4); - const __m256i _Shuf = - _mm256_loadu_si256(reinterpret_cast(_Remove_patterns_4._Data[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]); _Advance_bytes(_First, 32); @@ -3887,9 +3878,8 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); const unsigned _Bingo_swapped = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); const unsigned _Bingo = (_Bingo_swapped | (_Bingo_swapped >> 28)) & 0xF; - const __m256i _Shuf = - _mm256_loadu_si256(reinterpret_cast(_Remove_patterns_8._Data[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]); _Advance_bytes(_First, 32); From 18c0b7cdb0c53c5fdc25b6bc5c6b0d97f28bc262 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 21:29:16 +0300 Subject: [PATCH 05/34] reduce copypasta --- stl/src/vector_algorithms.cpp | 105 +++++++++------------------------- 1 file changed, 28 insertions(+), 77 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 90123b45b9..d89d257b17 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3644,29 +3644,31 @@ namespace { return _Dest; } - struct _Remove_patterns_1_t { - uint8_t _Data[256][8]; - uint8_t _Count[256]; + template + struct _Remove_patterns_t { + uint8_t _Data[_Size_v][_Size_h]; + uint8_t _Count[_Size_v]; }; - constexpr _Remove_patterns_1_t _Make_remove_patterns_1() { - _Remove_patterns_1_t _Result; + template + constexpr auto _Make_remove_patterns_1_4(const uint8_t _Mul) { + _Remove_patterns_t<_Size_v, _Size_h> _Result; - for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { + for (unsigned _Vx = 0; _Vx != _Size_v; ++_Vx) { unsigned _Nx = 0; // Compact the source according to bitmap - for (unsigned _Hx = 0; _Hx != 8; ++_Hx) { + for (unsigned _Hx = 0; _Hx != _Size_h; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { _Result._Data[_Vx][_Nx] = static_cast(_Hx); ++_Nx; } } - _Result._Count[_Vx] = static_cast(_Nx); + _Result._Count[_Vx] = static_cast(_Nx * _Mul); // Fill the remaining as if not touched - for (; _Nx != 8; ++_Nx) { + for (; _Nx != _Size_h; ++_Nx) { _Result._Data[_Vx][_Nx] = static_cast(_Nx); } } @@ -3674,21 +3676,15 @@ namespace { return _Result; } - constexpr auto _Remove_patterns_1 = _Make_remove_patterns_1(); - - struct _Remove_patterns_2_t { - uint8_t _Data[256][16]; - uint8_t _Count[256]; - }; - - constexpr _Remove_patterns_2_t _Make_remove_patterns_2() { - _Remove_patterns_2_t _Result; + template + constexpr auto _Make_remove_patterns_2_8(const uint8_t _Mul) { + _Remove_patterns_t<_Size_v, _Size_h> _Result; - for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { + for (unsigned _Vx = 0; _Vx != _Size_v; ++_Vx) { unsigned _Nx = 0; // Compact the source according to bitmap - for (unsigned _Hx = 0; _Hx != 8; ++_Hx) { + for (unsigned _Hx = 0; _Hx != _Size_h / 2; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Hx * 2 + 0); _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Hx * 2 + 1); @@ -3696,10 +3692,10 @@ namespace { } } - _Result._Count[_Vx] = static_cast(_Nx * 2); + _Result._Count[_Vx] = static_cast(_Nx * _Mul); // Fill the remaining as if not touched - for (; _Nx != 8; ++_Nx) { + for (; _Nx != _Size_h / 2; ++_Nx) { _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Nx * 2 + 0); _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Nx * 2 + 1); } @@ -3708,73 +3704,28 @@ namespace { return _Result; } - constexpr auto _Remove_patterns_2 = _Make_remove_patterns_2(); + constexpr auto _Remove_patterns_1 = _Make_remove_patterns_1_4<256, 8>(1); - struct _Remove_patterns_4_t { - uint8_t _Data[256][8]; + struct _Remove_patterns_2_t { + uint8_t _Data[256][16]; uint8_t _Count[256]; }; - constexpr _Remove_patterns_4_t _Make_remove_patterns_4() { - _Remove_patterns_4_t _Result; - - for (unsigned _Vx = 0; _Vx != 256; ++_Vx) { - unsigned _Nx = 0; - - // Compact the source according to bitmap - for (unsigned _Hx = 0; _Hx != 8; ++_Hx) { - if ((_Vx & (1 << _Hx)) == 0) { - _Result._Data[_Vx][_Nx] = static_cast(_Hx); - ++_Nx; - } - } - - _Result._Count[_Vx] = static_cast(_Nx * 4); - - // Fill the remaining as if not touched - for (; _Nx != 8; ++_Nx) { - _Result._Data[_Vx][_Nx] = static_cast(_Nx); - } - } + constexpr auto _Remove_patterns_2 = _Make_remove_patterns_2_8<256, 16>(2); - return _Result; - } + struct _Remove_patterns_4_t { + uint8_t _Data[256][8]; + uint8_t _Count[256]; + }; - constexpr auto _Remove_patterns_4 = _Make_remove_patterns_4(); + constexpr auto _Remove_patterns_4 = _Make_remove_patterns_1_4<256, 8>(4); struct _Remove_patterns_8_t { uint8_t _Data[16][8]; uint8_t _Count[16]; }; - constexpr _Remove_patterns_8_t _Make_remove_patterns_8() { - _Remove_patterns_8_t _Result; - - for (unsigned _Vx = 0; _Vx != 16; ++_Vx) { - unsigned _Nx = 0; - - // Compact the source according to bitmap - for (unsigned _Hx = 0; _Hx != 4; ++_Hx) { - if ((_Vx & (1 << _Hx)) == 0) { - _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Hx * 2 + 0); - _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Hx * 2 + 1); - ++_Nx; - } - } - - _Result._Count[_Vx] = static_cast(_Nx * 8); - - // Fill the remaining as if not touched - for (; _Nx != 4; ++_Nx) { - _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Nx * 2 + 0); - _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Nx * 2 + 1); - } - } - - return _Result; - } - - constexpr auto _Remove_patterns_8 = _Make_remove_patterns_8(); + constexpr auto _Remove_patterns_8 = _Make_remove_patterns_2_8<16, 8>(8); } // unnamed namespace From 2b068f6b5ee174cf6979297e1d26459950daa347 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 21:31:43 +0300 Subject: [PATCH 06/34] -leftovers --- stl/src/vector_algorithms.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index d89d257b17..e37e864e56 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3705,26 +3705,8 @@ namespace { } constexpr auto _Remove_patterns_1 = _Make_remove_patterns_1_4<256, 8>(1); - - struct _Remove_patterns_2_t { - uint8_t _Data[256][16]; - uint8_t _Count[256]; - }; - constexpr auto _Remove_patterns_2 = _Make_remove_patterns_2_8<256, 16>(2); - - struct _Remove_patterns_4_t { - uint8_t _Data[256][8]; - uint8_t _Count[256]; - }; - constexpr auto _Remove_patterns_4 = _Make_remove_patterns_1_4<256, 8>(4); - - struct _Remove_patterns_8_t { - uint8_t _Data[16][8]; - uint8_t _Count[16]; - }; - constexpr auto _Remove_patterns_8 = _Make_remove_patterns_2_8<16, 8>(8); From d99ff0cd8d5f3b7819afc644946aa0fec87ef79a Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 21:38:59 +0300 Subject: [PATCH 07/34] wrong comparison! --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e37e864e56..cdc981ad59 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3808,7 +3808,7 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); do { const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); + const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); const unsigned _Bingo_swapped = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); const unsigned _Bingo = (_Bingo_swapped | (_Bingo_swapped >> 28)) & 0xF; const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); From d0e59380d7d266d06d5d070ad80ce556d0acde41 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 22:03:04 +0300 Subject: [PATCH 08/34] reduce copypasta even more --- stl/src/vector_algorithms.cpp | 50 +++++++++-------------------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index cdc981ad59..fea7f64924 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3651,16 +3651,18 @@ namespace { }; template - constexpr auto _Make_remove_patterns_1_4(const uint8_t _Mul) { + constexpr auto _Make_remove_patterns(const unsigned _Mul, const unsigned _Ew) { _Remove_patterns_t<_Size_v, _Size_h> _Result; for (unsigned _Vx = 0; _Vx != _Size_v; ++_Vx) { unsigned _Nx = 0; // Compact the source according to bitmap - for (unsigned _Hx = 0; _Hx != _Size_h; ++_Hx) { + for (unsigned _Hx = 0; _Hx != _Size_h / _Ew; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { - _Result._Data[_Vx][_Nx] = static_cast(_Hx); + for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { + _Result._Data[_Vx][_Nx * _Ew + _Ex] = static_cast(_Hx * _Ew + _Ex); + } ++_Nx; } } @@ -3668,48 +3670,20 @@ namespace { _Result._Count[_Vx] = static_cast(_Nx * _Mul); // Fill the remaining as if not touched - for (; _Nx != _Size_h; ++_Nx) { - _Result._Data[_Vx][_Nx] = static_cast(_Nx); - } - } - - return _Result; - } - - template - constexpr auto _Make_remove_patterns_2_8(const uint8_t _Mul) { - _Remove_patterns_t<_Size_v, _Size_h> _Result; - - for (unsigned _Vx = 0; _Vx != _Size_v; ++_Vx) { - unsigned _Nx = 0; - - // Compact the source according to bitmap - for (unsigned _Hx = 0; _Hx != _Size_h / 2; ++_Hx) { - if ((_Vx & (1 << _Hx)) == 0) { - _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Hx * 2 + 0); - _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Hx * 2 + 1); - ++_Nx; + for (; _Nx != _Size_h / _Ew; ++_Nx) { + for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { + _Result._Data[_Vx][_Nx * _Ew + _Ex] = static_cast(_Nx * _Ew + _Ex); } } - - _Result._Count[_Vx] = static_cast(_Nx * _Mul); - - // Fill the remaining as if not touched - for (; _Nx != _Size_h / 2; ++_Nx) { - _Result._Data[_Vx][_Nx * 2 + 0] = static_cast(_Nx * 2 + 0); - _Result._Data[_Vx][_Nx * 2 + 1] = static_cast(_Nx * 2 + 1); - } } return _Result; } - constexpr auto _Remove_patterns_1 = _Make_remove_patterns_1_4<256, 8>(1); - constexpr auto _Remove_patterns_2 = _Make_remove_patterns_2_8<256, 16>(2); - constexpr auto _Remove_patterns_4 = _Make_remove_patterns_1_4<256, 8>(4); - constexpr auto _Remove_patterns_8 = _Make_remove_patterns_2_8<16, 8>(8); - - + constexpr auto _Remove_patterns_1 = _Make_remove_patterns<256, 8>(1, 1); + constexpr auto _Remove_patterns_2 = _Make_remove_patterns<256, 16>(2, 2); + constexpr auto _Remove_patterns_4 = _Make_remove_patterns<256, 8>(4, 1); + constexpr auto _Remove_patterns_8 = _Make_remove_patterns<16, 8>(8, 2); } // unnamed namespace extern "C" { From 79727800b356a112deceb8cfd6642612f641a857 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 Sep 2024 22:12:14 +0300 Subject: [PATCH 09/34] bingo consistency --- stl/src/vector_algorithms.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index fea7f64924..9ea6f83856 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3751,12 +3751,12 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); - const unsigned _Bingo_swapped = _rotl(_mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)), 4); - const unsigned _Bingo = _rotl8(static_cast(_Bingo_swapped), 4); - const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); + const unsigned _Bingo_d = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); + const unsigned _Bingo = _rotl8(static_cast(_rotl(_Bingo_d, 4)), 4); + const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]); _Advance_bytes(_First, 32); @@ -3781,12 +3781,12 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); - const unsigned _Bingo_swapped = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); - const unsigned _Bingo = (_Bingo_swapped | (_Bingo_swapped >> 28)) & 0xF; - const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); + const unsigned _Bingo_d = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); + const unsigned _Bingo = (_Bingo_d | (_Bingo_d >> 28)) & 0xF; + const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]); _Advance_bytes(_First, 32); From 4a7d60bb4e6ae9ef9261e22f52a6cc1ef12bb18b Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 2 Oct 2024 19:32:30 +0300 Subject: [PATCH 10/34] vzeroupper --- stl/src/vector_algorithms.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 9ea6f83856..92fea69d75 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3761,6 +3761,8 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ _Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 } return _Remove_fallback(_First, _Last, _Out, _Val); @@ -3791,6 +3793,8 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ _Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 } return _Remove_fallback(_First, _Last, _Out, _Val); From 39974d199a082d7566f5f763fbf077b7bdd82eff Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 4 Oct 2024 13:57:08 +0300 Subject: [PATCH 11/34] mask like floats --- stl/src/vector_algorithms.cpp | 36 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 92fea69d75..99ee1635da 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3741,22 +3741,16 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ void* _Out = _First; if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) { - const __m256i _Match = _mm256_set1_epi32(_Val); - const __m256i _Dense_shuf = _mm256_set_epi8( // - 12, 8, 4, 0, -1, -1, -1, -1, // - -1, -1, -1, -1, -1, -1, -1, -1, // - -1, -1, -1, -1, -1, -1, -1, -1, // - -1, -1, -1, -1, 12, 8, 4, 0); + const __m256i _Match = _mm256_set1_epi32(_Val); void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); - const unsigned _Bingo_d = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); - const unsigned _Bingo = _rotl8(static_cast(_rotl(_Bingo_d, 4)), 4); - const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); + const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]); _Advance_bytes(_First, 32); @@ -3773,22 +3767,16 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ void* _Out = _First; if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) { - const __m256i _Match = _mm256_set1_epi64x(_Val); - const __m256i _Dense_shuf = _mm256_set_epi8( // - 8, 0, -1, -1, -1, -1, -1, -1, // - -1, -1, -1, -1, -1, -1, -1, -1, // - -1, -1, -1, -1, -1, -1, -1, -1, // - -1, -1, -1, -1, -1, -1, 8, 0); + const __m256i _Match = _mm256_set1_epi64x(_Val); void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); - const unsigned _Bingo_d = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf)); - const unsigned _Bingo = (_Bingo_d | (_Bingo_d >> 28)) & 0xF; - const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); + const unsigned _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); + const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]); _Advance_bytes(_First, 32); From 3e9109a30f4adb9da4cf3b4d66954c1af9352884 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 4 Oct 2024 14:03:57 +0300 Subject: [PATCH 12/34] also remove shuffle from here --- stl/src/vector_algorithms.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 99ee1635da..74cf38b1b9 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3716,15 +3716,14 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ void* _Out = _First; if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes > 16) { - const __m128i _Match = _mm_set1_epi16(_Val); - const __m128i _Dense_shuf = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0); + const __m128i _Match = _mm_set1_epi16(_Val); void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); do { const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); - const unsigned _Bingo = _mm_movemask_epi8(_mm_shuffle_epi8(_Mask, _Dense_shuf)); + const unsigned _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_patterns_2._Data[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); From c8ec13b5b375987e0a88e7a6456c74011578dc9f Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 5 Oct 2024 09:47:45 +0300 Subject: [PATCH 13/34] elaborate comments on the complex part of obtaining the tables --- stl/src/vector_algorithms.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 74cf38b1b9..a9dc0e5f0e 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3644,22 +3644,22 @@ namespace { return _Dest; } - template - struct _Remove_patterns_t { - uint8_t _Data[_Size_v][_Size_h]; - uint8_t _Count[_Size_v]; - }; template constexpr auto _Make_remove_patterns(const unsigned _Mul, const unsigned _Ew) { - _Remove_patterns_t<_Size_v, _Size_h> _Result; + struct { + uint8_t _Data[_Size_v][_Size_h]; + uint8_t _Count[_Size_v]; + } _Result; for (unsigned _Vx = 0; _Vx != _Size_v; ++_Vx) { unsigned _Nx = 0; - // Compact the source according to bitmap + // Make shuffle mask for pshufb / vpermd corresponding to _Vx bit value. + // Every bit set corresponds to element skipped. for (unsigned _Hx = 0; _Hx != _Size_h / _Ew; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { + // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { _Result._Data[_Vx][_Nx * _Ew + _Ex] = static_cast(_Hx * _Ew + _Ex); } @@ -3667,10 +3667,16 @@ namespace { } } + // Count of bytes for removed elements that are not removed _Result._Count[_Vx] = static_cast(_Nx * _Mul); - // Fill the remaining as if not touched + // Fill the remaining with arbitrary elements. + // It is not possible to leave them untouched, while keeping this optimization efficient. + // This should not be a problem though, as they should be either overwritten by the next step, + // or left in the removed range. Still setting them to the values of some of existing elements, + // rather than zero, to reduce the surprising behavior. for (; _Nx != _Size_h / _Ew; ++_Nx) { + // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { _Result._Data[_Vx][_Nx * _Ew + _Ex] = static_cast(_Nx * _Ew + _Ex); } From 63cb68184eae6d0862b7ed0cdaa2b9c33b421b30 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 5 Oct 2024 20:25:59 +0300 Subject: [PATCH 14/34] doesn't matter, but it is unsigned --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index a9dc0e5f0e..6ece66cca4 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3754,7 +3754,7 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); - const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]); @@ -3780,7 +3780,7 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); const unsigned _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); - const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); _Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]); From 61a97c6bad1dd54894f9ac814dc73461ddc5f562 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 5 Oct 2024 20:31:24 +0300 Subject: [PATCH 15/34] what did I say --- stl/src/vector_algorithms.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 6ece66cca4..ffca6b8a69 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3648,8 +3648,8 @@ namespace { template constexpr auto _Make_remove_patterns(const unsigned _Mul, const unsigned _Ew) { struct { - uint8_t _Data[_Size_v][_Size_h]; - uint8_t _Count[_Size_v]; + uint8_t _Shuf[_Size_v][_Size_h]; + uint8_t _Size[_Size_v]; } _Result; for (unsigned _Vx = 0; _Vx != _Size_v; ++_Vx) { @@ -3661,14 +3661,14 @@ namespace { if ((_Vx & (1 << _Hx)) == 0) { // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { - _Result._Data[_Vx][_Nx * _Ew + _Ex] = static_cast(_Hx * _Ew + _Ex); + _Result._Shuf[_Vx][_Nx * _Ew + _Ex] = static_cast(_Hx * _Ew + _Ex); } ++_Nx; } } - // Count of bytes for removed elements that are not removed - _Result._Count[_Vx] = static_cast(_Nx * _Mul); + // Size of elements that are not removed in bytes + _Result._Size[_Vx] = static_cast(_Nx * _Mul); // Fill the remaining with arbitrary elements. // It is not possible to leave them untouched, while keeping this optimization efficient. @@ -3678,7 +3678,7 @@ namespace { for (; _Nx != _Size_h / _Ew; ++_Nx) { // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { - _Result._Data[_Vx][_Nx * _Ew + _Ex] = static_cast(_Nx * _Ew + _Ex); + _Result._Shuf[_Vx][_Nx * _Ew + _Ex] = static_cast(_Nx * _Ew + _Ex); } } } @@ -3706,10 +3706,10 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V do { const __m128i _Src = _mm_loadu_si64(_First); const unsigned _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; - const __m128i _Shuf = _mm_loadu_si64(_Remove_patterns_1._Data[_Bingo]); + const __m128i _Shuf = _mm_loadu_si64(_Remove_patterns_1._Shuf[_Bingo]); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si64(_Out, _Dest); - _Advance_bytes(_Out, _Remove_patterns_1._Count[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_1._Size[_Bingo]); _Advance_bytes(_First, 8); } while (_First != _Stop); } @@ -3730,10 +3730,10 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); const unsigned _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_patterns_2._Data[_Bingo])); + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_patterns_2._Shuf[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_2._Count[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_2._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } @@ -3754,10 +3754,10 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_4._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_4._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); @@ -3780,10 +3780,10 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); const unsigned _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_8._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_8._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); From 167c2763c5a5a3c0f87468f7aadf0204d4d1364f Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 6 Oct 2024 18:21:40 +0300 Subject: [PATCH 16/34] no strict --- stl/src/vector_algorithms.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ffca6b8a69..ead1d54a93 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3698,7 +3698,7 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V _First = const_cast(__std_find_trivial_1(_First, _Last, _Val)); void* _Out = _First; - if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes > 8) { + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) { const __m128i _Match = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); void* _Stop = _First; @@ -3721,7 +3721,7 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ _First = const_cast(__std_find_trivial_2(_First, _Last, _Val)); void* _Out = _First; - if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes > 16) { + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) { const __m128i _Match = _mm_set1_epi16(_Val); void* _Stop = _First; @@ -3745,7 +3745,7 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ _First = const_cast(__std_find_trivial_4(_First, _Last, _Val)); void* _Out = _First; - if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) { + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { const __m256i _Match = _mm256_set1_epi32(_Val); void* _Stop = _First; @@ -3771,7 +3771,7 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ _First = const_cast(__std_find_trivial_8(_First, _Last, _Val)); void* _Out = _First; - if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) { + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { const __m256i _Match = _mm256_set1_epi64x(_Val); void* _Stop = _First; From e6dc56d5028d366840a2467990664b902435fdba Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 6 Oct 2024 18:41:05 +0300 Subject: [PATCH 17/34] Still something on SSE4.2 for 32 and 64 bit eleemnts --- stl/src/vector_algorithms.cpp | 64 +++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ead1d54a93..9acd82a338 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3686,10 +3686,12 @@ namespace { return _Result; } - constexpr auto _Remove_patterns_1 = _Make_remove_patterns<256, 8>(1, 1); - constexpr auto _Remove_patterns_2 = _Make_remove_patterns<256, 16>(2, 2); - constexpr auto _Remove_patterns_4 = _Make_remove_patterns<256, 8>(4, 1); - constexpr auto _Remove_patterns_8 = _Make_remove_patterns<16, 8>(8, 2); + constexpr auto _Remove_patterns_1_sse = _Make_remove_patterns<256, 8>(1, 1); + constexpr auto _Remove_patterns_2_sse = _Make_remove_patterns<256, 16>(2, 2); + constexpr auto _Remove_patterns_4_sse = _Make_remove_patterns<16, 16>(4, 4); + constexpr auto _Remove_patterns_4_avx = _Make_remove_patterns<256, 8>(4, 1); + constexpr auto _Remove_patterns_8_sse = _Make_remove_patterns<4, 16>(8, 8); + constexpr auto _Remove_patterns_8_avx = _Make_remove_patterns<16, 8>(8, 2); } // unnamed namespace extern "C" { @@ -3706,10 +3708,10 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V do { const __m128i _Src = _mm_loadu_si64(_First); const unsigned _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; - const __m128i _Shuf = _mm_loadu_si64(_Remove_patterns_1._Shuf[_Bingo]); + const __m128i _Shuf = _mm_loadu_si64(_Remove_patterns_1_sse._Shuf[_Bingo]); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si64(_Out, _Dest); - _Advance_bytes(_Out, _Remove_patterns_1._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_1_sse._Size[_Bingo]); _Advance_bytes(_First, 8); } while (_First != _Stop); } @@ -3730,10 +3732,11 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); const unsigned _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_patterns_2._Shuf[_Bingo])); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + const __m128i _Shuf = + _mm_loadu_si128(reinterpret_cast(_Remove_patterns_2_sse._Shuf[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_2._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_2_sse._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } @@ -3745,7 +3748,8 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ _First = const_cast(__std_find_trivial_4(_First, _Last, _Val)); void* _Out = _First; - if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { + const size_t _Size_bytes = _Byte_length(_First, _Last); + if (_Use_avx2() && _Size_bytes >= 32) { const __m256i _Match = _mm256_set1_epi32(_Val); void* _Stop = _First; @@ -3754,14 +3758,30 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_4._Shuf[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_4_avx._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_4._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_4_avx._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } else if (_Use_sse42() && _Size_bytes >= 16) { + const __m128i _Match = _mm_set1_epi32(_Val); + + void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + do { + const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); + const __m128i _Mask = _mm_cmpeq_epi32(_Src, _Match); + const unsigned _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask)); + const __m128i _Shuf = + _mm_loadu_si128(reinterpret_cast(_Remove_patterns_4_sse._Shuf[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_patterns_4_sse._Size[_Bingo]); + _Advance_bytes(_First, 16); + } while (_First != _Stop); } return _Remove_fallback(_First, _Last, _Out, _Val); @@ -3780,14 +3800,30 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); const unsigned _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_8._Shuf[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_8_avx._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_8._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_patterns_8_avx._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } else if (_Use_sse42() && _Size_bytes >= 16) { + const __m128i _Match = _mm_set1_epi64x(_Val); + + void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + do { + const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); + const __m128i _Mask = _mm_cmpeq_epi64(_Src, _Match); + const unsigned _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask)); + const __m128i _Shuf = + _mm_loadu_si128(reinterpret_cast(_Remove_patterns_8_sse._Shuf[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_patterns_8_sse._Size[_Bingo]); + _Advance_bytes(_First, 16); + } while (_First != _Stop); } return _Remove_fallback(_First, _Last, _Out, _Val); From c48d887b89f49fb846c9e15eb4119b5c494be080 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 7 Oct 2024 07:31:38 +0300 Subject: [PATCH 18/34] Not patterns just tables --- stl/src/vector_algorithms.cpp | 41 ++++++++++++++++------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 9acd82a338..26d0c3ba96 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3646,7 +3646,7 @@ namespace { template - constexpr auto _Make_remove_patterns(const unsigned _Mul, const unsigned _Ew) { + constexpr auto _Make_remove_tables(const unsigned _Mul, const unsigned _Ew) { struct { uint8_t _Shuf[_Size_v][_Size_h]; uint8_t _Size[_Size_v]; @@ -3686,12 +3686,12 @@ namespace { return _Result; } - constexpr auto _Remove_patterns_1_sse = _Make_remove_patterns<256, 8>(1, 1); - constexpr auto _Remove_patterns_2_sse = _Make_remove_patterns<256, 16>(2, 2); - constexpr auto _Remove_patterns_4_sse = _Make_remove_patterns<16, 16>(4, 4); - constexpr auto _Remove_patterns_4_avx = _Make_remove_patterns<256, 8>(4, 1); - constexpr auto _Remove_patterns_8_sse = _Make_remove_patterns<4, 16>(8, 8); - constexpr auto _Remove_patterns_8_avx = _Make_remove_patterns<16, 8>(8, 2); + constexpr auto _Remove_tables_1_sse = _Make_remove_tables<256, 8>(1, 1); + constexpr auto _Remove_tables_2_sse = _Make_remove_tables<256, 16>(2, 2); + constexpr auto _Remove_tables_4_sse = _Make_remove_tables<16, 16>(4, 4); + constexpr auto _Remove_tables_4_avx = _Make_remove_tables<256, 8>(4, 1); + constexpr auto _Remove_tables_8_sse = _Make_remove_tables<4, 16>(8, 8); + constexpr auto _Remove_tables_8_avx = _Make_remove_tables<16, 8>(8, 2); } // unnamed namespace extern "C" { @@ -3708,10 +3708,10 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V do { const __m128i _Src = _mm_loadu_si64(_First); const unsigned _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; - const __m128i _Shuf = _mm_loadu_si64(_Remove_patterns_1_sse._Shuf[_Bingo]); + const __m128i _Shuf = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si64(_Out, _Dest); - _Advance_bytes(_Out, _Remove_patterns_1_sse._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_tables_1_sse._Size[_Bingo]); _Advance_bytes(_First, 8); } while (_First != _Stop); } @@ -3732,11 +3732,10 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); const unsigned _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); - const __m128i _Shuf = - _mm_loadu_si128(reinterpret_cast(_Remove_patterns_2_sse._Shuf[_Bingo])); + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_2_sse._Shuf[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_2_sse._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_tables_2_sse._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } @@ -3758,10 +3757,10 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_4_avx._Shuf[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_4_avx._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_tables_4_avx._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); @@ -3775,11 +3774,10 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi32(_Src, _Match); const unsigned _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask)); - const __m128i _Shuf = - _mm_loadu_si128(reinterpret_cast(_Remove_patterns_4_sse._Shuf[_Bingo])); + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_4_sse._Shuf[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_4_sse._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_tables_4_sse._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } @@ -3800,10 +3798,10 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); const unsigned _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_patterns_8_avx._Shuf[_Bingo])); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_8_avx._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_tables_8_avx._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); @@ -3817,11 +3815,10 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi64(_Src, _Match); const unsigned _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask)); - const __m128i _Shuf = - _mm_loadu_si128(reinterpret_cast(_Remove_patterns_8_sse._Shuf[_Bingo])); + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_8_sse._Shuf[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_patterns_8_sse._Size[_Bingo]); + _Advance_bytes(_Out, _Remove_tables_8_sse._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } From eaa1ec06a6f3108d021d7a3f89aba68e84127006 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 23 Oct 2024 07:52:42 +0300 Subject: [PATCH 19/34] unsigned -> uint32_t --- stl/src/vector_algorithms.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 4167d231cc..059f5f03a4 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3879,21 +3879,21 @@ namespace { template - constexpr auto _Make_remove_tables(const unsigned _Mul, const unsigned _Ew) { + constexpr auto _Make_remove_tables(const uint32_t _Mul, const uint32_t _Ew) { struct { uint8_t _Shuf[_Size_v][_Size_h]; uint8_t _Size[_Size_v]; } _Result; - for (unsigned _Vx = 0; _Vx != _Size_v; ++_Vx) { - unsigned _Nx = 0; + for (uint32_t _Vx = 0; _Vx != _Size_v; ++_Vx) { + uint32_t _Nx = 0; // Make shuffle mask for pshufb / vpermd corresponding to _Vx bit value. // Every bit set corresponds to element skipped. - for (unsigned _Hx = 0; _Hx != _Size_h / _Ew; ++_Hx) { + for (uint32_t _Hx = 0; _Hx != _Size_h / _Ew; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements - for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { + for (uint32_t _Ex = 0; _Ex != _Ew; ++_Ex) { _Result._Shuf[_Vx][_Nx * _Ew + _Ex] = static_cast(_Hx * _Ew + _Ex); } ++_Nx; @@ -3910,7 +3910,7 @@ namespace { // rather than zero, to reduce the surprising behavior. for (; _Nx != _Size_h / _Ew; ++_Nx) { // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements - for (unsigned _Ex = 0; _Ex != _Ew; ++_Ex) { + for (uint32_t _Ex = 0; _Ex != _Ew; ++_Ex) { _Result._Shuf[_Vx][_Nx * _Ew + _Ex] = static_cast(_Nx * _Ew + _Ex); } } @@ -3940,7 +3940,7 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V _Advance_bytes(_Stop, _Size_bytes & ~size_t{7}); do { const __m128i _Src = _mm_loadu_si64(_First); - const unsigned _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; + const uint32_t _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; const __m128i _Shuf = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si64(_Out, _Dest); @@ -3964,7 +3964,7 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ do { const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); - const unsigned _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); + const uint32_t _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_2_sse._Shuf[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); @@ -3989,7 +3989,7 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ do { const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); - const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); @@ -4006,7 +4006,7 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ do { const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi32(_Src, _Match); - const unsigned _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask)); + const uint32_t _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask)); const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_4_sse._Shuf[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); @@ -4030,7 +4030,7 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ do { const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); - const unsigned _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); + const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo])); const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); @@ -4047,7 +4047,7 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ do { const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); const __m128i _Mask = _mm_cmpeq_epi64(_Src, _Match); - const unsigned _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask)); + const uint32_t _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask)); const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_8_sse._Shuf[_Bingo])); const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); From 6fb36ce911405ffccd6751e727aa645651ec175a Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 23 Oct 2024 07:54:57 +0300 Subject: [PATCH 20/34] parts --- stl/src/vector_algorithms.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 059f5f03a4..c44580a145 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3892,7 +3892,8 @@ namespace { // Every bit set corresponds to element skipped. for (uint32_t _Hx = 0; _Hx != _Size_h / _Ew; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { - // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements + // Inner loop needed for cases where shuffle mask operate on element parts rather than whole + // elements; for whole elements there would be one iteration for (uint32_t _Ex = 0; _Ex != _Ew; ++_Ex) { _Result._Shuf[_Vx][_Nx * _Ew + _Ex] = static_cast(_Hx * _Ew + _Ex); } From 877a48b8b1096d8b1ac1e928a3e1f52497eb93b8 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 23 Oct 2024 07:59:44 +0300 Subject: [PATCH 21/34] order --- .../VSO_0000000_vector_algorithms/test.cpp | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 475b3dccca..613569d38a 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -749,33 +749,6 @@ FwdIt2 last_known_good_swap_ranges(FwdIt1 first1, const FwdIt1 last1, FwdIt2 des return dest; } -template -void test_swap_ranges(mt19937_64& gen) { - const auto fn = [&]() { return static_cast(gen()); }; - vector left(dataCount); - vector right(dataCount); - generate(left.begin(), left.end(), fn); - generate(right.begin(), right.end(), fn); - - auto leftCopy = left; - auto rightCopy = right; - - for (ptrdiff_t attempts = 0; attempts < static_cast(dataCount); ++attempts) { - assert(right.begin() + attempts == swap_ranges(left.begin(), left.begin() + attempts, right.begin())); - last_known_good_swap_ranges(leftCopy.begin(), leftCopy.begin() + attempts, rightCopy.begin()); - assert(left == leftCopy); - assert(right == rightCopy); - - // also test unaligned input - const auto endOffset = min(static_cast(dataCount), attempts + 1); - assert( - right.begin() + (endOffset - 1) == swap_ranges(left.begin() + 1, left.begin() + endOffset, right.begin())); - last_known_good_swap_ranges(leftCopy.begin() + 1, leftCopy.begin() + endOffset, rightCopy.begin()); - assert(left == leftCopy); - assert(right == rightCopy); - } -} - template FwdIt last_known_good_remove(FwdIt first, FwdIt last, T val) { FwdIt dest = first; @@ -816,7 +789,7 @@ void test_remove(mt19937_64& gen) { vector in_out_actual; vector in_out_actual_r; - for (auto v : {&source, &in_out_expected, &in_out_actual, &in_out_actual_r}) { + for (const auto& v : {&source, &in_out_expected, &in_out_actual, &in_out_actual_r}) { v->reserve(dataCount); } @@ -824,7 +797,7 @@ void test_remove(mt19937_64& gen) { for (size_t attempts = 0; attempts < dataCount; ++attempts) { source.push_back(static_cast(dis(gen))); - for (auto v : {&in_out_expected, &in_out_actual, &in_out_actual_r}) { + for (const auto& v : {&in_out_expected, &in_out_actual, &in_out_actual_r}) { *v = source; } @@ -832,6 +805,33 @@ void test_remove(mt19937_64& gen) { } } +template +void test_swap_ranges(mt19937_64& gen) { + const auto fn = [&]() { return static_cast(gen()); }; + vector left(dataCount); + vector right(dataCount); + generate(left.begin(), left.end(), fn); + generate(right.begin(), right.end(), fn); + + auto leftCopy = left; + auto rightCopy = right; + + for (ptrdiff_t attempts = 0; attempts < static_cast(dataCount); ++attempts) { + assert(right.begin() + attempts == swap_ranges(left.begin(), left.begin() + attempts, right.begin())); + last_known_good_swap_ranges(leftCopy.begin(), leftCopy.begin() + attempts, rightCopy.begin()); + assert(left == leftCopy); + assert(right == rightCopy); + + // also test unaligned input + const auto endOffset = min(static_cast(dataCount), attempts + 1); + assert( + right.begin() + (endOffset - 1) == swap_ranges(left.begin() + 1, left.begin() + endOffset, right.begin())); + last_known_good_swap_ranges(leftCopy.begin() + 1, leftCopy.begin() + endOffset, rightCopy.begin()); + assert(left == leftCopy); + assert(right == rightCopy); + } +} + void test_vector_algorithms(mt19937_64& gen) { test_count(gen); test_count(gen); From 7c9fb18db36ad6686d10ddaa85a5134abcb3cc34 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 23 Oct 2024 08:01:20 +0300 Subject: [PATCH 22/34] Wait, is it C++20? Always has been --- stl/inc/algorithm | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 3f47e66de9..8be809fbf7 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -4590,12 +4590,9 @@ namespace ranges { _STL_INTERNAL_STATIC_ASSERT(indirect_binary_predicate, const _Ty*>); #if _USE_STD_VECTOR_ALGORITHMS -#if _HAS_CXX20 if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty> && sized_sentinel_for<_Se, _It> && is_same_v<_Pj, identity>) { - if (!_STD is_constant_evaluated()) -#endif // _HAS_CXX20 - { + if (!_STD is_constant_evaluated()) { const auto _Size = _Last - _First; if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) { return {_First + _Size, _First + _Size}; From fa245b071f071e7f6b0bb2564344e79993439933 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 23 Oct 2024 08:05:51 +0300 Subject: [PATCH 23/34] Out of ARM --- stl/src/vector_algorithms.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index c44580a145..87d7e8969d 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3877,7 +3877,7 @@ namespace { return _Dest; } - +#ifndef _M_ARM64EC template constexpr auto _Make_remove_tables(const uint32_t _Mul, const uint32_t _Ew) { struct { @@ -3926,6 +3926,7 @@ namespace { constexpr auto _Remove_tables_4_avx = _Make_remove_tables<256, 8>(4, 1); constexpr auto _Remove_tables_8_sse = _Make_remove_tables<4, 16>(8, 8); constexpr auto _Remove_tables_8_avx = _Make_remove_tables<16, 8>(8, 2); +#endif // !defined(_M_ARM64EC) } // unnamed namespace extern "C" { @@ -3934,6 +3935,7 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V _First = const_cast(__std_find_trivial_1(_First, _Last, _Val)); void* _Out = _First; +#ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) { const __m128i _Match = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); @@ -3949,6 +3951,7 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V _Advance_bytes(_First, 8); } while (_First != _Stop); } +#endif // !defined(_M_ARM64EC) return _Remove_fallback(_First, _Last, _Out, _Val); } @@ -3957,6 +3960,7 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ _First = const_cast(__std_find_trivial_2(_First, _Last, _Val)); void* _Out = _First; +#ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) { const __m128i _Match = _mm_set1_epi16(_Val); @@ -3973,6 +3977,7 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ _Advance_bytes(_First, 16); } while (_First != _Stop); } +#endif // !defined(_M_ARM64EC) return _Remove_fallback(_First, _Last, _Out, _Val); } @@ -3981,6 +3986,7 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ _First = const_cast(__std_find_trivial_4(_First, _Last, _Val)); void* _Out = _First; +#ifndef _M_ARM64EC const size_t _Size_bytes = _Byte_length(_First, _Last); if (_Use_avx2() && _Size_bytes >= 32) { const __m256i _Match = _mm256_set1_epi32(_Val); @@ -4015,6 +4021,7 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ _Advance_bytes(_First, 16); } while (_First != _Stop); } +#endif // !defined(_M_ARM64EC) return _Remove_fallback(_First, _Last, _Out, _Val); } @@ -4023,6 +4030,7 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ _First = const_cast(__std_find_trivial_8(_First, _Last, _Val)); void* _Out = _First; +#ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { const __m256i _Match = _mm256_set1_epi64x(_Val); @@ -4056,6 +4064,7 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ _Advance_bytes(_First, 16); } while (_First != _Stop); } +#endif // !defined(_M_ARM64EC) return _Remove_fallback(_First, _Last, _Out, _Val); } From 55e632f3dc6cf2c4cd581bd3351974639aa062dc Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 08:01:07 -0700 Subject: [PATCH 24/34] Use `string_view` for `lorem_ipsum`. search.cpp can construct `std::string` directly from `lorem_ipsum`. sv_equal.cpp can use `lorem_ipsum.substr(0, 2048)`. --- benchmarks/inc/lorem.hpp | 4 +++- benchmarks/src/remove.cpp | 4 ++-- benchmarks/src/replace.cpp | 12 ++++++------ benchmarks/src/search.cpp | 8 ++++---- benchmarks/src/sv_equal.cpp | 2 +- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/benchmarks/inc/lorem.hpp b/benchmarks/inc/lorem.hpp index e7e70aa20f..5671e2ae7d 100644 --- a/benchmarks/inc/lorem.hpp +++ b/benchmarks/inc/lorem.hpp @@ -3,7 +3,9 @@ #pragma once -const char lorem_ipsum[] = +#include + +inline constexpr std::string_view lorem_ipsum = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam mollis imperdiet massa, at dapibus elit interdum " "ac. In eget sollicitudin mi. Nam at tellus at sapien tincidunt sollicitudin vel non eros. Pellentesque nunc nunc, " "ullamcorper eu accumsan at, pulvinar non turpis. Quisque vel mauris pulvinar, pretium purus vel, ultricies erat. " diff --git a/benchmarks/src/remove.cpp b/benchmarks/src/remove.cpp index 2e47fbcc20..ce6bf4fe91 100644 --- a/benchmarks/src/remove.cpp +++ b/benchmarks/src/remove.cpp @@ -12,9 +12,9 @@ enum class alg_type { std_fn, rng }; template void r(benchmark::State& state) { - std::vector src(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + std::vector src(lorem_ipsum.begin(), lorem_ipsum.end()); std::vector v; - v.reserve(std::size(lorem_ipsum)); + v.reserve(lorem_ipsum.size()); for (auto _ : state) { v = src; benchmark::DoNotOptimize(v); diff --git a/benchmarks/src/replace.cpp b/benchmarks/src/replace.cpp index ea83dc8d4e..7a41e04156 100644 --- a/benchmarks/src/replace.cpp +++ b/benchmarks/src/replace.cpp @@ -10,8 +10,8 @@ template void r(benchmark::State& state) { - const std::vector a(std::begin(lorem_ipsum), std::end(lorem_ipsum)); - std::vector b(std::size(lorem_ipsum)); + const std::vector a(lorem_ipsum.begin(), lorem_ipsum.end()); + std::vector b(lorem_ipsum.size()); for (auto _ : state) { b = a; @@ -21,8 +21,8 @@ void r(benchmark::State& state) { template void rc(benchmark::State& state) { - const std::vector a(std::begin(lorem_ipsum), std::end(lorem_ipsum)); - std::vector b(std::size(lorem_ipsum)); + const std::vector a(lorem_ipsum.begin(), lorem_ipsum.end()); + std::vector b(lorem_ipsum.size()); for (auto _ : state) { std::replace_copy(std::begin(a), std::end(a), std::begin(b), T{'m'}, T{'w'}); @@ -31,8 +31,8 @@ void rc(benchmark::State& state) { template void rc_if(benchmark::State& state) { - const std::vector a(std::begin(lorem_ipsum), std::end(lorem_ipsum)); - std::vector b(std::size(lorem_ipsum)); + const std::vector a(lorem_ipsum.begin(), lorem_ipsum.end()); + std::vector b(lorem_ipsum.size()); for (auto _ : state) { (void) std::replace_copy_if( diff --git a/benchmarks/src/search.cpp b/benchmarks/src/search.cpp index 2d5265e0f1..5805743dfd 100644 --- a/benchmarks/src/search.cpp +++ b/benchmarks/src/search.cpp @@ -23,7 +23,7 @@ constexpr std::array patterns = { void c_strstr(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::string haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + const std::string haystack(lorem_ipsum); const std::string needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { @@ -38,7 +38,7 @@ template void classic_search(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::vector haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + const std::vector haystack(lorem_ipsum.begin(), lorem_ipsum.end()); const std::vector needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { @@ -53,7 +53,7 @@ template void ranges_search(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::vector haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + const std::vector haystack(lorem_ipsum.begin(), lorem_ipsum.end()); const std::vector needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { @@ -68,7 +68,7 @@ template void search_default_searcher(benchmark::State& state) { const auto& src_needle = patterns[static_cast(state.range())]; - const std::vector haystack(std::begin(lorem_ipsum), std::end(lorem_ipsum)); + const std::vector haystack(lorem_ipsum.begin(), lorem_ipsum.end()); const std::vector needle(std::begin(src_needle), std::end(src_needle)); for (auto _ : state) { diff --git a/benchmarks/src/sv_equal.cpp b/benchmarks/src/sv_equal.cpp index b0f95da6f2..219b6b927c 100644 --- a/benchmarks/src/sv_equal.cpp +++ b/benchmarks/src/sv_equal.cpp @@ -10,7 +10,7 @@ #include "lorem.hpp" -constexpr std::string_view haystack(lorem_ipsum, lorem_ipsum + 2048); +constexpr std::string_view haystack = lorem_ipsum.substr(0, 2048); constexpr std::size_t Count = 8u; From dd093d071907f2c5f145b38ec55f9e07f4c1f61b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 08:16:13 -0700 Subject: [PATCH 25/34] Add `const`. --- benchmarks/src/remove.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/src/remove.cpp b/benchmarks/src/remove.cpp index ce6bf4fe91..f0d28f6d73 100644 --- a/benchmarks/src/remove.cpp +++ b/benchmarks/src/remove.cpp @@ -12,7 +12,7 @@ enum class alg_type { std_fn, rng }; template void r(benchmark::State& state) { - std::vector src(lorem_ipsum.begin(), lorem_ipsum.end()); + const std::vector src(lorem_ipsum.begin(), lorem_ipsum.end()); std::vector v; v.reserve(lorem_ipsum.size()); for (auto _ : state) { From 2e704e362edcecabf1f05e7ac12dba8fbae580b1 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 08:22:18 -0700 Subject: [PATCH 26/34] Add `noexcept` to `_Meow_vectorized`. --- stl/inc/algorithm | 2 +- stl/inc/xmemory | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 8be809fbf7..5c9143afd0 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -79,7 +79,7 @@ __declspec(noalias) void __stdcall __std_replace_8( _STD_BEGIN template -__declspec(noalias) void _Reverse_copy_vectorized(const void* _First, const void* _Last, void* _Dest) { +__declspec(noalias) void _Reverse_copy_vectorized(const void* _First, const void* _Last, void* _Dest) noexcept { if constexpr (_Nx == 1) { ::__std_reverse_copy_trivially_copyable_1(_First, _Last, _Dest); } else if constexpr (_Nx == 2) { diff --git a/stl/inc/xmemory b/stl/inc/xmemory index f9bb68e8a2..ecd0767c9b 100644 --- a/stl/inc/xmemory +++ b/stl/inc/xmemory @@ -35,7 +35,7 @@ void* __stdcall __std_remove_8(void* _First, void* _Last, uint64_t _Val) noexcep _STD_BEGIN template -_Ty* _Remove_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val) { +_Ty* _Remove_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val) noexcept { if constexpr (is_pointer_v<_Ty>) { #ifdef _WIN64 return reinterpret_cast<_Ty*>(::__std_remove_8(_First, _Last, reinterpret_cast(_Val))); From dc5f773599f3d4d11ca7db2463a8609817ae83b7 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 08:53:11 -0700 Subject: [PATCH 27/34] After calling `_STD` or `_RANGES` `_Find_unchecked`, we know we've found a value to remove. Move the vectorized codepath below the existing call to `_RANGES _Find_unchecked`. Drop `_Could_compare_equal_to_value_type`, as `_Find_unchecked` has handled that and found a value. Finally, `__std_remove_N` doesn't need to start with `__std_find_trivial_N`. --- stl/inc/algorithm | 17 +++++++---------- stl/inc/xmemory | 4 ---- stl/src/vector_algorithms.cpp | 4 ---- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 5c9143afd0..b6c1855f1c 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -4589,14 +4589,17 @@ namespace ranges { _STL_INTERNAL_STATIC_ASSERT(sentinel_for<_Se, _It>); _STL_INTERNAL_STATIC_ASSERT(indirect_binary_predicate, const _Ty*>); + _First = _RANGES _Find_unchecked(_STD move(_First), _Last, _Val, _Proj); + auto _Next = _First; + if (_First == _Last) { + return {_STD move(_Next), _STD move(_First)}; + } + #if _USE_STD_VECTOR_ALGORITHMS if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty> && sized_sentinel_for<_Se, _It> && is_same_v<_Pj, identity>) { if (!_STD is_constant_evaluated()) { - const auto _Size = _Last - _First; - if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) { - return {_First + _Size, _First + _Size}; - } + const auto _Size = _Last - _First; const auto _First_ptr = _STD _To_address(_First); const auto _Last_ptr = _First_ptr + static_cast(_Size); const auto _Result = _STD _Remove_vectorized(_First_ptr, _Last_ptr, _Val); @@ -4610,12 +4613,6 @@ namespace ranges { } #endif // _USE_STD_VECTOR_ALGORITHMS - _First = _RANGES _Find_unchecked(_STD move(_First), _Last, _Val, _Proj); - auto _Next = _First; - if (_First == _Last) { - return {_STD move(_Next), _STD move(_First)}; - } - while (++_First != _Last) { if (_STD invoke(_Proj, *_First) != _Val) { *_Next = _RANGES iter_move(_First); diff --git a/stl/inc/xmemory b/stl/inc/xmemory index ecd0767c9b..108e6b861c 100644 --- a/stl/inc/xmemory +++ b/stl/inc/xmemory @@ -2237,10 +2237,6 @@ _NODISCARD_REMOVE_ALG _CONSTEXPR20 _FwdIt remove(_FwdIt _First, const _FwdIt _La if (!_STD is_constant_evaluated()) #endif // _HAS_CXX20 { - if (!_STD _Could_compare_equal_to_value_type(_Val)) { - return _Last; - } - const auto _First_ptr = _STD _To_address(_UFirst); const auto _Result = _STD _Remove_vectorized(_First_ptr, _STD _To_address(_ULast), _Val); diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 87d7e8969d..9eeb20e534 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3932,7 +3932,6 @@ namespace { extern "C" { void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _Val) noexcept { - _First = const_cast(__std_find_trivial_1(_First, _Last, _Val)); void* _Out = _First; #ifndef _M_ARM64EC @@ -3957,7 +3956,6 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V } void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _Val) noexcept { - _First = const_cast(__std_find_trivial_2(_First, _Last, _Val)); void* _Out = _First; #ifndef _M_ARM64EC @@ -3983,7 +3981,6 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ } void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _Val) noexcept { - _First = const_cast(__std_find_trivial_4(_First, _Last, _Val)); void* _Out = _First; #ifndef _M_ARM64EC @@ -4027,7 +4024,6 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ } void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _Val) noexcept { - _First = const_cast(__std_find_trivial_8(_First, _Last, _Val)); void* _Out = _First; #ifndef _M_ARM64EC From 400d7781dc4f425498b3e90a5636910d330ac977 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 09:04:42 -0700 Subject: [PATCH 28/34] In C++20, use `to_address`. --- stl/inc/algorithm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index b6c1855f1c..a07e57eca0 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -4600,7 +4600,7 @@ namespace ranges { && is_same_v<_Pj, identity>) { if (!_STD is_constant_evaluated()) { const auto _Size = _Last - _First; - const auto _First_ptr = _STD _To_address(_First); + const auto _First_ptr = _STD to_address(_First); const auto _Last_ptr = _First_ptr + static_cast(_Size); const auto _Result = _STD _Remove_vectorized(_First_ptr, _Last_ptr, _Val); From 93b9222b42c32b246e849f42fa67c5710b459d1e Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 09:04:56 -0700 Subject: [PATCH 29/34] Drop unnecessary `static_cast`s. --- stl/inc/algorithm | 2 +- stl/inc/xmemory | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index a07e57eca0..09f0f12fa6 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -4605,7 +4605,7 @@ namespace ranges { const auto _Result = _STD _Remove_vectorized(_First_ptr, _Last_ptr, _Val); if constexpr (is_pointer_v<_It>) { - return {static_cast<_It>(_Result), static_cast<_It>(_Last_ptr)}; + return {_Result, _Last_ptr}; } else { return {_First + (_Result - _First_ptr), _First + _Size}; } diff --git a/stl/inc/xmemory b/stl/inc/xmemory index 108e6b861c..b8e62c2619 100644 --- a/stl/inc/xmemory +++ b/stl/inc/xmemory @@ -2241,7 +2241,7 @@ _NODISCARD_REMOVE_ALG _CONSTEXPR20 _FwdIt remove(_FwdIt _First, const _FwdIt _La const auto _Result = _STD _Remove_vectorized(_First_ptr, _STD _To_address(_ULast), _Val); if constexpr (is_pointer_v) { - _UNext = static_cast(_Result); + _UNext = _Result; } else { _UNext += _Result - _First_ptr; } From 1722ec40b85c80d465efb231a13b108f1a88459b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 09:27:07 -0700 Subject: [PATCH 30/34] Fix argument order. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 613569d38a..81ebe82c30 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -793,7 +793,7 @@ void test_remove(mt19937_64& gen) { v->reserve(dataCount); } - test_case_remove(in_out_actual, in_out_expected, in_out_actual_r, static_cast(dis(gen))); + test_case_remove(in_out_expected, in_out_actual, in_out_actual_r, static_cast(dis(gen))); for (size_t attempts = 0; attempts < dataCount; ++attempts) { source.push_back(static_cast(dis(gen))); From 09935327ac9c0efa0e93d29dd2cd7e0d0fa39fb5 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 09:50:57 -0700 Subject: [PATCH 31/34] Comment cleanups. Drop the bit about "surprising behavior". --- stl/src/vector_algorithms.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 9eeb20e534..e93f642b49 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3889,11 +3889,11 @@ namespace { uint32_t _Nx = 0; // Make shuffle mask for pshufb / vpermd corresponding to _Vx bit value. - // Every bit set corresponds to element skipped. + // Every bit set corresponds to an element skipped. for (uint32_t _Hx = 0; _Hx != _Size_h / _Ew; ++_Hx) { if ((_Vx & (1 << _Hx)) == 0) { - // Inner loop needed for cases where shuffle mask operate on element parts rather than whole - // elements; for whole elements there would be one iteration + // Inner loop needed for cases where the shuffle mask operates on element parts rather than whole + // elements; for whole elements there would be one iteration. for (uint32_t _Ex = 0; _Ex != _Ew; ++_Ex) { _Result._Shuf[_Vx][_Nx * _Ew + _Ex] = static_cast(_Hx * _Ew + _Ex); } @@ -3901,16 +3901,16 @@ namespace { } } - // Size of elements that are not removed in bytes + // Size of elements that are not removed in bytes. _Result._Size[_Vx] = static_cast(_Nx * _Mul); // Fill the remaining with arbitrary elements. - // It is not possible to leave them untouched, while keeping this optimization efficient. + // It is not possible to leave them untouched while keeping this optimization efficient. // This should not be a problem though, as they should be either overwritten by the next step, - // or left in the removed range. Still setting them to the values of some of existing elements, - // rather than zero, to reduce the surprising behavior. + // or left in the removed range. for (; _Nx != _Size_h / _Ew; ++_Nx) { - // Inner loop needed for cases where shuffle mask operate on element pars rather than whole elements + // Inner loop needed for cases where the shuffle mask operates on element parts rather than whole + // elements; for whole elements there would be one iteration. for (uint32_t _Ex = 0; _Ex != _Ew; ++_Ex) { _Result._Shuf[_Vx][_Nx * _Ew + _Ex] = static_cast(_Nx * _Ew + _Ex); } From afc9543ea5bd96558ee2a0fb7276de60163ca62b Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 09:59:51 -0700 Subject: [PATCH 32/34] Give `_Remove_tables` a name. --- stl/src/vector_algorithms.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e93f642b49..d5e6eb5d1b 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3878,12 +3878,15 @@ namespace { } #ifndef _M_ARM64EC + template + struct _Remove_tables { + uint8_t _Shuf[_Size_v][_Size_h]; + uint8_t _Size[_Size_v]; + }; + template constexpr auto _Make_remove_tables(const uint32_t _Mul, const uint32_t _Ew) { - struct { - uint8_t _Shuf[_Size_v][_Size_h]; - uint8_t _Size[_Size_v]; - } _Result; + _Remove_tables<_Size_v, _Size_h> _Result; for (uint32_t _Vx = 0; _Vx != _Size_v; ++_Vx) { uint32_t _Nx = 0; From 2dd0b410f3afe64f11e842896f861977e9a52c41 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Wed, 23 Oct 2024 10:08:02 -0700 Subject: [PATCH 33/34] Scope `_Size_bytes` within `if` in `__std_remove_4`. --- stl/src/vector_algorithms.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index d5e6eb5d1b..ad96ea72a5 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -3987,8 +3987,7 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ void* _Out = _First; #ifndef _M_ARM64EC - const size_t _Size_bytes = _Byte_length(_First, _Last); - if (_Use_avx2() && _Size_bytes >= 32) { + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { const __m256i _Match = _mm256_set1_epi32(_Val); void* _Stop = _First; From 4a68a287bf8af2facc4ec77886b3b865b76b21f0 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 24 Oct 2024 07:52:32 -0700 Subject: [PATCH 34/34] Fix the merge conflicts properly. --- benchmarks/src/search.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/src/search.cpp b/benchmarks/src/search.cpp index dc1c78ba2f..3bbf45f70e 100644 --- a/benchmarks/src/search.cpp +++ b/benchmarks/src/search.cpp @@ -52,8 +52,8 @@ void c_strstr(benchmark::State& state) { const auto& src_haystack = patterns[static_cast(state.range())].data; const auto& src_needle = patterns[static_cast(state.range())].pattern; - const std::string haystack(lorem_ipsum); - const std::string needle(std::begin(src_needle), std::end(src_needle)); + const std::string haystack(src_haystack); + const std::string needle(src_needle); for (auto _ : state) { benchmark::DoNotOptimize(haystack); @@ -68,8 +68,8 @@ void classic_search(benchmark::State& state) { const auto& src_haystack = patterns[static_cast(state.range())].data; const auto& src_needle = patterns[static_cast(state.range())].pattern; - const std::vector haystack(lorem_ipsum.begin(), lorem_ipsum.end()); - const std::vector needle(std::begin(src_needle), std::end(src_needle)); + const std::vector haystack(src_haystack.begin(), src_haystack.end()); + const std::vector needle(src_needle.begin(), src_needle.end()); for (auto _ : state) { benchmark::DoNotOptimize(haystack); @@ -84,8 +84,8 @@ void ranges_search(benchmark::State& state) { const auto& src_haystack = patterns[static_cast(state.range())].data; const auto& src_needle = patterns[static_cast(state.range())].pattern; - const std::vector haystack(lorem_ipsum.begin(), lorem_ipsum.end()); - const std::vector needle(std::begin(src_needle), std::end(src_needle)); + const std::vector haystack(src_haystack.begin(), src_haystack.end()); + const std::vector needle(src_needle.begin(), src_needle.end()); for (auto _ : state) { benchmark::DoNotOptimize(haystack); @@ -100,8 +100,8 @@ void search_default_searcher(benchmark::State& state) { const auto& src_haystack = patterns[static_cast(state.range())].data; const auto& src_needle = patterns[static_cast(state.range())].pattern; - const std::vector haystack(lorem_ipsum.begin(), lorem_ipsum.end()); - const std::vector needle(std::begin(src_needle), std::end(src_needle)); + const std::vector haystack(src_haystack.begin(), src_haystack.end()); + const std::vector needle(src_needle.begin(), src_needle.end()); for (auto _ : state) { benchmark::DoNotOptimize(haystack); @@ -116,8 +116,8 @@ void classic_find_end(benchmark::State& state) { const auto& src_haystack = patterns[static_cast(state.range())].data; const auto& src_needle = patterns[static_cast(state.range())].pattern; - const std::vector haystack(std::begin(src_haystack), std::end(src_haystack)); - const std::vector needle(std::begin(src_needle), std::end(src_needle)); + const std::vector haystack(src_haystack.begin(), src_haystack.end()); + const std::vector needle(src_needle.begin(), src_needle.end()); for (auto _ : state) { benchmark::DoNotOptimize(haystack); @@ -132,8 +132,8 @@ void ranges_find_end(benchmark::State& state) { const auto& src_haystack = patterns[static_cast(state.range())].data; const auto& src_needle = patterns[static_cast(state.range())].pattern; - const std::vector haystack(std::begin(src_haystack), std::end(src_haystack)); - const std::vector needle(std::begin(src_needle), std::end(src_needle)); + const std::vector haystack(src_haystack.begin(), src_haystack.end()); + const std::vector needle(src_needle.begin(), src_needle.end()); for (auto _ : state) { benchmark::DoNotOptimize(haystack);