-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Avoid unconditional make_heap
for priority_queue::push_range
#4025
Conversation
The following benchmark compares remaking the whole heap with a push_heap loop. Additional benchmark:

#include <benchmark/benchmark.h>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <random>
#include <span>
#include <string>
#include <vector>
using namespace std;
namespace {
constexpr size_t vec_size = 10000000;
// Builds a vector of `vsize` values of type T from a deterministic
// mt19937_64 stream (seed 1), so every benchmark run sees identical data.
//
// Generalized: accepts any callable mapping uint64_t -> T instead of
// std::function<T(uint64_t)>, removing the per-element type-erased
// indirect call during setup while staying call-compatible with all
// existing callers (function pointers and lambdas both deduce).
template <class T, class Transform>
auto create_vec(std::size_t vsize, Transform transform) {
    std::vector<T> vec(vsize);
    std::mt19937_64 rnd(1); // fixed seed: reproducible benchmark input
    for (auto& e : vec) {
        e = transform(rnd());
    }
    return vec;
}
// Narrows a raw 64-bit random value to T via static_cast
// (modular truncation for unsigned integer targets).
template <class T>
T cast_to(uint64_t val) {
    const T converted = static_cast<T>(val);
    return converted;
}
const auto vec_u8 = create_vec<uint8_t>(vec_size, cast_to<uint8_t>);
const auto vec_u16 = create_vec<uint16_t>(vec_size, cast_to<uint16_t>);
const auto vec_u32 = create_vec<uint32_t>(vec_size, cast_to<uint32_t>);
const auto vec_u64 = create_vec<uint64_t>(vec_size, cast_to<uint64_t>);
const auto vec_float = create_vec<float>(vec_size, cast_to<float>);
const auto vec_double = create_vec<double>(vec_size, cast_to<double>);
const auto vec_str = create_vec<string>(vec_size, [](uint64_t v) {
return to_string(static_cast<uint32_t>(v));
});
const auto vec_wstr = create_vec<wstring>(vec_size, [](uint64_t v) {
return to_wstring(static_cast<uint32_t>(v));
});
using keypair = array<uint64_t, 2>;
const auto vec_keypair = create_vec<keypair>(vec_size, [](uint64_t v) {
keypair pr{};
pr[0] = v & 0xffffffff;
pr[1] = v >> 32;
return pr;
});
// Benchmark payload with a 64-bit ordering key plus an unrelated data
// member; ordering compares only `key`.
struct keyobj {
    uint64_t key;
    uint64_t dat;
    bool operator<(const keyobj& r) const {
        return r.key > key;
    }
};
const auto vec_keyobj = create_vec<keyobj>(vec_size, [](uint64_t v) {
keyobj obj;
obj.key = v;
obj.dat = v;
return obj;
});
// Setup hook: prints a "↑vs↓" separator exactly once per instantiation
// (function-local static guard), marking the boundary between the two
// strategy flavors in the console output.
template <size_t L>
void putvs(const benchmark::State&) {
    static const bool printed = (puts("↑vs↓"), true);
    (void) printed;
}
// Setup hook: prints a 94-character dashed ruler exactly once per
// instantiation, visually separating benchmark groups in the output.
template <size_t L>
void putln(const benchmark::State&) {
    static const bool printed = (puts(string(94, '-').c_str()), true);
    (void) printed;
}
template <class T, const auto& Data, bool Remake>
void BM_test(benchmark::State& state) {
const size_t size_a = static_cast<size_t>(state.range(0));
const size_t rate = static_cast<size_t>(state.range(1)); // new / old
const size_t size_b = size_a * (rate - 1);
for (auto _ : state) {
state.PauseTiming();
span spn(Data);
assert(spn.size() >= size_a + size_b);
vector<T> c;
c.append_range(spn.subspan(0, size_a));
std::make_heap(c.begin(), c.end());
spn = spn.subspan(size_a);
c.append_range(spn.subspan(0, size_b));
state.ResumeTiming();
if constexpr (Remake) {
std::make_heap(c.begin(), c.end());
}
else {
const auto _Begin = _Get_unwrapped(c.begin());
auto _Heap_end = _Begin + size_a;
const auto _End = _Get_unwrapped(c.end());
while (_Heap_end != _End) {
std::push_heap(_Begin, ++_Heap_end);
}
}
benchmark::DoNotOptimize(c);
}
}
}
enum :bool { Remake = true, PushEach = false };
const int ratio = 2; // new_size / old_size, >=2
#define ADD_BENCHMARK(T,source) \
BENCHMARK(BM_test<T, source, Remake>)->ArgsProduct({ benchmark::CreateRange(10, vec_size/ratio, 10),{ratio} })->Setup(putln<__LINE__>);\
BENCHMARK(BM_test<T, source, PushEach>)->ArgsProduct({ benchmark::CreateRange(10, vec_size/ratio, 10),{ratio} })->Setup(putvs<__LINE__>);
ADD_BENCHMARK(uint8_t, vec_u8);
ADD_BENCHMARK(uint16_t, vec_u16);
ADD_BENCHMARK(uint32_t, vec_u32);
ADD_BENCHMARK(uint64_t, vec_u64);
ADD_BENCHMARK(float, vec_float);
ADD_BENCHMARK(double, vec_double);
ADD_BENCHMARK(string_view, vec_str);
ADD_BENCHMARK(string, vec_str);
ADD_BENCHMARK(wstring_view, vec_wstr);
ADD_BENCHMARK(wstring, vec_wstr);
ADD_BENCHMARK(keypair, vec_keypair);
ADD_BENCHMARK(keyobj, vec_keyobj);
BENCHMARK_MAIN();

Result
If we change the `BM_test` part to (replacing BM_test and ADD_BENCHMARK):

template <class T, const auto& Data, bool Remake>
void BM_test(benchmark::State& state) {
const size_t vsize = static_cast<size_t>(state.range(0));
for (auto _ : state) {
span spn(Data);
assert(spn.size() >= vsize);
state.PauseTiming();
vector<T> c(from_range, spn.subspan(0, vsize));
state.ResumeTiming();
if constexpr (Remake) {
std::make_heap(c.begin(), c.end());
}
else {
const auto _Begin = _Get_unwrapped(c.begin());
auto _Heap_end = _Begin;
const auto _End = _Get_unwrapped(c.end());
while (_Heap_end != _End) {
std::push_heap(_Begin, ++_Heap_end);
}
}
benchmark::DoNotOptimize(c);
}
}
}
enum :bool { Remake = true, PushEach = false };
#define ADD_BENCHMARK(T,source) \
BENCHMARK(BM_test<T, source, Remake>)->ArgsProduct({ benchmark::CreateRange(10, vec_size, 10) })->Setup(putln<__LINE__>);\
BENCHMARK(BM_test<T, source, PushEach>)->ArgsProduct({ benchmark::CreateRange(10, vec_size, 10) })->Setup(putvs<__LINE__>); Result
It compares remaking the whole heap with |
->Setup(putln<__LINE__>) \ | ||
->RangeMultiplier(100) \ | ||
->Range(1, vec_size) \ | ||
->Arg(vec_size / 2 + 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
stl/inc/xutility
Outdated
@@ -541,7 +541,8 @@ struct less_equal<void> { | |||
template <class _Fx> | |||
struct _Ref_fn { // pass function object by value as a reference | |||
template <class... _Args> | |||
constexpr decltype(auto) operator()(_Args&&... _Vals) { // forward function call operator | |||
constexpr decltype(auto) operator()(_Args&&... _Vals) noexcept( | |||
is_nothrow_invocable_v<_Fx&, _Args&&...>) { // forward function call operator |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can use _Select_invoke_traits<_Fx&, _Args...>::_Is_nothrow_invocable::value
(&&
can be omitted as it is added by default).
Line 1910 in d8ed02d
inline constexpr bool is_nothrow_invocable_v = _Select_invoke_traits<_Callable, _Args...>::_Is_nothrow_invocable::value; |
7962c18
to
b676d18
Compare
constexpr decltype(auto) operator()(_Args&&... _Vals) { // forward function call operator | ||
constexpr decltype(auto) operator()(_Args&&... _Vals) noexcept( | ||
_Select_invoke_traits<_Fx&, _Args...>::_Is_nothrow_invocable::value) { // forward function call operator |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No change required - I don't have data to back up the claim - but there's a lot of expensive SFINAE in _Select_invoke_traits
that _Ref_fn
doesn't need. We may be better off with:
template <class _Fx, bool = is_member_pointer_v<_Fx>>
struct _Ref_fn { // pass function object by value as a reference
template <class... _Args>
constexpr decltype(auto) operator()(_Args&&... _Vals) noexcept(
noexcept(_STD invoke(_Fn, _STD forward<_Args>(_Vals)...))) {
return _STD invoke(_Fn, _STD forward<_Args>(_Vals)...);
}
_Fx& _Fn;
};
template <class _Fx>
struct _Ref_fn<_Fx, false> {
template <class... _Args>
constexpr decltype(auto) operator()(_Args&&... _Vals) noexcept(noexcept(_Fn(_STD forward<_Args>(_Vals)...))) {
return _Fn(_STD forward<_Args>(_Vals)...);
}
_Fx& _Fn;
};
I believe that this change is sufficiently safe, and the new benchmark provides justification for making the change, that we can merge this without a second maintainer approval. |
I'm mirroring this to the MSVC-internal repo - please notify me if any further changes are pushed. |
Thanks for investigating and improving this performance with an added benchmark! 🎉 🚀 ⏱️ |
Fixes #2826.
Currently we unconditionally call `make_heap` — we will get a great performance penalty if we do `push_heap` in small batches. For those types, we can unconditionally do a `push_heap` loop instead. However, for some trivial types (especially scalars) we will really get better performance if we call `make_heap`. This PR uses `new_size/2 > old_size` as the boundary to prefer `make_heap` to a `push_heap` loop. This will completely avoid the problem in 1.

About `new_size/2 > old_size`: `make_heap(size 2*o)` does `o` `_Pop_heap_hole_by_index` calls, and each `_Pop_heap_hole_by_index` calls a `_Push_heap_by_index`, so I think it's safe to assume doing an `o`-iteration `push_heap` loop can be faster before/around this boundary.

The following benchmark shows a generally better result, including those at the boundary (`Arg(vec_size / 2 + 1)`; 5001). However, I'm not sure what is going on in `u32`/`u64`/`float`'s `Arg(vec_size / 2 + 1)` case (in which case remaking the heap is actually faster). (This does not necessarily mean we have to set a smaller ratio boundary, as it looks size/type/sequence-sensitive. See the update part below; for more extensive benchmarks see the next comment.)

Benchmark result (`vec_size==10000`):

Previous

Now
UPDATE: The new approach is actually faster around the boundary (`Arg(vec_size / 2 + 1)`; 4001, 6001) when `vec_size==8000 or 12000`:

When `vec_size==8000`:

Previous

Now

When `vec_size==12000`:

Previous

Now