Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Help the compiler vectorize std::iota #4627

Merged
merged 12 commits into from
Apr 27, 2024
Merged
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ endfunction()
# Benchmark registrations: one add_benchmark(<name> <source file>) call per benchmark,
# listed in alphabetical order by name.
# NOTE(review): add_benchmark is defined above this excerpt; presumably it creates one
# benchmark target per call - confirm against its definition.
add_benchmark(bitset_to_string src/bitset_to_string.cpp)
add_benchmark(find_and_count src/find_and_count.cpp)
add_benchmark(find_first_of src/find_first_of.cpp)
add_benchmark(iota src/iota.cpp)
add_benchmark(locale_classic src/locale_classic.cpp)
add_benchmark(minmax_element src/minmax_element.cpp)
add_benchmark(mismatch src/mismatch.cpp)
Expand Down
27 changes: 27 additions & 0 deletions benchmarks/src/iota.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved

// Benchmarks std::iota filling a std::vector<T>.
//
// T     - element type of the vector being filled.
// state - benchmark state; its first range argument, state.range(0), is the
//         number of elements in the vector.
template <class T>
void bm(benchmark::State& state) {
    // Spell the type as std::size_t: <cstddef> guarantees the std:: name, whereas
    // the unqualified global name is only available by courtesy of other headers.
    const auto size = static_cast<std::size_t>(state.range(0));

    // Allocate once, outside the timed loop, so only the fill itself is measured.
    std::vector<T> a(size);

    for (auto _ : state) {
        std::iota(a.begin(), a.end(), T{22});
        // Keep the filled vector observable so the compiler cannot discard the fill.
        benchmark::DoNotOptimize(a);
    }
}

// Applies the shared set of element counts to a benchmark registration.
// Each Arg() call is made on the pointer returned by the previous one,
// in the order the counts are listed.
void common_args(auto bm) {
    constexpr int element_counts[] = {7, 18, 43, 131, 315, 1212};

    for (const int count : element_counts) {
        bm = bm->Arg(count);
    }
}

// Register bm for 32-bit and 64-bit unsigned element types; common_args supplies
// the same list of element counts to both registrations.
BENCHMARK(bm<std::uint32_t>)->Apply(common_args);
BENCHMARK(bm<std::uint64_t>)->Apply(common_args);

// Entry point comes from the benchmark library's macro (see the Google Benchmark
// documentation for BENCHMARK_MAIN); this file defines no main() of its own.
BENCHMARK_MAIN();
30 changes: 30 additions & 0 deletions stl/inc/numeric
Original file line number Diff line number Diff line change
Expand Up @@ -511,12 +511,42 @@ _FwdIt2 adjacent_difference(_ExPo&& _Exec, const _FwdIt1 _First, const _FwdIt1 _
}
#endif // _HAS_CXX17

// Reports whether an index of type _Ty could wrap around before it has counted _Size elements.
// iota's indexed fast path only runs when this returns false.
//
// _Ty   - type of the loop index (the same as iota's value type).
// _Size - number of elements in the range.
// Returns true only for an unsigned _Ty that is narrower than size_t and whose maximum value is
// smaller than _Size; returns false in every other case.
template <class _Ty>
constexpr bool _Iota_optimization_can_overflow(const size_t _Size) {
    if constexpr (is_unsigned_v<_Ty>) {
        // _Ty overflow is defined, _Size might not fit _Ty
        if constexpr (sizeof(_Ty) >= sizeof(size_t)) {
            return false; // any size fits type
        } else {
            // static_cast<_Ty>(-1) is the largest _Ty value for every width. Do not spell it as
            // ~_Ty{0}: for _Ty narrower than int the operand is promoted first, so the result is
            // int(-1), which becomes SIZE_MAX in the comparison and makes it always false.
            return _Size > static_cast<size_t>(static_cast<_Ty>(-1));
        }
    } else {
        return false; // Signed _Ty overflow is UB, size should fit _Ty range
    }
}

_EXPORT_STD template <class _FwdIt, class _Ty>
_CONSTEXPR20 void iota(_FwdIt _First, _FwdIt _Last, _Ty _Val) {
// compute increasing sequence into [_First, _Last)
_STD _Adl_verify_range(_First, _Last);
auto _UFirst = _STD _Get_unwrapped(_First);
const auto _ULast = _STD _Get_unwrapped(_Last);

if constexpr (_Iterator_is_contiguous<decltype(_UFirst)> && is_integral_v<_Ty> && sizeof(_Ty) >= 4) {
// TRANSITION, DevCom-10593477: help the compiler vectorize
const auto _Ptr = _To_address(_UFirst);
const auto _Size = static_cast<size_t>(_ULast - _UFirst);

if (!_Iota_optimization_can_overflow<_Ty>(_Size)) {
const auto _Size_typed = static_cast<_Ty>(_Size);
for (_Ty _Ix = 0; _Ix != _Size_typed; ++_Ix) {
_Ptr[_Ix] = _Val + _Ix;
}

return;
}
}

for (; _UFirst != _ULast; ++_UFirst, (void) ++_Val) {
*_UFirst = _Val;
}
Expand Down