Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always use cmpxchg16b on x64 for atomic_ref<16 bytes> #4751

Merged
merged 20 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
be1dabe
Simplify `__std_atomic_has_cmpxchg16b`: cmpxchg16b is always available on x64
StephanTLavavej Jun 16, 2024
b16508d
Simplify `__std_atomic_compare_exchange_128` accordingly.
StephanTLavavej Jun 16, 2024
bad4814
`__std_atomic_compare_exchange_128` and `__std_atomic_has_cmpxchg16b`…
StephanTLavavej Jun 16, 2024
7bf2aa7
Preserve `__std_atomic_compare_exchange_128` for bincompat.
StephanTLavavej Jun 16, 2024
e3a9a91
Replace the `_STD_COMPARE_EXCHANGE_128` macro with `_InterlockedCompareExchange128`
StephanTLavavej Jun 16, 2024
60798a3
Allow Clang x64 to use `_InterlockedCompareExchange128`.
StephanTLavavej Jun 16, 2024
c7261ef
Simplify the "future ABI" `atomic<T>::is_lock_free()`.
StephanTLavavej Jun 16, 2024
72349d3
Major style: Add parens when mixing arithmetic and bitwise operators …
StephanTLavavej Jun 16, 2024
6411c97
Minor style: Simplify `atomic<T>::is_lock_free()`.
StephanTLavavej Jun 16, 2024
b855bb9
Enhance the "future ABI" `_Is_always_lock_free` (aka `atomic<T>::is_always_lock_free`)
StephanTLavavej Jun 16, 2024
ea085c5
Centralize `atomic<T>::is_lock_free()` and `_Is_always_lock_free`.
StephanTLavavej Jun 16, 2024
c468e9e
`atomic_ref::is_lock_free()`: Simplify.
StephanTLavavej Jun 16, 2024
3861e48
`atomic_ref::is_lock_free()`: Further simplify.
StephanTLavavej Jun 16, 2024
0e17494
Enhance `atomic_ref::is_always_lock_free`.
StephanTLavavej Jun 16, 2024
fe82b0c
Collapse `atomic_ref` `is_always_lock_free` and `is_lock_free()`.
StephanTLavavej Jun 16, 2024
0c46d60
`_ATOMIC_HAS_DCAS` is now unused.
StephanTLavavej Jun 16, 2024
ba77cd1
Preserve `__std_atomic_has_cmpxchg16b` for bincompat.
StephanTLavavej Jun 16, 2024
96d86e7
Drop `_STD_ATOMIC_ALWAYS_USE_CMPXCHG16B` and simplify test coverage.
StephanTLavavej Jun 16, 2024
537bc95
`_STL_WIN32_WINNT_WINBLUE` is now unused.
StephanTLavavej Jun 25, 2024
e13d679
Silence unreferenced parameter warnings for 32-bit.
StephanTLavavej Jun 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 16 additions & 68 deletions stl/inc/atomic
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,10 @@ _STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

#ifdef _WIN64
#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1
#define _STD_COMPARE_EXCHANGE_128 _InterlockedCompareExchange128
#else // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 / _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 vvv
// 16-byte atomics are separately compiled for x64, as not all x64 hardware has the cmpxchg16b
// instruction; in the event this instruction is not available, the fallback is a global
// synchronization object shared by all 16-byte atomics.
// (Note: machines without this instruction typically have 2 cores or fewer, so this isn't too bad)
// All pointer parameters must be 16-byte aligned.
extern "C" _NODISCARD unsigned char __stdcall __std_atomic_compare_exchange_128(
_Inout_bytecount_(16) long long* _Destination, _In_ long long _ExchangeHigh, _In_ long long _ExchangeLow,
_Inout_bytecount_(16) long long* _ComparandResult) noexcept;
extern "C" _NODISCARD char __stdcall __std_atomic_has_cmpxchg16b() noexcept;
#define _STD_COMPARE_EXCHANGE_128 __std_atomic_compare_exchange_128
#endif // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 ^^^
#endif // defined(_WIN64)

// Controls whether atomic::is_always_lock_free triggers for sizeof(void *) or 2 * sizeof(void *)
#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 || !defined(_M_X64) || defined(_M_ARM64EC)
#define _ATOMIC_HAS_DCAS 1
#else // ^^^ We always have DCAS / We only sometimes have DCAS vvv
#define _ATOMIC_HAS_DCAS 0
#endif // ^^^ We only sometimes have DCAS ^^^
// Allow _InterlockedCompareExchange128 to be used:
#if defined(__clang__) && defined(_M_X64)
#pragma clang attribute _STD_ATOMIC_HEADER.push([[gnu::target("cx16")]], apply_to = function)
#endif // ^^^ defined(__clang__) && defined(_M_X64) ^^^

// Controls whether ARM64 ldar/ldapr/stlr should be used
#ifndef _STD_ATOMIC_USE_ARM64_LDAR_STLR
Expand Down Expand Up @@ -603,7 +584,7 @@ inline bool __stdcall _Atomic_wait_compare_16_bytes(const void* _Storage, void*
const auto _Cmp = static_cast<const long long*>(_Comparand);
alignas(16) long long _Tmp[2] = {_Cmp[0], _Cmp[1]};
#if defined(_M_X64) && !defined(_M_ARM64EC)
return _STD_COMPARE_EXCHANGE_128(_Dest, _Tmp[1], _Tmp[0], _Tmp) != 0;
return _InterlockedCompareExchange128(_Dest, _Tmp[1], _Tmp[0], _Tmp) != 0;
#else // ^^^ _M_X64 / ARM64, _M_ARM64EC vvv
return _InterlockedCompareExchange128_nf(_Dest, _Tmp[1], _Tmp[0], _Tmp) != 0;
#endif // ^^^ ARM64, _M_ARM64EC ^^^
Expand Down Expand Up @@ -1199,7 +1180,7 @@ struct _Atomic_storage<_Ty&, 16> { // lock-free using 16-byte intrinsics
_NODISCARD _TVal load() const noexcept { // load with sequential consistency
long long* const _Storage_ptr = const_cast<long long*>(_STD _Atomic_address_as<const long long>(_Storage));
_Int128 _Result{}; // atomic CAS 0 with 0
(void) _STD_COMPARE_EXCHANGE_128(_Storage_ptr, 0, 0, &_Result._Low);
(void) _InterlockedCompareExchange128(_Storage_ptr, 0, 0, &_Result._Low);
return reinterpret_cast<_TVal&>(_Result);
}

Expand Down Expand Up @@ -1270,7 +1251,7 @@ struct _Atomic_storage<_Ty&, 16> { // lock-free using 16-byte intrinsics
&_Expected_temp._Low);
#else // ^^^ _M_ARM64, _M_ARM64EC / _M_X64 vvv
(void) _Order;
_Result = _STD_COMPARE_EXCHANGE_128(&reinterpret_cast<long long&>(_Storage), _Desired_bytes._High,
_Result = _InterlockedCompareExchange128(&reinterpret_cast<long long&>(_Storage), _Desired_bytes._High,
_Desired_bytes._Low, &_Expected_temp._Low);
#endif // ^^^ _M_X64 ^^^
if (_Result) {
Expand All @@ -1296,7 +1277,7 @@ struct _Atomic_storage<_Ty&, 16> { // lock-free using 16-byte intrinsics
&_Expected_temp._Low);
#else // ^^^ _M_ARM64, _M_ARM64EC / _M_X64 vvv
(void) _Order;
_Result = _STD_COMPARE_EXCHANGE_128(
_Result = _InterlockedCompareExchange128(
&reinterpret_cast<long long&>(_Storage), _Desired_bytes._High, _Desired_bytes._Low, &_Expected_temp._Low);
#endif // ^^^ _M_X64 ^^^
if (_Result == 0) {
Expand Down Expand Up @@ -1649,13 +1630,8 @@ struct _Atomic_integral<_Ty, 8> : _Atomic_storage<_Ty> { // atomic integral oper
template <size_t _TypeSize>
constexpr bool _Is_always_lock_free = _TypeSize <= 8 && (_TypeSize & (_TypeSize - 1)) == 0;
#else // ^^^ don't break ABI / break ABI vvv
#if _ATOMIC_HAS_DCAS
template <size_t _TypeSize>
constexpr bool _Is_always_lock_free = _TypeSize <= 2 * sizeof(void*);
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
template <size_t _TypeSize>
constexpr bool _Is_always_lock_free = _TypeSize <= sizeof(void*);
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
#endif // ^^^ break ABI ^^^

template <class _Ty, bool _Is_lock_free = _Is_always_lock_free<sizeof(_Ty)>>
Expand Down Expand Up @@ -2175,23 +2151,10 @@ public:
static constexpr bool is_always_lock_free = _Is_always_lock_free<sizeof(_Ty)>;
#endif // _HAS_CXX17

#if 1 // TRANSITION, ABI
_NODISCARD bool is_lock_free() const volatile noexcept {
constexpr bool _Result = sizeof(_Ty) <= 8 && (sizeof(_Ty) & sizeof(_Ty) - 1) == 0;
return _Result;
return _Is_always_lock_free<sizeof(_Ty)>;
}

#else // ^^^ don't break ABI / break ABI vvv

_NODISCARD bool is_lock_free() const volatile noexcept {
#if _ATOMIC_HAS_DCAS
return sizeof(_Ty) <= 2 * sizeof(void*);
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
return sizeof(_Ty) <= sizeof(void*) || (sizeof(_Ty) <= 2 * sizeof(void*) && __std_atomic_has_cmpxchg16b());
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
}
#endif // ^^^ break ABI ^^^

_NODISCARD bool is_lock_free() const noexcept {
return static_cast<const volatile atomic*>(this)->is_lock_free();
}
Expand Down Expand Up @@ -2341,7 +2304,7 @@ public:
using value_type = _Ty;

explicit atomic_ref(_Ty& _Value) noexcept /* strengthened */ : _Base(_Value) {
if constexpr (_Is_potentially_lock_free) {
if constexpr (is_always_lock_free) {
_Check_alignment(_Value);
} else {
this->_Init_spinlock_for_ref();
Expand All @@ -2352,30 +2315,13 @@ public:

atomic_ref& operator=(const atomic_ref&) = delete;

static constexpr bool _Is_potentially_lock_free =
sizeof(_Ty) <= 2 * sizeof(void*) && (sizeof(_Ty) & (sizeof(_Ty) - 1)) == 0;

static constexpr bool is_always_lock_free =
#if _ATOMIC_HAS_DCAS
_Is_potentially_lock_free;
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
_Is_potentially_lock_free && sizeof(_Ty) <= sizeof(void*);
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
sizeof(_Ty) <= 2 * sizeof(void*) && (sizeof(_Ty) & (sizeof(_Ty) - 1)) == 0;

static constexpr size_t required_alignment = _Is_potentially_lock_free ? sizeof(_Ty) : alignof(_Ty);
static constexpr size_t required_alignment = is_always_lock_free ? sizeof(_Ty) : alignof(_Ty);

_NODISCARD bool is_lock_free() const noexcept {
#if _ATOMIC_HAS_DCAS
return is_always_lock_free;
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
if constexpr (is_always_lock_free) {
return true;
} else if constexpr (_Is_potentially_lock_free) {
return __std_atomic_has_cmpxchg16b() != 0;
} else {
return false;
}
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
}

void store(const _Ty _Value) const noexcept {
Expand Down Expand Up @@ -3049,16 +2995,18 @@ _STD_END
#undef _ATOMIC_STORE_32_SEQ_CST
#undef _ATOMIC_STORE_64_SEQ_CST
#undef _ATOMIC_STORE_64_SEQ_CST_IX86
#undef _ATOMIC_HAS_DCAS
#undef _ATOMIC_STORE_SEQ_CST_ARM64
#undef __LOAD_ACQUIRE_ARM64
#undef _ATOMIC_LOAD_ARM64
#undef __STORE_RELEASE
#undef _STD_ATOMIC_USE_ARM64_LDAR_STLR

#undef _STD_COMPARE_EXCHANGE_128
#undef _INVALID_MEMORY_ORDER

#if defined(__clang__) && defined(_M_X64)
#pragma clang attribute _STD_ATOMIC_HEADER.pop
#endif // ^^^ defined(__clang__) && defined(_M_X64) ^^^

#pragma pop_macro("new")
_STL_RESTORE_CLANG_WARNINGS
#pragma warning(pop)
Expand Down
16 changes: 0 additions & 16 deletions stl/inc/yvals.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,22 +315,6 @@ _EMIT_STL_WARNING(STL4001, "/clr:pure is deprecated and will be REMOVED.");
#define _LOCK_STREAM 2
#define _LOCK_DEBUG 3

#ifndef _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B
#if _STL_WIN32_WINNT >= _STL_WIN32_WINNT_WINBLUE && defined(_WIN64)
#define _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B 1
#else // ^^^ modern 64-bit / less modern or 32-bit vvv
#define _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B 0
#endif // _STL_WIN32_WINNT >= _STL_WIN32_WINNT_WINBLUE && defined(_WIN64)
#endif // !defined(_STD_ATOMIC_ALWAYS_USE_CMPXCHG16B)

#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 && defined(_M_ARM64)
#error ARM64 requires _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B to be 1.
#endif // _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 && defined(_M_ARM64)

#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 && !defined(_WIN64)
#error _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 requires 64-bit.
#endif // _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 && !defined(_WIN64)

_STD_BEGIN
enum _Uninitialized { // tag for suppressing initialization
_Noinit
Expand Down
7 changes: 3 additions & 4 deletions stl/inc/yvals_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -1941,10 +1941,9 @@ compiler option, or define _ALLOW_RTCc_IN_STL to suppress this error.
#error In yvals_core.h, defined(MRTDLL) implies defined(_M_CEE_PURE); !defined(_M_CEE_PURE) implies !defined(MRTDLL)
#endif // defined(MRTDLL) && !defined(_M_CEE_PURE)

#define _STL_WIN32_WINNT_VISTA 0x0600 // _WIN32_WINNT_VISTA from sdkddkver.h
#define _STL_WIN32_WINNT_WIN8 0x0602 // _WIN32_WINNT_WIN8 from sdkddkver.h
#define _STL_WIN32_WINNT_WINBLUE 0x0603 // _WIN32_WINNT_WINBLUE from sdkddkver.h
#define _STL_WIN32_WINNT_WIN10 0x0A00 // _WIN32_WINNT_WIN10 from sdkddkver.h
#define _STL_WIN32_WINNT_VISTA 0x0600 // _WIN32_WINNT_VISTA from sdkddkver.h
#define _STL_WIN32_WINNT_WIN8 0x0602 // _WIN32_WINNT_WIN8 from sdkddkver.h
#define _STL_WIN32_WINNT_WIN10 0x0A00 // _WIN32_WINNT_WIN10 from sdkddkver.h

// Note that the STL DLL builds will set this to XP for ABI compatibility with VS2015 which supported XP.
#ifndef _STL_WIN32_WINNT
Expand Down
61 changes: 15 additions & 46 deletions stl/src/atomic_wait.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <new>
#include <thread>

Expand Down Expand Up @@ -90,24 +91,6 @@ namespace {
}
#endif // defined(_DEBUG)
}

[[nodiscard]] unsigned char __std_atomic_compare_exchange_128_fallback(
_Inout_bytecount_(16) long long* _Destination, _In_ long long _ExchangeHigh, _In_ long long _ExchangeLow,
_Inout_bytecount_(16) long long* _ComparandResult) noexcept {
static SRWLOCK _Mtx = SRWLOCK_INIT;
_SrwLock_guard _Guard{_Mtx};
if (_Destination[0] == _ComparandResult[0] && _Destination[1] == _ComparandResult[1]) {
_ComparandResult[0] = _Destination[0];
_ComparandResult[1] = _Destination[1];
_Destination[0] = _ExchangeLow;
_Destination[1] = _ExchangeHigh;
return static_cast<unsigned char>(true);
} else {
_ComparandResult[0] = _Destination[0];
_ComparandResult[1] = _Destination[1];
return static_cast<unsigned char>(false);
}
}
} // unnamed namespace

extern "C" {
Expand Down Expand Up @@ -249,41 +232,27 @@ _Smtx_t* __stdcall __std_atomic_get_mutex(const void* const _Key) noexcept {
}
#pragma warning(pop)

// TRANSITION, ABI: preserved for binary compatibility
[[nodiscard]] unsigned char __stdcall __std_atomic_compare_exchange_128(_Inout_bytecount_(16) long long* _Destination,
_In_ long long _ExchangeHigh, _In_ long long _ExchangeLow,
_Inout_bytecount_(16) long long* _ComparandResult) noexcept {
#if !defined(_WIN64)
return __std_atomic_compare_exchange_128_fallback(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
#elif _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1
#ifdef _WIN64
return _InterlockedCompareExchange128(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
#else // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 / _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 vvv
if (__std_atomic_has_cmpxchg16b()) {
return _InterlockedCompareExchange128(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
}

return __std_atomic_compare_exchange_128_fallback(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
#endif // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 ^^^
#else // ^^^ 64-bit / 32-bit vvv
(void) _Destination;
(void) _ExchangeHigh;
(void) _ExchangeLow;
(void) _ComparandResult;
_CSTD abort();
#endif // ^^^ 32-bit ^^^
}

// TRANSITION, ABI: preserved for binary compatibility
[[nodiscard]] char __stdcall __std_atomic_has_cmpxchg16b() noexcept {
#if !defined(_WIN64)
return false;
#elif _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1
#ifdef _WIN64
return true;
#else // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 / _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 vvv
constexpr char _Cmpxchg_Absent = 0;
constexpr char _Cmpxchg_Present = 1;
constexpr char _Cmpxchg_Unknown = 2;

static std::atomic<char> _Cached_value{_Cmpxchg_Unknown};

char _Value = _Cached_value.load(std::memory_order_relaxed);
if (_Value == _Cmpxchg_Unknown) {
_Value = IsProcessorFeaturePresent(PF_COMPARE_EXCHANGE128) ? _Cmpxchg_Present : _Cmpxchg_Absent;
_Cached_value.store(_Value, std::memory_order_relaxed);
}

return _Value;
#endif // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 ^^^
#else
_CSTD abort();
#endif
}
} // extern "C"
3 changes: 0 additions & 3 deletions tests/std/tests/P0019R8_atomic_ref/env.lst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,3 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

RUNALL_INCLUDE ..\usual_20_matrix.lst
RUNALL_CROSSLIST
* PM_CL=""
* PM_CL="/D_STD_ATOMIC_ALWAYS_USE_CMPXCHG16B=1 /DTEST_CMPXCHG16B"
13 changes: 0 additions & 13 deletions tests/std/tests/P0019R8_atomic_ref/test.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#if defined(TEST_CMPXCHG16B) && (defined(__clang__) || !defined(_M_X64))
// Skip Clang because it would require the -mcx16 compiler option.
// Skip non-x64 because _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B is always 1 for ARM64, and is forbidden to be 1 for 32-bit.
int main() {}
#else // ^^^ skip test / run test vvv

#include <atomic>
#include <cassert>
#include <cstddef>
Expand Down Expand Up @@ -436,13 +430,8 @@ void test_gh_4472() {

static_assert(std::atomic_ref<two_pointers_t>::required_alignment == sizeof(two_pointers_t));

#ifdef _WIN64
static_assert(std::atomic_ref<two_pointers_t>::is_always_lock_free == _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B);
#else
static_assert(std::atomic_ref<two_pointers_t>::is_always_lock_free);
#endif

// We expect tests to run on machines that support DCAS, which is required by Win8+.
std::atomic_ref<two_pointers_t> ar{two_pointers};
assert(ar.is_lock_free());
}
Expand Down Expand Up @@ -518,5 +507,3 @@ int main() {
test_gh_4472();
test_gh_4728();
}

#endif // ^^^ run test ^^^