Skip to content

Commit

Permalink
x86_64: Use xchg instead of lock or
Browse files Browse the repository at this point in the history
Follow-up to #156.
  • Loading branch information
taiki-e committed Jul 19, 2024
1 parent 86cee8f commit 0483042
Showing 1 changed file with 11 additions and 23 deletions.
34 changes: 11 additions & 23 deletions src/imp/atomic128/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,23 +68,6 @@ macro_rules! ptr_modifier {
};
}

// `sp!()` for targets with 32-bit pointers: expands to the string literal
// "esp" so that it can be spliced into an `asm!` template with `concat!`,
// e.g. `concat!("lock or dword ptr [", sp!(), "], 0")`.
//
// Only defined when the `portable_atomic_no_outline_atomics` cfg is not set,
// `target_env` is not "sgx", and the `sse` target feature is enabled.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! sp {
() => {
// Must stay a bare string literal: the expansion is consumed by `concat!`.
"esp"
};
}
// `sp!()` for targets with 64-bit pointers: expands to the string literal
// "rsp" so that it can be spliced into an `asm!` template with `concat!`,
// e.g. `concat!("lock or dword ptr [", sp!(), "], 0")`.
//
// Only defined when the `portable_atomic_no_outline_atomics` cfg is not set,
// `target_env` is not "sgx", and the `sse` target feature is enabled.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! sp {
() => {
// Must stay a bare string literal: the expansion is consumed by `concat!`.
"rsp"
};
}

// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements for the currently enabled target features. In the first place,
// there is no option in x86 assembly for such a case, like ARM's .arch_extension,
Expand Down Expand Up @@ -200,16 +183,21 @@ unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
);
}
Ordering::SeqCst => {
let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
asm!(
concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
// Equivalent to mfence, but is 10-35% faster at least in simple cases on Coffee Lake (https://github.com/taiki-e/portable-atomic/pull/156).
// Based on x86_32 64-bit atomic SeqCst store using SSE generated by LLVM.
// https://godbolt.org/z/9sKEr8YWc
concat!("lock or dword ptr [", sp!(), "], 0"),
// Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
// - https://github.com/taiki-e/portable-atomic/pull/156
// - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
// - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
// - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
// - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
dst = in(reg) dst,
val = in(xmm_reg) val,
// Do not use `preserves_flags` because OR modifies the OF, CF, SF, ZF, and PF flags.
options(nostack),
p = inout(reg) p.get() => _,
tmp = lateout(reg) _,
options(nostack, preserves_flags),
);
}
_ => unreachable!(),
Expand Down

0 comments on commit 0483042

Please sign in to comment.