Skip to content

Commit

Permalink
aarch64: Use CASP instead of LDXP/STXP for load if available
Browse files Browse the repository at this point in the history
  • Loading branch information
taiki-e committed Aug 3, 2022
1 parent 5976f3c commit 1eb3f5d
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 10 deletions.
33 changes: 30 additions & 3 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ env:
RUSTFLAGS: -D warnings
RUSTUP_MAX_RETRIES: '10'

aarch64_linux_task:
aarch64_linux_test_task:
name: test (aarch64-unknown-linux-gnu)
env:
TARGET: aarch64-unknown-linux-gnu
Expand All @@ -28,7 +28,7 @@ aarch64_linux_task:
- RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET
# TODO: lse2 is not available on Graviton2 (armv8.2-a)

aarch64_macos_task:
aarch64_macos_test_task:
name: test (aarch64-apple-darwin)
env:
TARGET: aarch64-apple-darwin
Expand All @@ -43,7 +43,7 @@ aarch64_macos_task:
# Use -Z build-std because the prebuilt libtest seems to be incompatible with LTO, causing miscompilation: https://gist.github.com/taiki-e/9713f8e02e8f9f852ccee8d6f089ec24
- CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET

valgrind_task:
aarch64_linux_valgrind_task:
name: valgrind (aarch64-unknown-linux-gnu)
env:
CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER: valgrind -v --error-exitcode=1 --error-limit=no --leak-check=full --show-leak-kinds=all --track-origins=yes
Expand Down Expand Up @@ -80,3 +80,30 @@ valgrind_task:
# Use -Z build-std because the prebuilt libtest seems to be incompatible with LTO, causing miscompilation: https://gist.github.com/taiki-e/9713f8e02e8f9f852ccee8d6f089ec24
- RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET
# TODO: lse2 is not available on Graviton2 (armv8.2-a)

# aarch64_linux_bench_task:
# name: bench (aarch64-unknown-linux-gnu)
# env:
# TARGET: aarch64-unknown-linux-gnu
# arm_container:
# image: rust:latest
# cpu: 4
# memory: 12G
# setup_script:
# - rustup toolchain add nightly && rustup default nightly
# test_script:
# - cargo bench -vv --manifest-path bench/Cargo.toml
# - RUSTFLAGS="${RUSTFLAGS} -C target-feature=+lse" cargo bench -vv --manifest-path bench/Cargo.toml

# aarch64_macos_bench_task:
# name: bench (aarch64-apple-darwin)
# env:
# TARGET: aarch64-apple-darwin
# macos_instance:
# image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
# setup_script:
# - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain nightly --component rust-src
# test_script:
# - sysctl -a | grep machdep.cpu
# - source $HOME/.cargo/env
# - cargo bench -vv --manifest-path bench/Cargo.toml
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Note: In this file, do not use the hard wrap in the middle of a sentence for com

## [Unreleased]

- Optimize aarch64 128-bit load. ([#20](https://github.com/taiki-e/portable-atomic/pull/20))

## [0.3.9] - 2022-08-03

- Fix build error on old Miri.
Expand Down
24 changes: 24 additions & 0 deletions bench/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,27 @@ fn bench_concurrent_store_swap<A: AtomicInt<T>, T: Copy + From<u32>>() -> A {
});
a
}
/// Benchmarks contended `fetch_add` on a single shared atomic.
///
/// Spawns `THREADS * 2` scoped threads that all wait on a common barrier
/// before doing any work. Half of them add the values `0..N` in ascending
/// order and the other half add the same values in descending order.
/// The atomic is returned to the caller once every thread has finished.
fn bench_concurrent_fetch_add<A: AtomicInt<T>, T: Copy + From<u32>>() -> A {
    let a = black_box(A::new(T::from(1)));
    // One barrier slot per spawned thread: two threads per loop iteration below.
    let start_gate = Barrier::new(THREADS * 2);
    thread::scope(|scope| {
        for _ in 0..THREADS {
            // Ascending adder.
            scope.spawn(|| {
                start_gate.wait();
                (0..N).for_each(|i| {
                    let _ = black_box(a.fetch_add(T::from(i)));
                });
            });
            // Descending adder.
            scope.spawn(|| {
                start_gate.wait();
                (0..N).rev().for_each(|i| {
                    let _ = black_box(a.fetch_add(T::from(i)));
                });
            });
        }
    });
    a
}

macro_rules! benches {
($name:ident, $atomic_u128:path) => {
Expand Down Expand Up @@ -273,6 +294,9 @@ macro_rules! benches {
g.bench_function("u128_concurrent_store_swap", |b| {
b.iter(bench_concurrent_store_swap::<A, u128>);
});
g.bench_function("u128_concurrent_fetch_add", |b| {
b.iter(bench_concurrent_fetch_add::<A, u128>);
});
}
};
}
Expand Down
24 changes: 17 additions & 7 deletions src/imp/atomic128/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
// - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit
//
// Generated asm:
// - aarch64 https://godbolt.org/z/x6qMff94K
// - aarch64 (+lse) https://godbolt.org/z/9MsnrhEKr
// - aarch64 (+lse,+lse2) https://godbolt.org/z/orhr74bqT
// - aarch64 https://godbolt.org/z/85YK7K7Ye
// - aarch64 (+lse) https://godbolt.org/z/neWhfadn4
// - aarch64 (+lse,+lse2) https://godbolt.org/z/nGazzqx9e

include!("macros.rs");

Expand Down Expand Up @@ -196,14 +196,16 @@ unsafe fn _casp(dst: *mut u128, old: u128, new: u128, order: Ordering) -> u128 {
}
}

// If the CPU supports FEAT_LSE2, LDP is a single-copy atomic read;
// otherwise it is performed as two separate single-copy atomic reads.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2", test))]
#[inline]
unsafe fn _ldp(src: *mut u128, order: Ordering) -> u128 {
debug_assert!(src as usize % 16 == 0);

// SAFETY: the caller must guarantee that `src` is valid for reads,
// 16-byte aligned, that there are no concurrent non-atomic operations,
// and the CPU supports FEAT_LSE2.
// 16-byte aligned, that there are no concurrent non-atomic operations.
//
// Refs:
// - LDP: https://developer.arm.com/documentation/dui0801/g/A64-Data-Transfer-Instructions/LDP
Expand Down Expand Up @@ -231,14 +233,16 @@ unsafe fn _ldp(src: *mut u128, order: Ordering) -> u128 {
}
}

// If the CPU supports FEAT_LSE2, STP is a single-copy atomic write;
// otherwise it is performed as two separate single-copy atomic writes.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2", test))]
#[inline]
unsafe fn _stp(dst: *mut u128, val: u128, order: Ordering) {
debug_assert!(dst as usize % 16 == 0);

// SAFETY: the caller must guarantee that `dst` is valid for writes,
// 16-byte aligned, that there are no concurrent non-atomic operations,
// and the CPU supports FEAT_LSE2.
// 16-byte aligned, that there are no concurrent non-atomic operations.
//
// Refs:
// - STP: https://developer.arm.com/documentation/dui0801/g/A64-Data-Transfer-Instructions/STP
Expand Down Expand Up @@ -388,6 +392,12 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 {
// SAFETY: the caller must uphold the safety contract for `atomic_load`.
// The cfg guarantees that the CPU supports FEAT_LSE2.
() => unsafe { _ldp(src, order) },
#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))]
#[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))]
// SAFETY: the caller must uphold the safety contract for `atomic_load`.
// The cfg guarantees that the CPU supports FEAT_LSE.
() => unsafe { _casp(src, 0, 0, order) },
#[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))]
#[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))]
// SAFETY: the caller must uphold the safety contract for `atomic_load`.
() => unsafe { _compare_exchange_ldxp_stxp(src, 0, 0, order) },
Expand Down

0 comments on commit 1eb3f5d

Please sign in to comment.