Optimize parse_str_radix_10 (#456)
* Fix `from_str` overflow rounding issue

The culprit was `maybe_round` preventing the number from overflowing and
ultimately returning an `Ok`. The fix is to round a number only after the
decimal point has been passed.
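
A minimal sketch of the intended behaviour (the strings are illustrative,
chosen around `Decimal::MAX`, and are not taken from this commit's test suite):

```rust
use core::str::FromStr;
use rust_decimal::Decimal;

fn main() {
    // An integer part one above Decimal::MAX must be rejected,
    // not rounded back into range and returned as Ok.
    assert!(Decimal::from_str("79228162514264337593543950336").is_err());

    // Excess fractional digits may still round, because by then
    // the decimal point has been passed.
    assert!(Decimal::from_str("1.00000000000000000000000000001").is_ok());
}
```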

* Remove coeff ArrayVec from `parse_str_radix_10`

This commit flattens `add_by_internal`, noting that it is only ever called
with a fixed-size array plus a single `u32`.

In addition, instead of accumulating the coefficients in an `ArrayVec` and
processing them later, we compute the output number in one pass. This also
removes the manual overflow rounding on the `coeff` array, replacing it with
a plain arithmetic `+1`.
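
The flattened add appears in the `src/ops/array.rs` diff below; the one-pass
accumulation itself has roughly the following shape (a sketch only, with a
hypothetical helper name, not the committed code):

```rust
/// Multiply the little-endian 96-bit coefficient by 10 and fold in the next
/// digit immediately, instead of buffering digits in an ArrayVec first.
/// Returns false once the value no longer fits in 96 bits.
fn push_digit(coeff: &mut [u32; 3], digit: u32) -> bool {
    let mut carry = u64::from(digit);
    for limb in coeff.iter_mut() {
        let v = u64::from(*limb) * 10 + carry;
        *limb = v as u32;
        carry = v >> 32;
    }
    carry == 0
}
```

With the coefficient held directly in `[u32; 3]`, rounding up can then reduce
to something like `add_by_internal_flattened(&mut coeff, 1)` rather than a
pass over buffered digits.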

cargo bench --bench lib_benches -- decimal_from_str

Baseline:
decimal_from_str    ... bench:     566 ns/iter (+/- 21) [M1 Pro]
decimal_from_str    ... bench:     554 ns/iter (+/- 21) [Ryzen 3990x]

Current:
decimal_from_str    ... bench:     193 ns/iter (+/- 1) [M1 Pro]
decimal_from_str    ... bench:     320 ns/iter (+/- 3) [Ryzen 3990x]

* Use 128 bit integer to speed up `parse_str_radix_10`

Instead of manually doing 96-bit integer operations, use a 128-bit integer
and check for overflow past 96 bits of state.
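
Roughly, the per-digit step becomes something like the sketch below (the
constant matches the one added to `src/constants.rs` further down; the
surrounding parser is elided and the function name is illustrative):

```rust
const OVERFLOW_U96: u128 = 1u128 << 96;

/// Accumulate one decimal digit into a 128-bit coefficient, failing once the
/// value can no longer be represented in Decimal's 96-bit mantissa.
fn push_digit(coeff: u128, digit: u8) -> Option<u128> {
    let next = coeff * 10 + u128::from(digit);
    // One comparison replaces the manual three-limb carry chain.
    if next < OVERFLOW_U96 {
        Some(next)
    } else {
        None
    }
}
```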

cargo bench --bench lib_benches -- decimal_from_str

Baseline:
decimal_from_str    ... bench:     566 ns/iter (+/- 21) [M1 Pro]
decimal_from_str    ... bench:     554 ns/iter (+/- 21) [Ryzen 3990x]

Current:
decimal_from_str    ... bench:     182 ns/iter (+/- 3) [M1 Pro]
decimal_from_str    ... bench:     299 ns/iter (+/- 4) [Ryzen 3990x]

* Use 64 bit integer with 128 bit fallback to speed up further

Accumulate parse state into a 64-bit integer until an overflow of that
integer would be observed, then fall back to maintaining 128-bit state.
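
A sketch of the guard (the constant is the one added in `src/constants.rs`
below; the `Result<u64, u128>` encoding of "switch to the wide path" is
illustrative, not the committed code):

```rust
const WILL_OVERFLOW_U64: u64 = u64::MAX / 10 - u8::MAX as u64;

/// Accumulate one digit in u64, or hand the widened state to the 128-bit path.
fn push_digit(coeff: u64, digit: u8) -> Result<u64, u128> {
    if coeff > WILL_OVERFLOW_U64 {
        // Conservative: coeff * 10 + digit might not fit in u64 any more.
        Err(u128::from(coeff) * 10 + u128::from(digit))
    } else {
        Ok(coeff * 10 + u64::from(digit))
    }
}
```

Most real-world inputs have few enough digits that the fallback branch is
essentially never taken.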

cargo bench --bench lib_benches -- decimal_from_str

Baseline:
decimal_from_str    ... bench:     566 ns/iter (+/- 21) [M1 Pro]
decimal_from_str    ... bench:     554 ns/iter (+/- 21) [Ryzen 3990x]

Current:
decimal_from_str    ... bench:     133 ns/iter (+/- 4) [M1 Pro]
decimal_from_str    ... bench:     190 ns/iter (+/- 5) [Ryzen 3990x]

* Convert into series of tail-calls

This commit changes the parsing code to operate as a series of tail calls.
This allows loop and overflow conditions to be checked only exactly when
needed, and makes it easy for the compiler to generate quality code on the
fast path.

The parsing functions are also parameterized by const generics to generate
optimized code paths for the various situations (e.g. has/hasn't seen digits
yet, has/hasn't seen the decimal point) and to remove carried state.
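
A heavily simplified sketch of the shape (the function names, the state that
is threaded through, and the error type are illustrative, not the committed
code; overflow handling is elided):

```rust
fn next<const POINT: bool, const HAS_DIGIT: bool>(
    bytes: &[u8],
    num: u64,
    scale: u8,
) -> Result<(u64, u8), &'static str> {
    match bytes.split_first() {
        Some((&b, rest)) => byte_dispatch::<POINT, HAS_DIGIT>(b, rest, num, scale),
        None if HAS_DIGIT => Ok((num, scale)),
        None => Err("Invalid decimal: no digits"),
    }
}

fn byte_dispatch<const POINT: bool, const HAS_DIGIT: bool>(
    b: u8,
    rest: &[u8],
    num: u64,
    scale: u8,
) -> Result<(u64, u8), &'static str> {
    match b {
        // The real code also checks for overflow here and switches to the
        // 128-bit path when needed.
        b'0'..=b'9' => next::<POINT, true>(rest, num * 10 + u64::from(b - b'0'), scale + POINT as u8),
        b'.' if !POINT => next::<true, HAS_DIGIT>(rest, num, scale),
        _ => Err("Invalid decimal: unexpected character"),
    }
}
```

Each combination of the const generic flags is monomorphised into its own
function, so questions like "have we seen the decimal point yet?" are answered
at compile time instead of being carried as runtime state.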

cargo bench --bench lib_benches -- decimal_from_str

Baseline:
decimal_from_str    ... bench:     566 ns/iter (+/- 21) [M1 Pro]
decimal_from_str    ... bench:     554 ns/iter (+/- 21) [Ryzen 3990x]

Current:
decimal_from_str    ... bench:     85 ns/iter (+/- 2) [M1 Pro]
decimal_from_str    ... bench:     129 ns/iter (+/- 2) [Ryzen 3990x]

* Give each operation a dedicated dispatch

This change gives each of the possible operations in the u64 accumulation
its own dispatch site, so every operation gets dedicated branch prediction.
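
Illustratively (not the committed code), the difference is between one merged
accumulation helper whose single call site must predict every operation, and
per-operation helpers each reached from their own call site:

```rust
fn on_digit(num: u64, d: u8) -> u64 {
    num * 10 + u64::from(d)
}

fn on_separator(num: u64) -> u64 {
    num // '_' contributes nothing to the coefficient
}

fn step(num: u64, b: u8) -> Option<u64> {
    match b {
        // Each arm jumps straight to its own handler; for typical input the
        // branch at each site resolves the same way almost every time.
        b'0'..=b'9' => Some(on_digit(num, b - b'0')),
        b'_' => Some(on_separator(num)),
        _ => None,
    }
}
```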

cargo bench --bench lib_benches -- decimal_from_str

Baseline:
decimal_from_str    ... bench:     566 ns/iter (+/- 21) [M1 Pro]
decimal_from_str    ... bench:     554 ns/iter (+/- 21) [Ryzen 3990x]

Current:
decimal_from_str    ... bench:     81 ns/iter (+/- 1) [M1 Pro]
decimal_from_str    ... bench:     120 ns/iter (+/- 2) [Ryzen 3990x]

* Move 'rare' digit handling into dedicated function

Remove the up-front handling of characters like '+' and '-' and the inline
handling of '_', and move all of it into a function dedicated to uncommon
characters. This reduces the up-front cost for numbers with no prefix, while
keeping a similar startup cost to the previous checks when rare characters
do appear.
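
A sketch of the split (illustrative only: the real code feeds into the parser
state described above rather than a bare integer, does not validate sign
position here, and whether it uses the `#[cold]` attribute or simply outlines
the function is an assumption):

```rust
/// Rare bytes ('+', '-', '_', anything invalid) are handled out of line so
/// the hot loop only ever tests for ASCII digits.
#[cold]
fn handle_rare_byte(b: u8, negative: &mut bool) -> Result<(), &'static str> {
    match b {
        b'_' => Ok(()),        // digit separator: simply skipped
        b'+' => Ok(()),
        b'-' => {
            *negative = true;  // sign handling no longer paid for up front
            Ok(())
        }
        _ => Err("Invalid decimal: unknown character"),
    }
}

fn parse_digits(bytes: &[u8]) -> Result<(u64, bool), &'static str> {
    let mut num = 0u64;
    let mut negative = false;
    for &b in bytes {
        if b.is_ascii_digit() {
            num = num * 10 + u64::from(b - b'0'); // fast path: digits only
        } else {
            handle_rare_byte(b, &mut negative)?;
        }
    }
    Ok((num, negative))
}
```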

Baseline:
decimal_from_str    ... bench:     566 ns/iter (+/- 21) [M1 Pro]

Current:
decimal_from_str    ... bench:     71 ns/iter (+/- 2) [M1 Pro]

* Remove unimportant const generic from digit dispatch

* Verify test cases in unpacked representation

This makes sure that the parsing behaviour is the same as before.
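
This is what the `PartialEq` derive on `UnpackedDecimal` in the
`src/decimal.rs` diff below enables. A sketch of the idea, assuming it lives
inside the crate's test module where the crate-internal `unpack()` helper is
visible, and assuming the struct's remaining fields are the `hi`/`mid`/`lo`
mantissa words (the concrete values are illustrative):

```rust
use core::str::FromStr;
use crate::{Decimal, UnpackedDecimal};

#[test]
fn parses_like_before() {
    let parsed = Decimal::from_str("1.100").unwrap().unpack();
    // Comparing unpacked components catches scale/sign/mantissa differences
    // that plain Decimal equality (which treats 1.1 == 1.100) would hide.
    let expected = UnpackedDecimal {
        negative: false,
        scale: 3,
        hi: 0,
        mid: 0,
        lo: 1100,
    };
    assert_eq!(parsed, expected);
}
```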

Co-authored-by: Tomatillo <[email protected]>
paupino and TradingTomatillo committed Jan 8, 2022
1 parent 005b7be commit fca96a5
Showing 5 changed files with 423 additions and 135 deletions.
5 changes: 5 additions & 0 deletions src/constants.rs
@@ -65,3 +65,8 @@ pub const MAX_I32_SCALE: i32 = 9;
 pub const MAX_I64_SCALE: u32 = 19;
 #[cfg(not(feature = "legacy-ops"))]
 pub const U32_MAX: u64 = u32::MAX as u64;
+
+// Determines potential overflow for 128 bit operations
+pub const OVERFLOW_U96: u128 = 1u128 << 96;
+pub const WILL_OVERFLOW_U64: u64 = u64::MAX / 10 - u8::MAX as u64;
+pub const BYTES_TO_OVERFLOW_U64: usize = 18; // We can probably get away with less
2 changes: 1 addition & 1 deletion src/decimal.rs
@@ -83,7 +83,7 @@ const NEGATIVE_ONE: Decimal = Decimal {
 
 /// `UnpackedDecimal` contains unpacked representation of `Decimal` where each component
 /// of decimal-format stored in it's own field
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, PartialEq)]
 pub struct UnpackedDecimal {
     pub negative: bool,
     pub scale: u32,
7 changes: 6 additions & 1 deletion src/error.rs
@@ -1,4 +1,4 @@
-use crate::constants::MAX_PRECISION_U32;
+use crate::{constants::MAX_PRECISION_U32, Decimal};
 use alloc::string::String;
 use core::fmt;
 
@@ -21,6 +21,11 @@ where
     }
 }
 
+#[cold]
+pub(crate) fn tail_error(from: &'static str) -> Result<Decimal, Error> {
+    Err(from.into())
+}
+
 #[cfg(feature = "std")]
 impl std::error::Error for Error {}
 
20 changes: 20 additions & 0 deletions src/ops/array.rs
@@ -54,6 +54,7 @@ pub(crate) fn rescale_internal(value: &mut [u32; 3], value_scale: &mut u32, new_
     }
 }
 
+#[cfg(feature = "legacy-ops")]
 pub(crate) fn add_by_internal(value: &mut [u32], by: &[u32]) -> u32 {
     let mut carry: u64 = 0;
     let vl = value.len();
@@ -92,6 +93,25 @@ pub(crate) fn add_by_internal(value: &mut [u32], by: &[u32]) -> u32 {
     carry as u32
 }
 
+pub(crate) fn add_by_internal_flattened(value: &mut [u32; 3], by: u32) -> u32 {
+    let mut carry: u64;
+    let mut sum: u64;
+    sum = u64::from(value[0]) + u64::from(by);
+    value[0] = (sum & U32_MASK) as u32;
+    carry = sum >> 32;
+    if carry > 0 {
+        sum = u64::from(value[1]) + carry;
+        value[1] = (sum & U32_MASK) as u32;
+        carry = sum >> 32;
+        if carry > 0 {
+            sum = u64::from(value[2]) + carry;
+            value[2] = (sum & U32_MASK) as u32;
+            carry = sum >> 32;
+        }
+    }
+    carry as u32
+}
+
 #[inline]
 pub(crate) fn add_one_internal(value: &mut [u32]) -> u32 {
     let mut carry: u64 = 1; // Start with one, since adding one
