From cd5000ac935999144f5a31e70eefd9f7ad9e3242 Mon Sep 17 00:00:00 2001 From: Tom Kaitchuck Date: Wed, 27 Mar 2024 16:11:01 -0700 Subject: [PATCH] Prevent perf regression on Aarch64 (#228) * Add mixcolumns step accidentally removed earlier. Signed-off-by: Tom Kaitchuck --- .github/workflows/rust.yml | 9 +++++- src/aes_hash.rs | 8 ------ src/lib.rs | 4 +-- src/operations.rs | 57 +++++++++++++++++++++----------------- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7e28cda..ae04cc7 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -64,7 +64,7 @@ jobs: - run: cargo +1.72.0 check --target armv7-unknown-linux-gnueabihf aarch64-apple-darwin: name: Aarch64 Apple Darwin - runs-on: macos-latest + runs-on: macos-14 steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master @@ -80,6 +80,13 @@ jobs: toolchain: 1.72.0 targets: aarch64-apple-darwin - run: cargo +1.72.0 check --target aarch64-apple-darwin +# aarch64-debug: +# name: Debug Apple +# runs-on: macos-14 +# steps: +# - uses: actions/checkout@v2 +# - name: Setup upterm session +# uses: lhotari/action-upterm@v1 i686-unknown-linux-gnu: name: Linux i686 runs-on: ubuntu-latest diff --git a/src/aes_hash.rs b/src/aes_hash.rs index c0aed7d..39fd40f 100644 --- a/src/aes_hash.rs +++ b/src/aes_hash.rs @@ -101,16 +101,8 @@ impl AHasher { let result: [u64; 2] = aesdec(combined, combined).convert(); result[0] } - - #[inline] - #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] - fn final_mix(&self) -> u128 { - let sum = aesenc(self.sum, self.key); - aesdec(aesdec(sum, self.enc), sum) - } #[inline] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn final_mix(&self) -> u128 { let combined = aesenc(self.sum, self.enc); aesdec(aesdec(combined, self.key), combined) diff --git a/src/lib.rs b/src/lib.rs index 66af806..c0a9ece 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -251,8 +251,8 @@ impl Default for AHasher 
{ // #[inline(never)] // #[doc(hidden)] // pub fn hash_test(input: &[u8]) -> u64 { -// let a = RandomState::with_seeds(11, 22, 33, 44); -// <[u8]>::get_hash(input, &a) +// let a = RandomState::<&[u8]>::with_seeds(11, 22, 33, 44); +// a.hash_one(input) // } #[cfg(feature = "std")] diff --git a/src/operations.rs b/src/operations.rs index 986a500..eed3a2a 100644 --- a/src/operations.rs +++ b/src/operations.rs @@ -7,7 +7,7 @@ pub(crate) const MULTIPLE: u64 = 6364136223846793005; /// This is a constant with a lot of special properties found by automated search. /// See the unit tests below. (Below are alternative values) -#[cfg(all(target_feature = "ssse3", not(miri)))] +#[allow(dead_code)] const SHUFFLE_MASK: u128 = 0x020a0700_0c01030e_050f0d08_06090b04_u128; //const SHUFFLE_MASK: u128 = 0x000d0702_0a040301_05080f0c_0e0b0609_u128; //const SHUFFLE_MASK: u128 = 0x040A0700_030E0106_0D050F08_020B0C09_u128; @@ -51,17 +51,19 @@ pub(crate) fn read_small(data: &[u8]) -> [u64; 2] { #[inline(always)] pub(crate) fn shuffle(a: u128) -> u128 { - #[cfg(all(target_feature = "ssse3", not(miri)))] - { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - unsafe { transmute!(_mm_shuffle_epi8(transmute!(a), transmute!(SHUFFLE_MASK))) } - } - #[cfg(not(all(target_feature = "ssse3", not(miri))))] - { - a.swap_bytes() + cfg_if::cfg_if! 
{ + if #[cfg(all(target_feature = "ssse3", not(miri)))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + unsafe { transmute!(_mm_shuffle_epi8(transmute!(a), transmute!(SHUFFLE_MASK))) } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", not(miri)))] { + use core::arch::aarch64::vqtbl1q_s8; + unsafe { transmute!(vqtbl1q_s8(transmute!(a), transmute!(SHUFFLE_MASK))) } + } else { + a.swap_bytes() + } } } @@ -79,22 +81,25 @@ pub(crate) fn shuffle_and_add(base: u128, to_add: u128) -> u128 { add_by_64s(shuffled, to_add.convert()).convert() } -#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", not(miri)))] #[inline(always)] pub(crate) fn add_by_64s(a: [u64; 2], b: [u64; 2]) -> [u64; 2] { - unsafe { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - transmute!(_mm_add_epi64(transmute!(a), transmute!(b))) + cfg_if::cfg_if! 
{ + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", not(miri)))] { + unsafe { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + transmute!(_mm_add_epi64(transmute!(a), transmute!(b))) + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", not(miri)))] { + use core::arch::aarch64::vaddq_u64; + unsafe { transmute!(vaddq_u64(transmute!(a), transmute!(b))) } + } else { + [a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])] + } } -} -#[cfg(not(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", not(miri))))] -#[inline(always)] -pub(crate) fn add_by_64s(a: [u64; 2], b: [u64; 2]) -> [u64; 2] { - [a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])] } #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)))] @@ -122,7 +127,7 @@ pub(crate) fn aesenc(value: u128, xor: u128) -> u128 { use core::arch::aarch64::*; #[cfg(target_arch = "arm")] use core::arch::arm::*; - unsafe { transmute!(vaeseq_u8(transmute!(value), transmute!(xor))) } + unsafe { transmute!(vaesmcq_u8(vaeseq_u8(transmute!(value), transmute!(xor)))) } } #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)))] @@ -150,7 +155,7 @@ pub(crate) fn aesdec(value: u128, xor: u128) -> u128 { use core::arch::aarch64::*; #[cfg(target_arch = "arm")] use core::arch::arm::*; - unsafe { transmute!(vaesdq_u8(transmute!(value), transmute!(xor))) } + unsafe { transmute!(vaesimcq_u8(vaesdq_u8(transmute!(value), transmute!(xor)))) } } #[allow(unused)]