From 0c128bf9efc9c90047fd60622af718ba860f9b8b Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Thu, 13 Feb 2025 13:36:08 -0800 Subject: [PATCH] aes_gcm/x86_64: Enable AVX2 VAES-CLMUL implementation. If using GNU binutils as the assembler, this may require a newer version than what was previously required. --- Cargo.toml | 1 + build.rs | 5 ++ .../fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl | 49 +---------- src/aead/aes.rs | 2 +- src/aead/aes_gcm.rs | 40 ++++++++- src/aead/aes_gcm/aeshwclmulmovbe.rs | 2 + src/aead/aes_gcm/vaesclmulavx2.rs | 86 +++++++++++++++++++ src/aead/gcm.rs | 10 +++ src/aead/gcm/clmul.rs | 1 + src/aead/gcm/vclmulavx2.rs | 46 ++++++++++ src/cpu/intel.rs | 17 +++- 11 files changed, 208 insertions(+), 51 deletions(-) create mode 100644 src/aead/aes_gcm/vaesclmulavx2.rs create mode 100644 src/aead/gcm/vclmulavx2.rs diff --git a/Cargo.toml b/Cargo.toml index 5163255331..8c51848af8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,7 @@ include = [ "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", "crypto/fipsmodule/aes/aes_nohw.c", + "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", diff --git a/build.rs b/build.rs index b8c54b36fd..9843ad8aa5 100644 --- a/build.rs +++ b/build.rs @@ -76,6 +76,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"), (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"), @@ -887,7 +888,9 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "LIMB_shr", "OPENSSL_cpuid_setup", "aes_gcm_dec_kernel", + "aes_gcm_dec_update_vaes_avx2", "aes_gcm_enc_kernel", + "aes_gcm_enc_update_vaes_avx2", "aes_hw_ctr32_encrypt_blocks", "aes_hw_set_encrypt_key", "aes_hw_set_encrypt_key_alt", @@ -946,11 +949,13 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "gcm_ghash_avx", "gcm_ghash_clmul", "gcm_ghash_neon", + "gcm_ghash_vpclmulqdq_avx2_1", "gcm_gmult_clmul", "gcm_gmult_neon", "gcm_init_avx", "gcm_init_clmul", "gcm_init_neon", + "gcm_init_vpclmulqdq_avx2", "k25519Precomp", "limbs_mul_add_limb", "little_endian_bytes_from_scalar", diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl index a67debc609..db4bea0baa 100644 --- a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl +++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl @@ -439,12 +439,8 @@ sub _ghash_4x { # const uint8_t *in, size_t len); # # Using the key |Htable|, update the GHASH accumulator |Xi| with the data given -# by |in| and |len|. |len| must be a multiple of 16. -# -# This function handles large amounts of AAD efficiently, while also keeping the -# overhead low for small amounts of AAD which is the common case. TLS uses less -# than one block of AAD, but (uncommonly) other use cases may use much more. -$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1; +# by |in| and |len|. |len| must be exactly 16. +$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1; { # Function arguments my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 
3 ]; @@ -470,49 +466,8 @@ sub _ghash_4x { vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM vbroadcasti128 .Lgfpoly(%rip), $GFPOLY - # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128. - cmp \$32, $AADLEN - jb .Lghash_lastblock - - cmp \$127, $AADLEN - jbe .Lghash_loop_1x - - # Update GHASH with 128 bytes of AAD at a time. - vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED - vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED -.Lghash_loop_4x: - @{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED, - $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC, - $GHASH_ACC_XMM ]} - sub \$-128, $AAD # 128 is 4 bytes, -128 is 1 byte - add \$-128, $AADLEN - cmp \$127, $AADLEN - ja .Lghash_loop_4x - - # Update GHASH with 32 bytes of AAD at a time. - cmp \$32, $AADLEN - jb .Lghash_loop_1x_done -.Lghash_loop_1x: - vmovdqu ($AAD), $TMP0 - vpshufb $BSWAP_MASK, $TMP0, $TMP0 - vpxor $TMP0, $GHASH_ACC, $GHASH_ACC - vmovdqu $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0 - @{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]} - vextracti128 \$1, $GHASH_ACC, $TMP0_XMM - vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM - add \$32, $AAD - sub \$32, $AADLEN - cmp \$32, $AADLEN - jae .Lghash_loop_1x -.Lghash_loop_1x_done: - # Issue the vzeroupper that is needed after using ymm registers. Do it here - # instead of at the end, to minimize overhead for small AADLEN. - vzeroupper - # Update GHASH with the remaining 16-byte block if any. .Lghash_lastblock: - test $AADLEN, $AADLEN - jz .Lghash_done vmovdqu ($AAD), $TMP0_XMM vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM diff --git a/src/aead/aes.rs b/src/aead/aes.rs index eff2cd885a..8de8451395 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -151,7 +151,7 @@ impl Counter { iv } - fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) { + pub(super) fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) { let [.., c0, c1, c2, c3] = &mut self.0; let old_value: u32 = u32::from_be_bytes([*c0, *c1, *c2, *c3]); let new_value = old_value + increment_by.get(); diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs index 39f10854f2..d9e08a3116 100644 --- a/src/aead/aes_gcm.rs +++ b/src/aead/aes_gcm.rs @@ -35,6 +35,7 @@ use cpu::GetFeature as _; mod aarch64; mod aeshwclmulmovbe; +mod vaesclmulavx2; #[derive(Clone)] pub(super) struct Key(DynKey); @@ -50,6 +51,9 @@ impl Key { #[derive(Clone)] enum DynKey { + #[cfg(target_arch = "x86_64")] + VAesClMulAvx2(Combo), + #[cfg(target_arch = "x86_64")] AesHwClMulAvxMovbe(Combo), @@ -75,11 +79,16 @@ enum DynKey { impl DynKey { fn new(key: aes::KeyBytes, cpu: cpu::Features) -> Result { let cpu = cpu.values(); + #[cfg(target_arch = "x86_64")] if let Some((aes, gcm)) = cpu.get_feature() { + // 14.3.1 Detection of VEX-Encoded AES and VPCLMULQDQ let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; let gcm_key_value = derive_gcm_key_value(&aes_key); let combo = if let Some(cpu) = cpu.get_feature() { + let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu); + Self::VAesClMulAvx2(Combo { aes_key, gcm_key }) + } else if let Some(cpu) = cpu.get_feature() { let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu); Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) } else { @@ -181,6 +190,16 @@ pub(super) fn seal( seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole) } + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => seal_whole_partial( + c, + aad, + 
in_out, + ctr, + tag_iv, + vaesclmulavx2::seal_whole_vaes_clmul_avx2, + ), + #[cfg(target_arch = "x86_64")] DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { aeshwclmulmovbe::seal(aes_key, gcm_key, ctr, tag_iv, aad, in_out) @@ -201,7 +220,10 @@ pub(super) fn seal( } } -#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] fn seal_whole_partial( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, @@ -295,6 +317,17 @@ pub(super) fn open( open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole) } + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => open_whole_partial( + c, + aad, + in_out_slice, + src, + ctr, + tag_iv, + vaesclmulavx2::open_whole_vaes_clmul_avx2, + ), + #[cfg(target_arch = "x86_64")] DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { aeshwclmulmovbe::open(aes_key, gcm_key, ctr, tag_iv, aad, in_out_slice, src) @@ -315,7 +348,10 @@ pub(super) fn open( } } -#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] fn open_whole_partial( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, diff --git a/src/aead/aes_gcm/aeshwclmulmovbe.rs b/src/aead/aes_gcm/aeshwclmulmovbe.rs index 35236474ff..e6d49ee3bb 100644 --- a/src/aead/aes_gcm/aeshwclmulmovbe.rs +++ b/src/aead/aes_gcm/aeshwclmulmovbe.rs @@ -26,6 +26,7 @@ use crate::{ }; use core::ops::RangeFrom; +#[inline(never)] pub(super) fn seal( aes_key: &aes::hw::Key, gcm_key: &gcm::clmulavxmovbe::Key, @@ -79,6 +80,7 @@ pub(super) fn seal( super::seal_finish(aes_key, auth, remainder, ctr, tag_iv) } +#[inline(never)] pub(super) fn open( aes_key: &aes::hw::Key, gcm_key: &gcm::clmulavxmovbe::Key, diff --git a/src/aead/aes_gcm/vaesclmulavx2.rs b/src/aead/aes_gcm/vaesclmulavx2.rs new file mode 100644 index 0000000000..8a2a68f238 --- /dev/null +++ b/src/aead/aes_gcm/vaesclmulavx2.rs @@ -0,0 +1,86 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{aes, gcm, Counter, BLOCK_LEN}; +use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut}; +use core::num::NonZeroU32; + +pub(super) fn seal_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + ctr: &mut Counter, + mut in_out: AsChunksMut, +) { + prefixed_extern! { + fn aes_gcm_enc_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + let in_out = in_out.as_flattened_mut(); + + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. 
+ let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + let input = in_out.as_ptr(); + let output = in_out.as_mut_ptr(); + let len = in_out.len(); + unsafe { aes_gcm_enc_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) }; + ctr.increment_by_less_safe(blocks); + } +} + +pub(super) fn open_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + in_out: Overlapping, + ctr: &mut Counter, +) { + prefixed_extern! { + fn aes_gcm_dec_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + // Precondition. TODO: Create an overlapping::AsChunks for this. + assert_eq!(in_out.len() % BLOCK_LEN, 0); + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. + let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + in_out.with_input_output_len(|input, output, len| unsafe { + aes_gcm_dec_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) + }); + ctr.increment_by_less_safe(blocks); + } +} diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs index 9ac1cc1bd8..443c19e16b 100644 --- a/src/aead/gcm.rs +++ b/src/aead/gcm.rs @@ -38,6 +38,7 @@ pub(super) mod clmul; pub(super) mod clmulavxmovbe; pub(super) mod fallback; pub(super) mod neon; +pub(super) mod vclmulavx2; pub(super) struct Context<'key, K> { Xi: Xi, @@ -118,6 +119,15 @@ impl Context<'_, clmulavxmovbe::Key> { } } +#[cfg(target_arch = "x86_64")] +impl Context<'_, vclmulavx2::Key> { + /// Access to `inner` for the integrated AES-GCM implementations only. + #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (self.key.inner(), &mut self.Xi) + } +} + impl Context<'_, K> { #[inline(always)] pub fn update_blocks(&mut self, input: AsChunks) { diff --git a/src/aead/gcm/clmul.rs b/src/aead/gcm/clmul.rs index d7f84aa6ef..8cd55a4eeb 100644 --- a/src/aead/gcm/clmul.rs +++ b/src/aead/gcm/clmul.rs @@ -36,6 +36,7 @@ pub struct Key { } impl Key { + #[cfg_attr(target_arch = "x86_64", inline(never))] pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { Self { h_table: unsafe { htable_new!(gcm_init_clmul, value) }, diff --git a/src/aead/gcm/vclmulavx2.rs b/src/aead/gcm/vclmulavx2.rs new file mode 100644 index 0000000000..916dd1eb32 --- /dev/null +++ b/src/aead/gcm/vclmulavx2.rs @@ -0,0 +1,46 @@ +// Copyright 2018-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +#![cfg(target_arch = "x86_64")] + +use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; +use crate::{ + aead::gcm::ffi::BLOCK_LEN, + cpu::intel::{Avx2, VAesClmul}, + polyfill::slice::AsChunks, +}; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) }, + } + } + + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + let input: AsChunks = (&a).into(); + unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) } + } +} diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs index ec117c207a..f45052fe7f 100644 --- a/src/cpu/intel.rs +++ b/src/cpu/intel.rs @@ -137,7 +137,7 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { // Intel: "Structured Extended Feature Flags Enumeration Leaf" #[cfg(target_arch = "x86_64")] - let extended_features_ebx = cpuid[2]; + let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]); let mut caps = 0; @@ -218,6 +218,20 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { set(&mut caps, Shift::Avx); } + #[cfg(target_arch = "x86_64")] + if avx_available { + // The Intel docs don't seem to document the detection. The instruction + // definitions of the VEX.256 instructions reference the + // VAES/VPCLMULQDQ features and the documentation for the extended + // features gives the values. We combine these into one feature because + // we never use them independently. + let vaes_available = check(extended_features_ecx, 9); + let vclmul_available = check(extended_features_ecx, 10); + if vaes_available && vclmul_available { + set(&mut caps, Shift::VAesClmul); + } + } + // "14.7.1 Detection of Intel AVX2 Hardware support" // XXX: We don't condition AVX2 on AVX. TODO: Address this. // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't @@ -318,6 +332,7 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { impl_get_feature! { features: [ + { ("x86_64") => VAesClmul }, { ("x86", "x86_64") => ClMul }, { ("x86", "x86_64") => Ssse3 }, { ("x86_64") => Sse41 },
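
Note (reviewer aside, not part of the patch): the `VAesClmul` capability added in src/cpu/intel.rs is derived from CPUID leaf 7, sub-leaf 0 ("Structured Extended Feature Flags Enumeration"), ECX bit 9 (VAES) and bit 10 (VPCLMULQDQ), and is only set when AVX is usable. Below is a rough standalone sketch of that check, using `core::arch::x86_64::__cpuid_count` directly instead of ring's `cpuid_to_caps_and_set_c_flags` plumbing; the function name is illustrative, and the max-leaf and OSXSAVE/XGETBV checks that the real code depends on are omitted.

    #[cfg(target_arch = "x86_64")]
    fn vaes_vpclmulqdq_supported() -> bool {
        use core::arch::x86_64::__cpuid_count;
        // CPUID.(EAX=07H, ECX=0):ECX is the "Structured Extended Feature
        // Flags" sub-leaf referenced in the cpu/intel.rs hunk above.
        let leaf7 = unsafe { __cpuid_count(7, 0) };
        let vaes = (leaf7.ecx >> 9) & 1 != 0; // VAES
        let vpclmulqdq = (leaf7.ecx >> 10) & 1 != 0; // VPCLMULQDQ
        // The patch sets Shift::VAesClmul only when both bits are present
        // (and AVX is available), since the two are never used
        // independently; mirror that by requiring both here. A complete
        // check would also confirm the maximum CPUID leaf is >= 7 and that
        // the OS enables AVX state (OSXSAVE/XGETBV), as ring does elsewhere.
        vaes && vpclmulqdq
    }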
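
A second aside on the counter handling in src/aead/aes_gcm/vaesclmulavx2.rs: the assembly routines advance the IV internally, so after processing `blocks` whole blocks the Rust wrappers advance their own `Counter` by the same amount, which is why `Counter::increment_by_less_safe` becomes `pub(super)`. The sketch below shows the CTR32 convention involved, i.e. a big-endian increment of the final 32-bit word of the counter block; the name `increment_be32` is made up for illustration, and the `wrapping_add` is an assumption — ring's `increment_by_less_safe` uses a plain add, so overflow there is not silently wrapped.

    fn increment_be32(counter_block: &mut [u8; 16], blocks: u32) {
        use core::convert::TryInto;
        // AES-GCM CTR mode only increments the last 32-bit word, big-endian,
        // matching the `[.., c0, c1, c2, c3]` destructuring in aes.rs above.
        let old = u32::from_be_bytes(counter_block[12..16].try_into().unwrap());
        let new = old.wrapping_add(blocks);
        counter_block[12..16].copy_from_slice(&new.to_be_bytes());
    }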