aes_gcm/x86_64: Enable AVX2 VAES-CLMUL implementation.
If using GNU binutils as the assembler, this may require a newer
version than was previously required.
briansmith committed Mar 4, 2025
1 parent 8ae4f2d commit 0c128bf
Showing 11 changed files with 208 additions and 51 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -51,6 +51,7 @@ include = [
"crypto/curve25519/curve25519_tables.h",
"crypto/curve25519/internal.h",
"crypto/fipsmodule/aes/aes_nohw.c",
"crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesni-x86.pl",
"crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl",
"crypto/fipsmodule/aes/asm/aesni-x86_64.pl",
5 changes: 5 additions & 0 deletions build.rs
@@ -76,6 +76,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[

(&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
(&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
(&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"),
@@ -887,7 +888,9 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"LIMB_shr",
"OPENSSL_cpuid_setup",
"aes_gcm_dec_kernel",
"aes_gcm_dec_update_vaes_avx2",
"aes_gcm_enc_kernel",
"aes_gcm_enc_update_vaes_avx2",
"aes_hw_ctr32_encrypt_blocks",
"aes_hw_set_encrypt_key",
"aes_hw_set_encrypt_key_alt",
@@ -946,11 +949,13 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"gcm_ghash_avx",
"gcm_ghash_clmul",
"gcm_ghash_neon",
"gcm_ghash_vpclmulqdq_avx2_1",
"gcm_gmult_clmul",
"gcm_gmult_neon",
"gcm_init_avx",
"gcm_init_clmul",
"gcm_init_neon",
"gcm_init_vpclmulqdq_avx2",
"k25519Precomp",
"limbs_mul_add_limb",
"little_endian_bytes_from_scalar",
49 changes: 2 additions & 47 deletions crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl
@@ -439,12 +439,8 @@ sub _ghash_4x {
# const uint8_t *in, size_t len);
#
# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
# by |in| and |len|. |len| must be a multiple of 16.
#
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD which is the common case. TLS uses less
# than one block of AAD, but (uncommonly) other use cases may use much more.
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1;
# by |in| and |len|. |len| must be exactly 16.
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1;
{
# Function arguments
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
@@ -470,49 +466,8 @@ sub _ghash_4x {
vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
vbroadcasti128 .Lgfpoly(%rip), $GFPOLY

# Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
cmp \$32, $AADLEN
jb .Lghash_lastblock

cmp \$127, $AADLEN
jbe .Lghash_loop_1x

# Update GHASH with 128 bytes of AAD at a time.
vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
.Lghash_loop_4x:
@{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED,
$TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC,
$GHASH_ACC_XMM ]}
sub \$-128, $AAD # 128 is 4 bytes, -128 is 1 byte
add \$-128, $AADLEN
cmp \$127, $AADLEN
ja .Lghash_loop_4x

# Update GHASH with 32 bytes of AAD at a time.
cmp \$32, $AADLEN
jb .Lghash_loop_1x_done
.Lghash_loop_1x:
vmovdqu ($AAD), $TMP0
vpshufb $BSWAP_MASK, $TMP0, $TMP0
vpxor $TMP0, $GHASH_ACC, $GHASH_ACC
vmovdqu $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0
@{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]}
vextracti128 \$1, $GHASH_ACC, $TMP0_XMM
vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
add \$32, $AAD
sub \$32, $AADLEN
cmp \$32, $AADLEN
jae .Lghash_loop_1x
.Lghash_loop_1x_done:
# Issue the vzeroupper that is needed after using ymm registers. Do it here
# instead of at the end, to minimize overhead for small AADLEN.
vzeroupper

# Update GHASH with the remaining 16-byte block if any.
.Lghash_lastblock:
test $AADLEN, $AADLEN
jz .Lghash_done
vmovdqu ($AAD), $TMP0_XMM
vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM
vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
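With the bulk loops removed, the renamed `gcm_ghash_vpclmulqdq_avx2_1` performs exactly one GHASH step per call: Xi ← (Xi ⊕ block) · H in GF(2^128). As a point of reference for that contract, here is a minimal, non-constant-time sketch of the same step in Rust; it illustrates the math only, is not ring's implementation, and must never be used with real keys.

```rust
// Reference-only sketch of one GHASH step: xi = (xi ^ block) * h in
// GF(2^128), reduction polynomial x^128 + x^7 + x^2 + x + 1 (the 0xE1
// constant below, in GCM's bit-reflected representation). Not constant-time.
fn gf128_mul(x: u128, y: u128) -> u128 {
    const R: u128 = 0xe1u128 << 120;
    let mut z = 0u128;
    let mut v = y;
    for i in 0..128 {
        // Test the bits of x from most significant to least significant.
        if (x >> (127 - i)) & 1 == 1 {
            z ^= v;
        }
        // Advance v to v*x in GCM's reflected bit order, reducing mod R.
        let carry = v & 1;
        v >>= 1;
        if carry == 1 {
            v ^= R;
        }
    }
    z
}

// Fold one 16-byte block into the accumulator, loaded big-endian as in GCM.
fn ghash_update_block(xi: &mut u128, h: u128, block: [u8; 16]) {
    *xi = gf128_mul(*xi ^ u128::from_be_bytes(block), h);
}
```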
2 changes: 1 addition & 1 deletion src/aead/aes.rs
@@ -151,7 +151,7 @@ impl Counter {
iv
}

fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) {
pub(super) fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) {
let [.., c0, c1, c2, c3] = &mut self.0;
let old_value: u32 = u32::from_be_bytes([*c0, *c1, *c2, *c3]);
let new_value = old_value + increment_by.get();
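The visibility change above exists because the new x86_64 wrappers (below) advance the Rust-side `Counter` by the number of whole blocks the assembly kernel consumed. A standalone sketch of that increment on the big-endian low 32 bits of the counter block; the wrapping arithmetic is an illustrative assumption, and the real method is the one shown above.

```rust
use core::num::NonZeroU32;

// Standalone sketch (not ring's exact code): the CTR32 kernels consume whole
// blocks, so the caller re-syncs the counter afterwards by advancing the
// big-endian low 32 bits of the 16-byte counter block.
fn increment_by(counter_block: &mut [u8; 16], blocks: NonZeroU32) {
    let [.., c0, c1, c2, c3] = counter_block;
    let old = u32::from_be_bytes([*c0, *c1, *c2, *c3]);
    // Wrapping here is an illustrative assumption; the real code relies on
    // callers staying within the nonce's counter space.
    let new = old.wrapping_add(blocks.get());
    [*c0, *c1, *c2, *c3] = new.to_be_bytes();
}

fn main() {
    let mut block = [0u8; 16];
    block[15] = 0xfe; // low 32 bits = 254
    increment_by(&mut block, NonZeroU32::new(3).unwrap());
    assert_eq!(&block[12..], &[0, 0, 1, 1]); // 254 + 3 = 257 = 0x0000_0101
}
```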
40 changes: 38 additions & 2 deletions src/aead/aes_gcm.rs
@@ -35,6 +35,7 @@ use cpu::GetFeature as _;

mod aarch64;
mod aeshwclmulmovbe;
mod vaesclmulavx2;

#[derive(Clone)]
pub(super) struct Key(DynKey);
@@ -50,6 +51,9 @@ impl Key {

#[derive(Clone)]
enum DynKey {
#[cfg(target_arch = "x86_64")]
VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),

#[cfg(target_arch = "x86_64")]
AesHwClMulAvxMovbe(Combo<aes::hw::Key, gcm::clmulavxmovbe::Key>),

@@ -75,11 +79,16 @@ enum DynKey {
impl DynKey {
fn new(key: aes::KeyBytes, cpu: cpu::Features) -> Result<Self, error::Unspecified> {
let cpu = cpu.values();

#[cfg(target_arch = "x86_64")]
if let Some((aes, gcm)) = cpu.get_feature() {
// 14.3.1 Detection of VEX-Encoded AES and VPCLMULQDQ
let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?;
let gcm_key_value = derive_gcm_key_value(&aes_key);
let combo = if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
} else if let Some(cpu) = cpu.get_feature() {
let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu);
Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key })
} else {
@@ -181,6 +190,16 @@ pub(super) fn seal(
seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => seal_whole_partial(
c,
aad,
in_out,
ctr,
tag_iv,
vaesclmulavx2::seal_whole_vaes_clmul_avx2,
),

#[cfg(target_arch = "x86_64")]
DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => {
aeshwclmulmovbe::seal(aes_key, gcm_key, ctr, tag_iv, aad, in_out)
@@ -201,7 +220,10 @@ pub(super) fn seal(
}
}

#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[cfg(any(
all(target_arch = "aarch64", target_endian = "little"),
target_arch = "x86_64"
))]
fn seal_whole_partial<A: aes::EncryptBlock, G: gcm::UpdateBlock>(
Combo { aes_key, gcm_key }: &Combo<A, G>,
aad: Aad<&[u8]>,
@@ -295,6 +317,17 @@ pub(super) fn open(
open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole)
}

#[cfg(target_arch = "x86_64")]
DynKey::VAesClMulAvx2(c) => open_whole_partial(
c,
aad,
in_out_slice,
src,
ctr,
tag_iv,
vaesclmulavx2::open_whole_vaes_clmul_avx2,
),

#[cfg(target_arch = "x86_64")]
DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => {
aeshwclmulmovbe::open(aes_key, gcm_key, ctr, tag_iv, aad, in_out_slice, src)
@@ -315,7 +348,10 @@
}
}

#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[cfg(any(
all(target_arch = "aarch64", target_endian = "little"),
target_arch = "x86_64"
))]
fn open_whole_partial<A: aes::EncryptBlock, G: gcm::UpdateBlock>(
Combo { aes_key, gcm_key }: &Combo<A, G>,
aad: Aad<&[u8]>,
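The `DynKey::new` change above makes the x86_64 dispatch a strict preference order: the fused VAES+VPCLMULQDQ+AVX2 kernel first, then AES-NI+CLMUL+AVX+MOVBE, then the generic fallbacks. A self-contained model of that cascade (the names and boolean flags are illustrative; ring's real detection goes through the `cpu.get_feature()` typestates):

```rust
// Illustrative-only model of the new x86_64 preference order in DynKey::new.
// The flags stand in for ring's typestate feature tokens.
#[derive(Debug, PartialEq)]
enum Impl {
    VAesClMulAvx2,      // AVX2 + VAES + VPCLMULQDQ (this commit)
    AesHwClMulAvxMovbe, // AES-NI + CLMUL + AVX + MOVBE
    Fallback,           // remaining portable/other paths
}

fn choose(vaes_vpclmul_avx2: bool, clmul_avx_movbe: bool) -> Impl {
    if vaes_vpclmul_avx2 {
        Impl::VAesClMulAvx2
    } else if clmul_avx_movbe {
        Impl::AesHwClMulAvxMovbe
    } else {
        Impl::Fallback
    }
}

fn main() {
    // A CPU advertising the AVX2 VAES/VPCLMULQDQ extensions takes the new
    // path; older CPUs keep the behavior from before this commit.
    assert_eq!(choose(true, true), Impl::VAesClMulAvx2);
    assert_eq!(choose(false, true), Impl::AesHwClMulAvxMovbe);
    assert_eq!(choose(false, false), Impl::Fallback);
}
```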
2 changes: 2 additions & 0 deletions src/aead/aes_gcm/aeshwclmulmovbe.rs
@@ -26,6 +26,7 @@ use crate::{
};
use core::ops::RangeFrom;

#[inline(never)]
pub(super) fn seal(
aes_key: &aes::hw::Key,
gcm_key: &gcm::clmulavxmovbe::Key,
@@ -79,6 +80,7 @@ pub(super) fn seal(
super::seal_finish(aes_key, auth, remainder, ctr, tag_iv)
}

#[inline(never)]
pub(super) fn open(
aes_key: &aes::hw::Key,
gcm_key: &gcm::clmulavxmovbe::Key,
86 changes: 86 additions & 0 deletions src/aead/aes_gcm/vaesclmulavx2.rs
@@ -0,0 +1,86 @@
// Copyright 2015-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]

use super::{aes, gcm, Counter, BLOCK_LEN};
use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut};
use core::num::NonZeroU32;

pub(super) fn seal_whole_vaes_clmul_avx2(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx2::Key>,
ctr: &mut Counter,
mut in_out: AsChunksMut<u8, BLOCK_LEN>,
) {
prefixed_extern! {
fn aes_gcm_enc_update_vaes_avx2(
input: *const u8,
output: *mut u8,
len: c::size_t,
key: &aes::AES_KEY,
ivec: &Counter,
Htable: &gcm::HTable,
Xi: &mut gcm::Xi);
}

let in_out = in_out.as_flattened_mut();

// Precondition: Since we have a `gcm::Context` then the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

if let Some(blocks) = NonZeroU32::new(blocks) {
let aes_key = aes_key.inner_less_safe();
let (htable, xi) = auth.inner();
let input = in_out.as_ptr();
let output = in_out.as_mut_ptr();
let len = in_out.len();
unsafe { aes_gcm_enc_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) };
ctr.increment_by_less_safe(blocks);
}
}

pub(super) fn open_whole_vaes_clmul_avx2(
aes_key: &aes::hw::Key,
auth: &mut gcm::Context<gcm::vclmulavx2::Key>,
in_out: Overlapping,
ctr: &mut Counter,
) {
prefixed_extern! {
fn aes_gcm_dec_update_vaes_avx2(
input: *const u8,
output: *mut u8,
len: c::size_t,
key: &aes::AES_KEY,
ivec: &mut Counter,
Htable: &gcm::HTable,
Xi: &mut gcm::Xi);
}

// Precondition. TODO: Create an overlapping::AsChunks for this.
assert_eq!(in_out.len() % BLOCK_LEN, 0);
// Precondition: Since we have a `gcm::Context` then the number of blocks
// must fit in `u32`.
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

if let Some(blocks) = NonZeroU32::new(blocks) {
let aes_key = aes_key.inner_less_safe();
let (htable, xi) = auth.inner();
in_out.with_input_output_len(|input, output, len| unsafe {
aes_gcm_dec_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi)
});
ctr.increment_by_less_safe(blocks);
}
}
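Both wrappers share the same precondition arithmetic: the length is a whole number of 16-byte blocks, and the block count fits in `u32` (guaranteed by the existence of a `gcm::Context`). A hedged standalone version of that computation, returning `Option` instead of unwrapping:

```rust
use core::num::NonZeroU32;

const BLOCK_LEN: usize = 16;

// Hedged sketch of the shared precondition logic. The real functions unwrap()
// because a gcm::Context already guarantees the u32 bound; returning Option
// here is an illustrative choice. None produced for zero blocks means the
// kernel call is skipped entirely, as in the wrappers above.
fn whole_block_count(len: usize) -> Option<NonZeroU32> {
    assert_eq!(len % BLOCK_LEN, 0); // callers pass only whole blocks
    u32::try_from(len / BLOCK_LEN).ok().and_then(NonZeroU32::new)
}
```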
10 changes: 10 additions & 0 deletions src/aead/gcm.rs
@@ -38,6 +38,7 @@ pub(super) mod clmul;
pub(super) mod clmulavxmovbe;
pub(super) mod fallback;
pub(super) mod neon;
pub(super) mod vclmulavx2;

pub(super) struct Context<'key, K> {
Xi: Xi,
@@ -118,6 +119,15 @@ impl Context<'_, clmulavxmovbe::Key> {
}
}

#[cfg(target_arch = "x86_64")]
impl Context<'_, vclmulavx2::Key> {
/// Access to `inner` for the integrated AES-GCM implementations only.
#[inline]
pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) {
(self.key.inner(), &mut self.Xi)
}
}

impl<K: UpdateBlocks> Context<'_, K> {
#[inline(always)]
pub fn update_blocks(&mut self, input: AsChunks<u8, BLOCK_LEN>) {
1 change: 1 addition & 0 deletions src/aead/gcm/clmul.rs
@@ -36,6 +36,7 @@ pub struct Key {
}

impl Key {
#[cfg_attr(target_arch = "x86_64", inline(never))]
pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self {
Self {
h_table: unsafe { htable_new!(gcm_init_clmul, value) },
46 changes: 46 additions & 0 deletions src/aead/gcm/vclmulavx2.rs
@@ -0,0 +1,46 @@
// Copyright 2018-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]

use super::{ffi::KeyValue, HTable, UpdateBlock, Xi};
use crate::{
aead::gcm::ffi::BLOCK_LEN,
cpu::intel::{Avx2, VAesClmul},
polyfill::slice::AsChunks,
};

#[derive(Clone)]
pub struct Key {
h_table: HTable,
}

impl Key {
pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self {
Self {
h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) },
}
}

pub(super) fn inner(&self) -> &HTable {
&self.h_table
}
}

impl UpdateBlock for Key {
fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
let input: AsChunks<u8, BLOCK_LEN> = (&a).into();
unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) }
}
}
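`update_block` is the per-block path that the new `_1` assembly entry point serves: bulk data is authenticated inside the fused `aes_gcm_{enc,dec}_update_vaes_avx2` kernels, so only the leftovers (AAD, a final partial block, the length block) need one-block GHASH updates. A hypothetical driver sketches that usage over AAD, with zero-padding of a trailing partial chunk as GCM requires; none of these names are ring's:

```rust
// Hypothetical driver (names are not ring's): fold AAD into the GHASH
// accumulator one block at a time. Whole 16-byte chunks are used directly;
// a trailing partial chunk is zero-padded first, as GCM requires.
fn ghash_aad(
    update_block: impl Fn(&mut [u8; 16], [u8; 16]),
    xi: &mut [u8; 16],
    aad: &[u8],
) {
    let mut chunks = aad.chunks_exact(16);
    for chunk in &mut chunks {
        update_block(xi, chunk.try_into().unwrap());
    }
    let rem = chunks.remainder();
    if !rem.is_empty() {
        let mut block = [0u8; 16];
        block[..rem.len()].copy_from_slice(rem);
        update_block(xi, block);
    }
}
```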