From 0c128bf9efc9c90047fd60622af718ba860f9b8b Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Thu, 13 Feb 2025 13:36:08 -0800 Subject: [PATCH] aes_gcm/x86_64: Enable AVX2 VAES-CLMUL implementation. If using GNU binutils as the assembler, this may require a newer version than what was previously required. --- Cargo.toml | 1 + build.rs | 5 ++ .../fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl | 49 +---------- src/aead/aes.rs | 2 +- src/aead/aes_gcm.rs | 40 ++++++++- src/aead/aes_gcm/aeshwclmulmovbe.rs | 2 + src/aead/aes_gcm/vaesclmulavx2.rs | 86 +++++++++++++++++++ src/aead/gcm.rs | 10 +++ src/aead/gcm/clmul.rs | 1 + src/aead/gcm/vclmulavx2.rs | 46 ++++++++++ src/cpu/intel.rs | 17 +++- 11 files changed, 208 insertions(+), 51 deletions(-) create mode 100644 src/aead/aes_gcm/vaesclmulavx2.rs create mode 100644 src/aead/gcm/vclmulavx2.rs diff --git a/Cargo.toml b/Cargo.toml index 5163255331..8c51848af8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,7 @@ include = [ "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", "crypto/fipsmodule/aes/aes_nohw.c", + "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", diff --git a/build.rs b/build.rs index b8c54b36fd..9843ad8aa5 100644 --- a/build.rs +++ b/build.rs @@ -76,6 +76,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[ (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"), (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"), + (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"), (&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"), @@ -887,7 +888,9 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "LIMB_shr", "OPENSSL_cpuid_setup", "aes_gcm_dec_kernel", + "aes_gcm_dec_update_vaes_avx2", "aes_gcm_enc_kernel", + "aes_gcm_enc_update_vaes_avx2", "aes_hw_ctr32_encrypt_blocks", "aes_hw_set_encrypt_key", "aes_hw_set_encrypt_key_alt", @@ -946,11 +949,13 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "gcm_ghash_avx", "gcm_ghash_clmul", "gcm_ghash_neon", + "gcm_ghash_vpclmulqdq_avx2_1", "gcm_gmult_clmul", "gcm_gmult_neon", "gcm_init_avx", "gcm_init_clmul", "gcm_init_neon", + "gcm_init_vpclmulqdq_avx2", "k25519Precomp", "limbs_mul_add_limb", "little_endian_bytes_from_scalar", diff --git a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl index a67debc609..db4bea0baa 100644 --- a/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl +++ b/crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl @@ -439,12 +439,8 @@ sub _ghash_4x { # const uint8_t *in, size_t len); # # Using the key |Htable|, update the GHASH accumulator |Xi| with the data given -# by |in| and |len|. |len| must be a multiple of 16. -# -# This function handles large amounts of AAD efficiently, while also keeping the -# overhead low for small amounts of AAD which is the common case. TLS uses less -# than one block of AAD, but (uncommonly) other use cases may use much more. -$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1; +# by |in| and |len|. |len| must be exactly 16. +$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2_1", 1; { # Function arguments my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 
3 ]; @@ -470,49 +466,8 @@ sub _ghash_4x { vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM vbroadcasti128 .Lgfpoly(%rip), $GFPOLY - # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128. - cmp \$32, $AADLEN - jb .Lghash_lastblock - - cmp \$127, $AADLEN - jbe .Lghash_loop_1x - - # Update GHASH with 128 bytes of AAD at a time. - vmovdqu $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED - vmovdqu $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED -.Lghash_loop_4x: - @{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED, - $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC, - $GHASH_ACC_XMM ]} - sub \$-128, $AAD # 128 is 4 bytes, -128 is 1 byte - add \$-128, $AADLEN - cmp \$127, $AADLEN - ja .Lghash_loop_4x - - # Update GHASH with 32 bytes of AAD at a time. - cmp \$32, $AADLEN - jb .Lghash_loop_1x_done -.Lghash_loop_1x: - vmovdqu ($AAD), $TMP0 - vpshufb $BSWAP_MASK, $TMP0, $TMP0 - vpxor $TMP0, $GHASH_ACC, $GHASH_ACC - vmovdqu $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0 - @{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]} - vextracti128 \$1, $GHASH_ACC, $TMP0_XMM - vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM - add \$32, $AAD - sub \$32, $AADLEN - cmp \$32, $AADLEN - jae .Lghash_loop_1x -.Lghash_loop_1x_done: - # Issue the vzeroupper that is needed after using ymm registers. Do it here - # instead of at the end, to minimize overhead for small AADLEN. - vzeroupper - # Update GHASH with the remaining 16-byte block if any. .Lghash_lastblock: - test $AADLEN, $AADLEN - jz .Lghash_done vmovdqu ($AAD), $TMP0_XMM vpshufb $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM vpxor $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM diff --git a/src/aead/aes.rs b/src/aead/aes.rs index eff2cd885a..8de8451395 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -151,7 +151,7 @@ impl Counter { iv } - fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) { + pub(super) fn increment_by_less_safe(&mut self, increment_by: NonZeroU32) { let [.., c0, c1, c2, c3] = &mut self.0; let old_value: u32 = u32::from_be_bytes([*c0, *c1, *c2, *c3]); let new_value = old_value + increment_by.get(); diff --git a/src/aead/aes_gcm.rs b/src/aead/aes_gcm.rs index 39f10854f2..d9e08a3116 100644 --- a/src/aead/aes_gcm.rs +++ b/src/aead/aes_gcm.rs @@ -35,6 +35,7 @@ use cpu::GetFeature as _; mod aarch64; mod aeshwclmulmovbe; +mod vaesclmulavx2; #[derive(Clone)] pub(super) struct Key(DynKey); @@ -50,6 +51,9 @@ impl Key { #[derive(Clone)] enum DynKey { + #[cfg(target_arch = "x86_64")] + VAesClMulAvx2(Combo), + #[cfg(target_arch = "x86_64")] AesHwClMulAvxMovbe(Combo), @@ -75,11 +79,16 @@ enum DynKey { impl DynKey { fn new(key: aes::KeyBytes, cpu: cpu::Features) -> Result { let cpu = cpu.values(); + #[cfg(target_arch = "x86_64")] if let Some((aes, gcm)) = cpu.get_feature() { + // 14.3.1 Detection of VEX-Encoded AES and VPCLMULQDQ let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?; let gcm_key_value = derive_gcm_key_value(&aes_key); let combo = if let Some(cpu) = cpu.get_feature() { + let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu); + Self::VAesClMulAvx2(Combo { aes_key, gcm_key }) + } else if let Some(cpu) = cpu.get_feature() { let gcm_key = gcm::clmulavxmovbe::Key::new(gcm_key_value, cpu); Self::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) } else { @@ -181,6 +190,16 @@ pub(super) fn seal( seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole) } + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => seal_whole_partial( + c, + aad, + 
in_out, + ctr, + tag_iv, + vaesclmulavx2::seal_whole_vaes_clmul_avx2, + ), + #[cfg(target_arch = "x86_64")] DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { aeshwclmulmovbe::seal(aes_key, gcm_key, ctr, tag_iv, aad, in_out) @@ -201,7 +220,10 @@ pub(super) fn seal( } } -#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] fn seal_whole_partial( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, @@ -295,6 +317,17 @@ pub(super) fn open( open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole) } + #[cfg(target_arch = "x86_64")] + DynKey::VAesClMulAvx2(c) => open_whole_partial( + c, + aad, + in_out_slice, + src, + ctr, + tag_iv, + vaesclmulavx2::open_whole_vaes_clmul_avx2, + ), + #[cfg(target_arch = "x86_64")] DynKey::AesHwClMulAvxMovbe(Combo { aes_key, gcm_key }) => { aeshwclmulmovbe::open(aes_key, gcm_key, ctr, tag_iv, aad, in_out_slice, src) @@ -315,7 +348,10 @@ pub(super) fn open( } } -#[cfg(all(target_arch = "aarch64", target_endian = "little"))] +#[cfg(any( + all(target_arch = "aarch64", target_endian = "little"), + target_arch = "x86_64" +))] fn open_whole_partial( Combo { aes_key, gcm_key }: &Combo, aad: Aad<&[u8]>, diff --git a/src/aead/aes_gcm/aeshwclmulmovbe.rs b/src/aead/aes_gcm/aeshwclmulmovbe.rs index 35236474ff..e6d49ee3bb 100644 --- a/src/aead/aes_gcm/aeshwclmulmovbe.rs +++ b/src/aead/aes_gcm/aeshwclmulmovbe.rs @@ -26,6 +26,7 @@ use crate::{ }; use core::ops::RangeFrom; +#[inline(never)] pub(super) fn seal( aes_key: &aes::hw::Key, gcm_key: &gcm::clmulavxmovbe::Key, @@ -79,6 +80,7 @@ pub(super) fn seal( super::seal_finish(aes_key, auth, remainder, ctr, tag_iv) } +#[inline(never)] pub(super) fn open( aes_key: &aes::hw::Key, gcm_key: &gcm::clmulavxmovbe::Key, diff --git a/src/aead/aes_gcm/vaesclmulavx2.rs b/src/aead/aes_gcm/vaesclmulavx2.rs new file mode 100644 index 0000000000..8a2a68f238 --- /dev/null +++ b/src/aead/aes_gcm/vaesclmulavx2.rs @@ -0,0 +1,86 @@ +// Copyright 2015-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#![cfg(target_arch = "x86_64")] + +use super::{aes, gcm, Counter, BLOCK_LEN}; +use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut}; +use core::num::NonZeroU32; + +pub(super) fn seal_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + ctr: &mut Counter, + mut in_out: AsChunksMut, +) { + prefixed_extern! { + fn aes_gcm_enc_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + let in_out = in_out.as_flattened_mut(); + + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. 
+ let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + let input = in_out.as_ptr(); + let output = in_out.as_mut_ptr(); + let len = in_out.len(); + unsafe { aes_gcm_enc_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) }; + ctr.increment_by_less_safe(blocks); + } +} + +pub(super) fn open_whole_vaes_clmul_avx2( + aes_key: &aes::hw::Key, + auth: &mut gcm::Context, + in_out: Overlapping, + ctr: &mut Counter, +) { + prefixed_extern! { + fn aes_gcm_dec_update_vaes_avx2( + input: *const u8, + output: *mut u8, + len: c::size_t, + key: &aes::AES_KEY, + ivec: &mut Counter, + Htable: &gcm::HTable, + Xi: &mut gcm::Xi); + } + + // Precondition. TODO: Create an overlapping::AsChunks for this. + assert_eq!(in_out.len() % BLOCK_LEN, 0); + // Precondition: Since we have a `gcm::Context` then the number of blocks + // must fit in `u32`. + let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap(); + + if let Some(blocks) = NonZeroU32::new(blocks) { + let aes_key = aes_key.inner_less_safe(); + let (htable, xi) = auth.inner(); + in_out.with_input_output_len(|input, output, len| unsafe { + aes_gcm_dec_update_vaes_avx2(input, output, len, aes_key, ctr, htable, xi) + }); + ctr.increment_by_less_safe(blocks); + } +} diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs index 9ac1cc1bd8..443c19e16b 100644 --- a/src/aead/gcm.rs +++ b/src/aead/gcm.rs @@ -38,6 +38,7 @@ pub(super) mod clmul; pub(super) mod clmulavxmovbe; pub(super) mod fallback; pub(super) mod neon; +pub(super) mod vclmulavx2; pub(super) struct Context<'key, K> { Xi: Xi, @@ -118,6 +119,15 @@ impl Context<'_, clmulavxmovbe::Key> { } } +#[cfg(target_arch = "x86_64")] +impl Context<'_, vclmulavx2::Key> { + /// Access to `inner` for the integrated AES-GCM implementations only. + #[inline] + pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) { + (self.key.inner(), &mut self.Xi) + } +} + impl Context<'_, K> { #[inline(always)] pub fn update_blocks(&mut self, input: AsChunks) { diff --git a/src/aead/gcm/clmul.rs b/src/aead/gcm/clmul.rs index d7f84aa6ef..8cd55a4eeb 100644 --- a/src/aead/gcm/clmul.rs +++ b/src/aead/gcm/clmul.rs @@ -36,6 +36,7 @@ pub struct Key { } impl Key { + #[cfg_attr(target_arch = "x86_64", inline(never))] pub(in super::super) fn new(value: KeyValue, _cpu: RequiredCpuFeatures) -> Self { Self { h_table: unsafe { htable_new!(gcm_init_clmul, value) }, diff --git a/src/aead/gcm/vclmulavx2.rs b/src/aead/gcm/vclmulavx2.rs new file mode 100644 index 0000000000..916dd1eb32 --- /dev/null +++ b/src/aead/gcm/vclmulavx2.rs @@ -0,0 +1,46 @@ +// Copyright 2018-2025 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +#![cfg(target_arch = "x86_64")] + +use super::{ffi::KeyValue, HTable, UpdateBlock, Xi}; +use crate::{ + aead::gcm::ffi::BLOCK_LEN, + cpu::intel::{Avx2, VAesClmul}, + polyfill::slice::AsChunks, +}; + +#[derive(Clone)] +pub struct Key { + h_table: HTable, +} + +impl Key { + pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self { + Self { + h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) }, + } + } + + pub(super) fn inner(&self) -> &HTable { + &self.h_table + } +} + +impl UpdateBlock for Key { + fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) { + let input: AsChunks = (&a).into(); + unsafe { ghash!(gcm_ghash_vpclmulqdq_avx2_1, xi, &self.h_table, input) } + } +} diff --git a/src/cpu/intel.rs b/src/cpu/intel.rs index ec117c207a..f45052fe7f 100644 --- a/src/cpu/intel.rs +++ b/src/cpu/intel.rs @@ -137,7 +137,7 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { // Intel: "Structured Extended Feature Flags Enumeration Leaf" #[cfg(target_arch = "x86_64")] - let extended_features_ebx = cpuid[2]; + let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]); let mut caps = 0; @@ -218,6 +218,20 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { set(&mut caps, Shift::Avx); } + #[cfg(target_arch = "x86_64")] + if avx_available { + // The Intel docs don't seem to document the detection. The instruction + // definitions of the VEX.256 instructions reference the + // VAES/VPCLMULQDQ features and the documentation for the extended + // features gives the values. We combine these into one feature because + // we never use them independently. + let vaes_available = check(extended_features_ecx, 9); + let vclmul_available = check(extended_features_ecx, 10); + if vaes_available && vclmul_available { + set(&mut caps, Shift::VAesClmul); + } + } + // "14.7.1 Detection of Intel AVX2 Hardware support" // XXX: We don't condition AVX2 on AVX. TODO: Address this. // `OPENSSL_cpuid_setup` clears this bit when it detects the OS doesn't @@ -318,6 +332,7 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 { impl_get_feature! { features: [ + { ("x86_64") => VAesClmul }, { ("x86", "x86_64") => ClMul }, { ("x86", "x86_64") => Ssse3 }, { ("x86_64") => Sse41 },
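
Note (reviewer aside, not part of the patch): the `VAesClmul` capability added in src/cpu/intel.rs is derived from CPUID leaf 7, sub-leaf 0 ("Structured Extended Feature Flags Enumeration"), ECX bit 9 (VAES) and bit 10 (VPCLMULQDQ), and is only set when AVX is usable. Below is a rough standalone sketch of that check, using `core::arch::x86_64::__cpuid_count` directly instead of ring's `cpuid_to_caps_and_set_c_flags` plumbing; the function name is illustrative, and the max-leaf and OSXSAVE/XGETBV checks that the real code depends on are omitted.

    #[cfg(target_arch = "x86_64")]
    fn vaes_vpclmulqdq_supported() -> bool {
        use core::arch::x86_64::__cpuid_count;
        // CPUID.(EAX=07H, ECX=0):ECX is the "Structured Extended Feature
        // Flags" sub-leaf referenced in the cpu/intel.rs hunk above.
        let leaf7 = unsafe { __cpuid_count(7, 0) };
        let vaes = (leaf7.ecx >> 9) & 1 != 0; // VAES
        let vpclmulqdq = (leaf7.ecx >> 10) & 1 != 0; // VPCLMULQDQ
        // The patch sets Shift::VAesClmul only when both bits are present
        // (and AVX is available), since the two are never used
        // independently; mirror that by requiring both here. A complete
        // check would also confirm the maximum CPUID leaf is >= 7 and that
        // the OS enables AVX state (OSXSAVE/XGETBV), as ring does elsewhere.
        vaes && vpclmulqdq
    }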
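
A second aside on the counter handling in src/aead/aes_gcm/vaesclmulavx2.rs: the assembly routines advance the IV internally, so after processing `blocks` whole blocks the Rust wrappers advance their own `Counter` by the same amount, which is why `Counter::increment_by_less_safe` becomes `pub(super)`. The sketch below shows the CTR32 convention involved, i.e. a big-endian increment of the final 32-bit word of the counter block; the name `increment_be32` is made up for illustration, and the `wrapping_add` is an assumption — ring's `increment_by_less_safe` uses a plain add, so overflow there is not silently wrapped.

    fn increment_be32(counter_block: &mut [u8; 16], blocks: u32) {
        use core::convert::TryInto;
        // AES-GCM CTR mode only increments the last 32-bit word, big-endian,
        // matching the `[.., c0, c1, c2, c3]` destructuring in aes.rs above.
        let old = u32::from_be_bytes(counter_block[12..16].try_into().unwrap());
        let new = old.wrapping_add(blocks);
        counter_block[12..16].copy_from_slice(&new.to_be_bytes());
    }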