Skip to content

Commit c087740

Browse files
georges-armjedisct1
authored andcommitted
Add AArch64 FEAT_SHA3 implementation of AEGIS128L
Add an implementation of AEGIS128L using the Neon FEAT_SHA3 extension. This allows use of the EOR3 and BCAX instructions. Running the benchmarks on Arm Neoverse micro-architectures with LLVM 20, this gives the following changes (higher is better, 100% = no change): Neoverse V1 Neoverse V2 Neoverse N2 AEGIS-128L 112.62% 114.91% 116.28% AEGIS-128L MAC 81.10% 80.32% 93.89% The EOR3 instruction as present in the above cores has worse throughput than the standard EOR instruction, with only one EOR3 instruction being issued per cycle. This is compared to four EOR instructions per cycle on Neoverse V1 and Neoverse V2, and two per cycle on Neoverse N2. This difference in performance observation can therefore be explained as such: * For AEGIS-128L, the presence of a small number of EOR3 instructions reduces the overall vector instruction count and therefore improves performance. The reduced throughput of EOR3 instructions is not an issue here since the limiting factor is the number of other instructions executing on the remaining vector units. * For AEGIS-128L MAC, there are more EOR3 instructions automatically emitted by the compiler combining EOR intrinsics, leading to the limited throughput of these instructions becoming more of an issue and an overall slowdown in performance. The imbalance between EOR and EOR3 instruction throughputs is resolved from Neoverse V3 and Neoverse N3, so it is expected that the MAC performance inversion is limited to this current generation of micro-architectures. For comparison, when running on client cores which do not have the EOR3 throughput limitation, we instead observe the following improvements: Cortex-A520 Cortex-A720 Cortex-X4 AEGIS-128L 121.57% 117.18% 116.32% AEGIS-128L MAC 118.90% 112.52% 112.01%
1 parent 2859a32 commit c087740

File tree

4 files changed

+98
-1
lines changed

4 files changed

+98
-1
lines changed

build.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ pub fn build(b: *std.Build) void {
3232
"src/aegis128l/aegis128l_aesni.c",
3333
"src/aegis128l/aegis128l_altivec.c",
3434
"src/aegis128l/aegis128l_neon_aes.c",
35+
"src/aegis128l/aegis128l_neon_sha3.c",
3536
"src/aegis128l/aegis128l_soft.c",
3637
"src/aegis128l/aegis128l.c",
3738

@@ -47,7 +48,6 @@ pub fn build(b: *std.Build) void {
4748
"src/aegis128x4/aegis128x4_avx2.c",
4849
"src/aegis128x4/aegis128x4_avx512.c",
4950
"src/aegis128x4/aegis128x4_neon_aes.c",
50-
"src/aegis128x4/aegis128x4_neon_aes.c",
5151
"src/aegis128x4/aegis128x4_soft.c",
5252
"src/aegis128x4/aegis128x4.c",
5353

src/aegis128l/aegis128l.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "aegis128l_aesni.h"
88
#include "aegis128l_altivec.h"
99
#include "aegis128l_neon_aes.h"
10+
#include "aegis128l_neon_sha3.h"
1011

1112
#ifndef HAS_HW_AES
1213
# include "aegis128l_soft.h"
@@ -232,6 +233,10 @@ aegis128l_pick_best_implementation(void)
232233
#endif
233234

234235
#if defined(__aarch64__) || defined(_M_ARM64)
236+
if (aegis_runtime_has_neon_sha3()) {
237+
implementation = &aegis128l_neon_sha3_implementation;
238+
return 0;
239+
}
235240
if (aegis_runtime_has_neon_aes()) {
236241
implementation = &aegis128l_neon_aes_implementation;
237242
return 0;

src/aegis128l/aegis128l_neon_sha3.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#if defined(__aarch64__) || defined(_M_ARM64)
2+
3+
# include <stddef.h>
4+
# include <stdint.h>
5+
6+
# include "../common/common.h"
7+
# include "aegis128l.h"
8+
# include "aegis128l_neon_sha3.h"
9+
10+
# ifndef __ARM_FEATURE_CRYPTO
11+
# define __ARM_FEATURE_CRYPTO 1
12+
# endif
13+
# ifndef __ARM_FEATURE_AES
14+
# define __ARM_FEATURE_AES 1
15+
# endif
16+
# ifndef __ARM_FEATURE_SHA3
17+
# define __ARM_FEATURE_SHA3 1
18+
# endif
19+
20+
# include <arm_neon.h>
21+
22+
# ifdef __clang__
23+
# pragma clang attribute push(__attribute__((target("neon,crypto,aes,sha3"))), \
24+
apply_to = function)
25+
# elif defined(__GNUC__)
26+
# pragma GCC target("+simd+crypto+sha3")
27+
# endif
28+
29+
# define AES_BLOCK_LENGTH 16
30+
31+
typedef uint8x16_t aes_block_t;
32+
33+
# define AES_BLOCK_XOR(A, B) veorq_u8((A), (B))
34+
# define AES_BLOCK_XOR3(A, B, C) veor3q_u8((A), (B), (C))
35+
# define AES_BLOCK_AND(A, B) vandq_u8((A), (B))
36+
# define AES_BLOCK_LOAD(A) vld1q_u8(A)
37+
# define AES_BLOCK_LOAD_64x2(A, B) vreinterpretq_u8_u64(vsetq_lane_u64((A), vmovq_n_u64(B), 1))
38+
# define AES_BLOCK_STORE(A, B) vst1q_u8((A), (B))
39+
# define AES_ENC0(A) vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), (A)))
40+
# define AES_ENC(A, B) AES_BLOCK_XOR(AES_ENC0(A), (B))
41+
42+
static inline void
43+
aegis128l_update(aes_block_t *const state, const aes_block_t d1, const aes_block_t d2)
44+
{
45+
aes_block_t tmp;
46+
47+
tmp = state[7];
48+
state[7] = AES_ENC(state[6], state[7]);
49+
state[6] = AES_ENC(state[5], state[6]);
50+
state[5] = AES_ENC(state[4], state[5]);
51+
state[4] = AES_BLOCK_XOR3(state[4], AES_ENC0(state[3]), d2);
52+
state[3] = AES_ENC(state[2], state[3]);
53+
state[2] = AES_ENC(state[1], state[2]);
54+
state[1] = AES_ENC(state[0], state[1]);
55+
state[0] = AES_BLOCK_XOR3(state[0], AES_ENC0(tmp), d1);
56+
}
57+
58+
# include "aegis128l_common.h"
59+
60+
struct aegis128l_implementation aegis128l_neon_sha3_implementation = {
61+
.encrypt_detached = encrypt_detached,
62+
.decrypt_detached = decrypt_detached,
63+
.encrypt_unauthenticated = encrypt_unauthenticated,
64+
.decrypt_unauthenticated = decrypt_unauthenticated,
65+
.stream = stream,
66+
.state_init = state_init,
67+
.state_encrypt_update = state_encrypt_update,
68+
.state_encrypt_detached_final = state_encrypt_detached_final,
69+
.state_encrypt_final = state_encrypt_final,
70+
.state_decrypt_detached_update = state_decrypt_detached_update,
71+
.state_decrypt_detached_final = state_decrypt_detached_final,
72+
.state_mac_init = state_mac_init,
73+
.state_mac_update = state_mac_update,
74+
.state_mac_final = state_mac_final,
75+
.state_mac_reset = state_mac_reset,
76+
.state_mac_clone = state_mac_clone,
77+
};
78+
79+
# ifdef __clang__
80+
# pragma clang attribute pop
81+
# endif
82+
83+
#endif

src/aegis128l/aegis128l_neon_sha3.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#ifndef aegis128l_neon_sha3_H
2+
#define aegis128l_neon_sha3_H
3+
4+
#include "../common/common.h"
5+
#include "implementations.h"
6+
7+
extern struct aegis128l_implementation aegis128l_neon_sha3_implementation;
8+
9+
#endif

0 commit comments

Comments
 (0)