Skip to content

Commit

Permalink
Merge pull request #796 from pq-code-package/remove_opt_clean
Browse files Browse the repository at this point in the history
Remove opt clean suffixes from AArch64 backend
  • Loading branch information
hanno-becker authored Feb 24, 2025
2 parents 227195e + d50368d commit e319849
Show file tree
Hide file tree
Showing 42 changed files with 303 additions and 290 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ undefined behaviour in C, including out of bounds memory accesses and integer ov
all C code in [mlkem/*](mlkem) and [mlkem/fips202/*](mlkem/fips202) involved in running mlkem-native with its C backend.
See [proofs/cbmc](proofs/cbmc) for details.

HOL-Light functional correctness proofs for the optimized AArch64 NTT [ntt_opt.S](mlkem/native/aarch64/src/ntt_opt.S) and inverse NTT [intt_opt.S](mlkem/native/aarch64/src/intt_opt.S)
HOL-Light functional correctness proofs for the optimized AArch64 NTT [ntt.S](dev/aarch64_opt/src/ntt.S) and inverse NTT [intt.S](dev/aarch64_opt/src/intt.S)
can be found in [proofs/hol_light/arm](proofs/hol_light/arm). These proofs were contributed by John Harrison and
use the verification infrastructure provided by [s2n-bignum](https://github.com/awslabs/s2n-bignum).

Expand All @@ -80,8 +80,8 @@ offers three backends for C, AArch64 and x86_64 - if you'd like to contribute new b
PR.

Our AArch64 assembly is developed using [SLOTHY](https://github.com/slothy-optimizer/slothy): We write
'clean' assembly by hand and automate micro-optimizations (e.g. see the [clean](dev/aarch64_clean/src/ntt_clean.S)
vs [optimized](mlkem/native/aarch64/src/ntt_opt.S) AArch64 NTT). See [dev/README.md](dev/README.md) for more details.
'clean' assembly by hand and automate micro-optimizations (e.g. see the [clean](dev/aarch64_clean/src/ntt.S)
vs [optimized](dev/aarch64_opt/src/ntt.S) AArch64 NTT). See [dev/README.md](dev/README.md) for more details.

## How should I use mlkem-native?

Expand Down
63 changes: 34 additions & 29 deletions dev/aarch64_clean/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,44 +29,49 @@ extern const int16_t mlk_aarch64_zetas_mulcache_native[];
extern const int16_t mlk_aarch64_zetas_mulcache_twisted_native[];
extern const uint8_t mlk_rej_uniform_table[];

#define mlk_ntt_asm_clean MLK_NAMESPACE(ntt_asm_clean)
void mlk_ntt_asm_clean(int16_t *, const int16_t *, const int16_t *);
#define mlk_ntt_asm MLK_NAMESPACE(ntt_asm)
void mlk_ntt_asm(int16_t *, const int16_t *, const int16_t *);

#define mlk_intt_asm_clean MLK_NAMESPACE(intt_asm_clean)
void mlk_intt_asm_clean(int16_t *, const int16_t *, const int16_t *);
#define mlk_intt_asm MLK_NAMESPACE(intt_asm)
void mlk_intt_asm(int16_t *, const int16_t *, const int16_t *);

#define mlk_rej_uniform_asm_clean MLK_NAMESPACE(rej_uniform_asm_clean)
unsigned mlk_rej_uniform_asm_clean(int16_t *r, const uint8_t *buf,
unsigned buflen, const uint8_t *table);
#define mlk_rej_uniform_asm MLK_NAMESPACE(rej_uniform_asm)
unsigned mlk_rej_uniform_asm(int16_t *r, const uint8_t *buf, unsigned buflen,
const uint8_t *table);

#define mlk_poly_reduce_asm_clean MLK_NAMESPACE(poly_reduce_asm_clean)
void mlk_poly_reduce_asm_clean(int16_t *);
#define mlk_poly_reduce_asm MLK_NAMESPACE(poly_reduce_asm)
void mlk_poly_reduce_asm(int16_t *);

#define mlk_poly_tomont_asm_clean MLK_NAMESPACE(poly_tomont_asm_clean)
void mlk_poly_tomont_asm_clean(int16_t *);
#define mlk_poly_tomont_asm MLK_NAMESPACE(poly_tomont_asm)
void mlk_poly_tomont_asm(int16_t *);

#define mlk_poly_mulcache_compute_asm_clean \
MLK_NAMESPACE(poly_mulcache_compute_asm_clean)
void mlk_poly_mulcache_compute_asm_clean(int16_t *, const int16_t *,
const int16_t *, const int16_t *);
#define mlk_poly_mulcache_compute_asm MLK_NAMESPACE(poly_mulcache_compute_asm)
void mlk_poly_mulcache_compute_asm(int16_t *, const int16_t *, const int16_t *,
const int16_t *);


#define mlk_poly_tobytes_asm_clean MLK_NAMESPACE(poly_tobytes_asm_clean)
void mlk_poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
#define mlk_poly_tobytes_asm MLK_NAMESPACE(poly_tobytes_asm)
void mlk_poly_tobytes_asm(uint8_t *r, const int16_t *a);

#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2_clean \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_clean)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2_clean(
int16_t *r, const int16_t *a, const int16_t *b, const int16_t *b_cache);
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t *r,
const int16_t *a,
const int16_t *b,
const int16_t *b_cache);

#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3_clean \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_clean)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3_clean(
int16_t *r, const int16_t *a, const int16_t *b, const int16_t *b_cache);
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t *r,
const int16_t *a,
const int16_t *b,
const int16_t *b_cache);

#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4_clean \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_clean)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4_clean(
int16_t *r, const int16_t *a, const int16_t *b, const int16_t *b_cache);
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t *r,
const int16_t *a,
const int16_t *b,
const int16_t *b_cache);

#endif /* MLK_DEV_AARCH64_CLEAN_SRC_ARITH_NATIVE_AARCH64_H */
27 changes: 13 additions & 14 deletions dev/aarch64_clean/src/clean_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,59 +26,58 @@

static MLK_INLINE void mlk_ntt_native(int16_t data[MLKEM_N])
{
mlk_ntt_asm_clean(data, mlk_aarch64_ntt_zetas_layer12345,
mlk_aarch64_ntt_zetas_layer67);
mlk_ntt_asm(data, mlk_aarch64_ntt_zetas_layer12345,
mlk_aarch64_ntt_zetas_layer67);
}

static MLK_INLINE void mlk_intt_native(int16_t data[MLKEM_N])
{
mlk_intt_asm_clean(data, mlk_aarch64_invntt_zetas_layer12345,
mlk_aarch64_invntt_zetas_layer67);
mlk_intt_asm(data, mlk_aarch64_invntt_zetas_layer12345,
mlk_aarch64_invntt_zetas_layer67);
}

static MLK_INLINE void mlk_poly_reduce_native(int16_t data[MLKEM_N])
{
mlk_poly_reduce_asm_clean(data);
mlk_poly_reduce_asm(data);
}

static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N])
{
mlk_poly_tomont_asm_clean(data);
mlk_poly_tomont_asm(data);
}

static MLK_INLINE void mlk_poly_mulcache_compute_native(
int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N])
{
mlk_poly_mulcache_compute_asm_clean(
x, y, mlk_aarch64_zetas_mulcache_native,
mlk_aarch64_zetas_mulcache_twisted_native);
mlk_poly_mulcache_compute_asm(x, y, mlk_aarch64_zetas_mulcache_native,
mlk_aarch64_zetas_mulcache_twisted_native);
}

static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k2_clean(r, a, b, b_cache);
mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache);
}

static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k3_clean(r, a, b, b_cache);
mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache);
}

static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k4_clean(r, a, b, b_cache);
mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache);
}

static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
const int16_t a[MLKEM_N])
{
mlk_poly_tobytes_asm_clean(r, a);
mlk_poly_tobytes_asm(r, a);
}

static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
Expand All @@ -89,7 +88,7 @@ static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
{
return -1;
}
return (int)mlk_rej_uniform_asm_clean(r, buf, buflen, mlk_rej_uniform_table);
return (int)mlk_rej_uniform_asm(r, buf, buflen, mlk_rej_uniform_table);
}

#endif /* MLK_ARITH_PROFILE_IMPL_H */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,9 @@
ninv_tw .req v30

.text
.global MLK_ASM_NAMESPACE(intt_asm_clean)
.global MLK_ASM_NAMESPACE(intt_asm)
.balign 4
MLK_ASM_FN_SYMBOL(intt_asm_clean)
MLK_ASM_FN_SYMBOL(intt_asm)
push_stack

// Setup constants
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,9 @@
t3 .req v28

.text
.global MLK_ASM_NAMESPACE(ntt_asm_clean)
.global MLK_ASM_NAMESPACE(ntt_asm)
.balign 4
MLK_ASM_FN_SYMBOL(ntt_asm_clean)
MLK_ASM_FN_SYMBOL(ntt_asm)
push_stack

mov wtmp, #3329
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@
modulus .req v6
modulus_twisted .req v7

.global MLK_ASM_NAMESPACE(poly_mulcache_compute_asm_clean)
.global MLK_ASM_NAMESPACE(poly_mulcache_compute_asm)
.text
.balign 4
MLK_ASM_FN_SYMBOL(poly_mulcache_compute_asm_clean)
MLK_ASM_FN_SYMBOL(poly_mulcache_compute_asm)
mov wtmp, #3329
dup modulus.8h, wtmp

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@
modulus_twisted .req v4

.text
.global MLK_ASM_NAMESPACE(poly_reduce_asm_clean)
.global MLK_ASM_NAMESPACE(poly_reduce_asm)
.balign 4
MLK_ASM_FN_SYMBOL(poly_reduce_asm_clean)
MLK_ASM_FN_SYMBOL(poly_reduce_asm)

mov wtmp, #3329 // ML-KEM modulus
dup modulus.8h, wtmp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
count .req x2

.text
.global MLK_ASM_NAMESPACE(poly_tobytes_asm_clean)
.global MLK_ASM_NAMESPACE(poly_tobytes_asm)
.balign 4
MLK_ASM_FN_SYMBOL(poly_tobytes_asm_clean)
MLK_ASM_FN_SYMBOL(poly_tobytes_asm)

mov count, #16
poly_tobytes_asm_clean_asm_loop_start:
poly_tobytes_asm_asm_loop_start:
ld2 {data0.8h, data1.8h}, [src], #32

// r[3 * i + 0] = (t0 >> 0);
Expand All @@ -47,7 +47,7 @@ poly_tobytes_asm_clean_asm_loop_start:
st3 {out0.8b, out1.8b, out2.8b}, [dst], #24

subs count, count, #1
cbnz count, poly_tobytes_asm_clean_asm_loop_start
cbnz count, poly_tobytes_asm_asm_loop_start
ret

.unreq data0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
tmp0 .req v6

.text
.global MLK_ASM_NAMESPACE(poly_tomont_asm_clean)
.global MLK_ASM_NAMESPACE(poly_tomont_asm)
.balign 4
MLK_ASM_FN_SYMBOL(poly_tomont_asm_clean)
MLK_ASM_FN_SYMBOL(poly_tomont_asm)

mov wtmp, #3329 // ML-KEM modulus
dup modulus.8h, wtmp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@
t0 .req v28

.text
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_clean)
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
.balign 4
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k2_clean)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k2)
push_stack

mov wtmp, #3329
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@
t0 .req v28

.text
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_clean)
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
.balign 4
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k3_clean)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k3)
push_stack
mov wtmp, #3329
dup modulus.8h, wtmp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@
t0 .req v28

.text
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_clean)
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
.balign 4
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k4_clean)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k4)
push_stack
mov wtmp, #3329
dup modulus.8h, wtmp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*/

/*************************************************
* Name: mlk_rej_uniform_asm_clean
* Name: mlk_rej_uniform_asm
*
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
Expand Down Expand Up @@ -114,9 +114,9 @@
bits .req v31

.text
.global MLK_ASM_NAMESPACE(rej_uniform_asm_clean)
.global MLK_ASM_NAMESPACE(rej_uniform_asm)
.balign 4
MLK_ASM_FN_SYMBOL(rej_uniform_asm_clean)
MLK_ASM_FN_SYMBOL(rej_uniform_asm)
push_stack

// Load 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
Expand Down
Loading

18 comments on commit e319849

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 4th gen (c7i)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 9637 cycles 9643 cycles 1.00
ML-KEM-512 encaps 11239 cycles 11232 cycles 1.00
ML-KEM-512 decaps 15319 cycles 15322 cycles 1.00
ML-KEM-768 keypair 16362 cycles 16334 cycles 1.00
ML-KEM-768 encaps 17906 cycles 17866 cycles 1.00
ML-KEM-768 decaps 23682 cycles 23596 cycles 1.00
ML-KEM-1024 keypair 22209 cycles 22266 cycles 1.00
ML-KEM-1024 encaps 24058 cycles 24113 cycles 1.00
ML-KEM-1024 decaps 31886 cycles 31884 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 4th gen (c7i) (no-opt)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 29308 cycles 29328 cycles 1.00
ML-KEM-512 encaps 34168 cycles 34235 cycles 1.00
ML-KEM-512 decaps 44426 cycles 44486 cycles 1.00
ML-KEM-768 keypair 47920 cycles 47936 cycles 1.00
ML-KEM-768 encaps 56174 cycles 56218 cycles 1.00
ML-KEM-768 decaps 67933 cycles 67888 cycles 1.00
ML-KEM-1024 keypair 72077 cycles 72136 cycles 1.00
ML-KEM-1024 encaps 84744 cycles 84410 cycles 1.00
ML-KEM-1024 decaps 101586 cycles 101623 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A76 (Raspberry Pi 5) benchmarks

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 29536 cycles 29541 cycles 1.00
ML-KEM-512 encaps 35129 cycles 35129 cycles 1
ML-KEM-512 decaps 45749 cycles 45749 cycles 1
ML-KEM-768 keypair 50467 cycles 50466 cycles 1.00
ML-KEM-768 encaps 55835 cycles 55829 cycles 1.00
ML-KEM-768 decaps 70808 cycles 70807 cycles 1.00
ML-KEM-1024 keypair 73374 cycles 73372 cycles 1.00
ML-KEM-1024 encaps 82313 cycles 82311 cycles 1.00
ML-KEM-1024 decaps 102575 cycles 102576 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 3rd gen (c6a)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 17260 cycles 17265 cycles 1.00
ML-KEM-512 encaps 19046 cycles 19172 cycles 0.99
ML-KEM-512 decaps 24603 cycles 24524 cycles 1.00
ML-KEM-768 keypair 29384 cycles 29407 cycles 1.00
ML-KEM-768 encaps 30621 cycles 30649 cycles 1.00
ML-KEM-768 decaps 38570 cycles 38549 cycles 1.00
ML-KEM-1024 keypair 43674 cycles 44017 cycles 0.99
ML-KEM-1024 encaps 45122 cycles 45093 cycles 1.00
ML-KEM-1024 decaps 55599 cycles 55580 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 4th gen (c7a)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 11665 cycles 11631 cycles 1.00
ML-KEM-512 encaps 13345 cycles 13299 cycles 1.00
ML-KEM-512 decaps 18182 cycles 18181 cycles 1.00
ML-KEM-768 keypair 20143 cycles 20213 cycles 1.00
ML-KEM-768 encaps 21195 cycles 21364 cycles 0.99
ML-KEM-768 decaps 28417 cycles 28684 cycles 0.99
ML-KEM-1024 keypair 26982 cycles 26988 cycles 1.00
ML-KEM-1024 encaps 29040 cycles 29018 cycles 1.00
ML-KEM-1024 decaps 38576 cycles 38524 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 3rd gen (c6i)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 16167 cycles 16172 cycles 1.00
ML-KEM-512 encaps 18385 cycles 18401 cycles 1.00
ML-KEM-512 decaps 24949 cycles 25397 cycles 0.98
ML-KEM-768 keypair 27810 cycles 27828 cycles 1.00
ML-KEM-768 encaps 29525 cycles 29545 cycles 1.00
ML-KEM-768 decaps 38975 cycles 39043 cycles 1.00
ML-KEM-1024 keypair 37730 cycles 37758 cycles 1.00
ML-KEM-1024 encaps 40730 cycles 40781 cycles 1.00
ML-KEM-1024 decaps 53291 cycles 53299 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton2

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 29540 cycles 29570 cycles 1.00
ML-KEM-512 encaps 35130 cycles 35069 cycles 1.00
ML-KEM-512 decaps 45737 cycles 45785 cycles 1.00
ML-KEM-768 keypair 50473 cycles 50425 cycles 1.00
ML-KEM-768 encaps 55828 cycles 55943 cycles 1.00
ML-KEM-768 decaps 70820 cycles 70891 cycles 1.00
ML-KEM-1024 keypair 73382 cycles 73381 cycles 1.00
ML-KEM-1024 encaps 82322 cycles 82317 cycles 1.00
ML-KEM-1024 decaps 102594 cycles 102595 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 3rd gen (c6a) (no-opt)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 39913 cycles 39830 cycles 1.00
ML-KEM-512 encaps 48159 cycles 48166 cycles 1.00
ML-KEM-512 decaps 62421 cycles 62437 cycles 1.00
ML-KEM-768 keypair 64641 cycles 64628 cycles 1.00
ML-KEM-768 encaps 75801 cycles 75806 cycles 1.00
ML-KEM-768 decaps 94455 cycles 94595 cycles 1.00
ML-KEM-1024 keypair 96080 cycles 95960 cycles 1.00
ML-KEM-1024 encaps 109578 cycles 109467 cycles 1.00
ML-KEM-1024 decaps 133163 cycles 133196 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 4th gen (c7a) (no-opt)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 36405 cycles 36410 cycles 1.00
ML-KEM-512 encaps 42958 cycles 43064 cycles 1.00
ML-KEM-512 decaps 55876 cycles 55885 cycles 1.00
ML-KEM-768 keypair 59065 cycles 59093 cycles 1.00
ML-KEM-768 encaps 67811 cycles 67590 cycles 1.00
ML-KEM-768 decaps 84528 cycles 84529 cycles 1.00
ML-KEM-1024 keypair 87401 cycles 87404 cycles 1.00
ML-KEM-1024 encaps 98318 cycles 98384 cycles 1.00
ML-KEM-1024 decaps 119655 cycles 119684 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton4

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 18050 cycles 18047 cycles 1.00
ML-KEM-512 encaps 21417 cycles 21417 cycles 1
ML-KEM-512 decaps 28120 cycles 28128 cycles 1.00
ML-KEM-768 keypair 31073 cycles 31074 cycles 1.00
ML-KEM-768 encaps 34156 cycles 34152 cycles 1.00
ML-KEM-768 decaps 43785 cycles 43782 cycles 1.00
ML-KEM-1024 keypair 44906 cycles 44908 cycles 1.00
ML-KEM-1024 encaps 50336 cycles 50333 cycles 1.00
ML-KEM-1024 decaps 63271 cycles 63261 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 3rd gen (c6i) (no-opt)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 46809 cycles 46835 cycles 1.00
ML-KEM-512 encaps 55392 cycles 55428 cycles 1.00
ML-KEM-512 decaps 71218 cycles 71293 cycles 1.00
ML-KEM-768 keypair 76403 cycles 76290 cycles 1.00
ML-KEM-768 encaps 87519 cycles 87548 cycles 1.00
ML-KEM-768 decaps 108215 cycles 108356 cycles 1.00
ML-KEM-1024 keypair 112359 cycles 112395 cycles 1.00
ML-KEM-1024 encaps 126390 cycles 126596 cycles 1.00
ML-KEM-1024 decaps 152685 cycles 152911 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton3

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 19154 cycles 19153 cycles 1.00
ML-KEM-512 encaps 22936 cycles 22935 cycles 1.00
ML-KEM-512 decaps 30236 cycles 30232 cycles 1.00
ML-KEM-768 keypair 32830 cycles 32831 cycles 1.00
ML-KEM-768 encaps 36513 cycles 36515 cycles 1.00
ML-KEM-768 decaps 46950 cycles 46951 cycles 1.00
ML-KEM-1024 keypair 47388 cycles 47391 cycles 1.00
ML-KEM-1024 encaps 53362 cycles 53371 cycles 1.00
ML-KEM-1024 decaps 67325 cycles 67329 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton4 (no-opt)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 35743 cycles 35760 cycles 1.00
ML-KEM-512 encaps 40754 cycles 40753 cycles 1.00
ML-KEM-512 decaps 52087 cycles 52088 cycles 1.00
ML-KEM-768 keypair 63194 cycles 63196 cycles 1.00
ML-KEM-768 encaps 67449 cycles 67464 cycles 1.00
ML-KEM-768 decaps 81164 cycles 81174 cycles 1.00
ML-KEM-1024 keypair 88815 cycles 88823 cycles 1.00
ML-KEM-1024 encaps 98809 cycles 98816 cycles 1.00
ML-KEM-1024 decaps 117446 cycles 117455 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton2 (no-opt)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 59620 cycles 59698 cycles 1.00
ML-KEM-512 encaps 68259 cycles 68296 cycles 1.00
ML-KEM-512 decaps 87001 cycles 87036 cycles 1.00
ML-KEM-768 keypair 99327 cycles 99273 cycles 1.00
ML-KEM-768 encaps 110875 cycles 110841 cycles 1.00
ML-KEM-768 decaps 135152 cycles 135104 cycles 1.00
ML-KEM-1024 keypair 149104 cycles 149057 cycles 1.00
ML-KEM-1024 encaps 164582 cycles 164522 cycles 1.00
ML-KEM-1024 decaps 195928 cycles 195859 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton3 (no-opt)

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 39107 cycles 39076 cycles 1.00
ML-KEM-512 encaps 44879 cycles 44840 cycles 1.00
ML-KEM-512 decaps 56747 cycles 56719 cycles 1.00
ML-KEM-768 keypair 64455 cycles 64488 cycles 1.00
ML-KEM-768 encaps 72601 cycles 73156 cycles 0.99
ML-KEM-768 decaps 87857 cycles 87925 cycles 1.00
ML-KEM-1024 keypair 96102 cycles 96095 cycles 1.00
ML-KEM-1024 encaps 106150 cycles 106154 cycles 1.00
ML-KEM-1024 decaps 127055 cycles 127057 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A55 (Snapdragon 888) benchmarks

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 59582 cycles 59577 cycles 1.00
ML-KEM-512 encaps 67220 cycles 67242 cycles 1.00
ML-KEM-512 decaps 86526 cycles 86394 cycles 1.00
ML-KEM-768 keypair 101475 cycles 101354 cycles 1.00
ML-KEM-768 encaps 112559 cycles 112562 cycles 1.00
ML-KEM-768 decaps 139523 cycles 140024 cycles 1.00
ML-KEM-1024 keypair 153597 cycles 154845 cycles 0.99
ML-KEM-1024 encaps 171003 cycles 171704 cycles 1.00
ML-KEM-1024 decaps 208052 cycles 209229 cycles 0.99

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SpacemiT K1 8 (Banana Pi F3) benchmarks

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 225723 cycles 225737 cycles 1.00
ML-KEM-512 encaps 272677 cycles 272706 cycles 1.00
ML-KEM-512 decaps 347782 cycles 347799 cycles 1.00
ML-KEM-768 keypair 373216 cycles 373191 cycles 1.00
ML-KEM-768 encaps 435336 cycles 435199 cycles 1.00
ML-KEM-768 decaps 533660 cycles 533570 cycles 1.00
ML-KEM-1024 keypair 555093 cycles 555146 cycles 1.00
ML-KEM-1024 encaps 636049 cycles 636097 cycles 1.00
ML-KEM-1024 decaps 758785 cycles 758666 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A72 (Raspberry Pi 4) benchmarks

Benchmark suite Current: e319849 Previous: 227195e Ratio
ML-KEM-512 keypair 53095 cycles 53210 cycles 1.00
ML-KEM-512 encaps 61097 cycles 61317 cycles 1.00
ML-KEM-512 decaps 78648 cycles 77894 cycles 1.01
ML-KEM-768 keypair 90072 cycles 90629 cycles 0.99
ML-KEM-768 encaps 98254 cycles 98705 cycles 1.00
ML-KEM-768 decaps 122610 cycles 123586 cycles 0.99
ML-KEM-1024 keypair 134991 cycles 135421 cycles 1.00
ML-KEM-1024 encaps 148668 cycles 149544 cycles 0.99
ML-KEM-1024 decaps 181324 cycles 181411 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.