Skip to content

Commit e67e56a

Browse files
committed
Reorder arguments to vaeseq_u8 to improve AArch64 performance
The register arguments to the AESE instruction are commutative, however the first input register is constrained in that it also names the output register. The constraint on register allocation causes recent LLVM versions to emit a lot of MOV instructions, significantly impacting performance. Swapping the register operands allows the compiler to emit significantly fewer MOV instructions. This change improves performance on Arm infrastructure micro-architectures by 14-36% depending on the micro-architecture, except for the AEGIS-256x4 sub-test which regresses slightly.
1 parent 5726c4a commit e67e56a

File tree

6 files changed

+14
-14
lines changed

6 files changed

+14
-14
lines changed

src/aegis128l/aegis128l_neon_aes.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ typedef uint8x16_t aes_block_t;
3232
# define AES_BLOCK_LOAD(A) vld1q_u8(A)
3333
# define AES_BLOCK_LOAD_64x2(A, B) vreinterpretq_u8_u64(vsetq_lane_u64((A), vmovq_n_u64(B), 1))
3434
# define AES_BLOCK_STORE(A, B) vst1q_u8((A), (B))
35-
# define AES_ENC(A, B) veorq_u8(vaesmcq_u8(vaeseq_u8((A), vmovq_n_u8(0))), (B))
35+
# define AES_ENC(A, B) veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), (A))), (B))
3636

3737
static inline void
3838
aegis128l_update(aes_block_t *const state, const aes_block_t d1, const aes_block_t d2)

src/aegis128x2/aegis128x2_neon_aes.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ AES_BLOCK_STORE(uint8_t *a, const aes_block_t b)
6464
static inline aes_block_t
6565
AES_ENC(const aes_block_t a, const aes_block_t b)
6666
{
67-
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8((a.b0), vmovq_n_u8(0))), (b.b0)),
68-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b1), vmovq_n_u8(0))), (b.b1)) };
67+
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b0)), b.b0),
68+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b1)), b.b1) };
6969
}
7070

7171
static inline void

src/aegis128x4/aegis128x4_neon_aes.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,10 @@ AES_BLOCK_STORE(uint8_t *a, const aes_block_t b)
7070
static inline aes_block_t
7171
AES_ENC(const aes_block_t a, const aes_block_t b)
7272
{
73-
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8((a.b0), vmovq_n_u8(0))), (b.b0)),
74-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b1), vmovq_n_u8(0))), (b.b1)),
75-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b2), vmovq_n_u8(0))), (b.b2)),
76-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b3), vmovq_n_u8(0))), (b.b3)) };
73+
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b0)), b.b0),
74+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b1)), b.b1),
75+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b2)), b.b2),
76+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b3)), b.b3) };
7777
}
7878

7979
static inline void

src/aegis256/aegis256_neon_aes.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ typedef uint8x16_t aes_block_t;
3232
# define AES_BLOCK_LOAD(A) vld1q_u8(A)
3333
# define AES_BLOCK_LOAD_64x2(A, B) vreinterpretq_u8_u64(vsetq_lane_u64((A), vmovq_n_u64(B), 1))
3434
# define AES_BLOCK_STORE(A, B) vst1q_u8((A), (B))
35-
# define AES_ENC(A, B) veorq_u8(vaesmcq_u8(vaeseq_u8((A), vmovq_n_u8(0))), (B))
35+
# define AES_ENC(A, B) veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), (A))), (B))
3636

3737
static inline void
3838
aegis256_update(aes_block_t *const state, const aes_block_t d)

src/aegis256x2/aegis256x2_neon_aes.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ AES_BLOCK_STORE(uint8_t *a, const aes_block_t b)
6464
static inline aes_block_t
6565
AES_ENC(const aes_block_t a, const aes_block_t b)
6666
{
67-
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8((a.b0), vmovq_n_u8(0))), (b.b0)),
68-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b1), vmovq_n_u8(0))), (b.b1)) };
67+
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b0)), b.b0),
68+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b1)), b.b1) };
6969
}
7070

7171
static inline void

src/aegis256x4/aegis256x4_neon_aes.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,10 @@ AES_BLOCK_STORE(uint8_t *a, const aes_block_t b)
7070
static inline aes_block_t
7171
AES_ENC(const aes_block_t a, const aes_block_t b)
7272
{
73-
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8((a.b0), vmovq_n_u8(0))), (b.b0)),
74-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b1), vmovq_n_u8(0))), (b.b1)),
75-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b2), vmovq_n_u8(0))), (b.b2)),
76-
veorq_u8(vaesmcq_u8(vaeseq_u8((a.b3), vmovq_n_u8(0))), (b.b3)) };
73+
return (aes_block_t) { veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b0)), b.b0),
74+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b1)), b.b1),
75+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b2)), b.b2),
76+
veorq_u8(vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b3)), b.b3) };
7777
}
7878

7979
static inline void

0 commit comments

Comments
 (0)