From 2f4c339d5244d77b85e6d73a9fa8aecad83b4311 Mon Sep 17 00:00:00 2001
From: Rod Chapman
Date: Thu, 13 Feb 2025 14:00:45 +0000
Subject: [PATCH] Faster INTT on AArch64 - removes 1 redundant reduction step

Signed-off-by: Rod Chapman
---
NOTE(review): this patch was reconstructed from a copy whose newlines and
leading whitespace had been collapsed. The line structure matches the hunk
header (21 old / 25 new lines, 13 insertions, 9 deletions), but the 8-space
indentation of the hunk lines is an assumption - confirm against intt.S with
"git apply --check" (or apply with --ignore-whitespace).
NOTE(review): the From: and Signed-off-by: lines carry no e-mail address;
restore them before running "git am".
NOTE(review): the hunk removes the "// Layer 4" label while "// Layer 5" is
kept; consider retaining it so both layers stay labelled.

 dev/aarch64_clean/src/intt.S | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/dev/aarch64_clean/src/intt.S b/dev/aarch64_clean/src/intt.S
index 4bbc9e9e2..38b6bf1ae 100644
--- a/dev/aarch64_clean/src/intt.S
+++ b/dev/aarch64_clean/src/intt.S
@@ -267,21 +267,25 @@ layer4567_start:
         // Layer 5
         gs_butterfly data0, data1, root0, 2, 3
         gs_butterfly data2, data3, root0, 4, 5
-        // Max bound: 8q
+        // data0, data2: < 8q
+        // data1, data3: < q
 
-        // Not all of those reductions are needed, but the bounds tracking
-        // is easier if we uniformly reduce at this point.
+        // data0 and data2 have reached a bound of 8q now, so
+        // reduction of them is required.
         barrett_reduce data0
         barrett_reduce data2
-        barrett_reduce data1
-        barrett_reduce data3
-
-        // Bounds: q/2
+        // data0, data2: < q/2
+        // data1, data3: < q
 
-        // Layer 4
         gs_butterfly data0, data2, root0, 0, 1
         gs_butterfly data1, data3, root0, 0, 1
-        // Bounds: < q
+        // data0, data2, data3: < q
+        // data1: < 2q
+
+        barrett_reduce data1
+        // data1: < q/2 < q
+        // data0, data2, data3: < q
+        // Therefore, all < q
 
         str q_data0, [inp], #(64)
         str q_data1, [inp, #(-64 + 16*1)]