From f89331c06ee1c7b277f16cb983e676e0926148a3 Mon Sep 17 00:00:00 2001 From: Luke Parker Date: Mon, 13 Feb 2023 16:01:18 -0500 Subject: [PATCH 1/2] Optimize scalar multiplication with a 4-bit window This moves from 255 doubles and 255 additions to 259 doubles and 71 additions. If doubling is twice as fast as addition, which is roughly the case as far as I can tell, this shifts the function from executing in (255 + (255 * 2)) = 765 time to (259 + (71 * 2)) = 401 time, a 48% reduction in running time (roughly a 1.9x speedup). --- src/curves.rs | 114 +++++++++++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 48 deletions(-) diff --git a/src/curves.rs b/src/curves.rs index 41ccb9a..e15975b 100644 --- a/src/curves.rs +++ b/src/curves.rs @@ -466,32 +466,41 @@ macro_rules! new_curve_impl { type Output = $name; fn mul(self, other: &'b $scalar) -> Self::Output { - // TODO: make this faster - - let mut acc = $name::identity(); - - // This is a simple double-and-add implementation of point - // multiplication, moving from most significant to least - // significant bit of the scalar. - // - // We don't use `PrimeFieldBits::.to_le_bits` here, because that would - // force users of this crate to depend on `bitvec` where they otherwise - // might not need to. - // - // NOTE: We skip the leading bit because it's always unset (we are turning - // the 32-byte repr into 256 bits, and $scalar::NUM_BITS = 255). - for bit in other - .to_repr() - .iter() - .rev() - .flat_map(|byte| (0..8).rev().map(move |i| Choice::from((byte >> i) & 1u8))) - .skip(1) - { - acc = acc.double(); - acc = $name::conditional_select(&acc, &(acc + self), bit); + // Create a table out of the point + // TODO: This can be made more efficient with a 5-bit window + let mut arr = [$name::identity(); 16]; + arr[1] = *self; + for i in 2 .. 
16 { + arr[i] = if (i % 2) == 0 { + arr[i / 2].double() + } else { + arr[i - 1] + arr[1] + }; + } + + let mut res = $name::identity(); + let mut first = true; + // Iterate from most significant byte to least significant byte + for byte in other.to_repr().iter().rev() { + // Shift the result over 4 bits + if !first { + for _ in 0 .. 4 { + res = res.double(); + } + } + first = false; + + // Add the top-nibble from this byte into the result + res += arr[usize::from(byte >> 4)]; + // Shift the result over + for _ in 0 .. 4 { + res = res.double(); + } + // Add the bottom-nibble from this byte into the result + res += arr[usize::from(byte & 0b1111)]; } - acc + res } } @@ -581,32 +590,41 @@ macro_rules! new_curve_impl { type Output = $name; fn mul(self, other: &'b $scalar) -> Self::Output { - // TODO: make this faster - - let mut acc = $name::identity(); - - // This is a simple double-and-add implementation of point - // multiplication, moving from most significant to least - // significant bit of the scalar. - // - // We don't use `PrimeFieldBits::.to_le_bits` here, because that would - // force users of this crate to depend on `bitvec` where they otherwise - // might not need to. - // - // NOTE: We skip the leading bit because it's always unset (we are turning - // the 32-byte repr into 256 bits, and $scalar::NUM_BITS = 255). - for bit in other - .to_repr() - .iter() - .rev() - .flat_map(|byte| (0..8).rev().map(move |i| Choice::from((byte >> i) & 1u8))) - .skip(1) - { - acc = acc.double(); - acc = $name::conditional_select(&acc, &(acc + self), bit); + // Create a table out of the point + // TODO: This can be made more efficient with a 5-bit window + let mut arr = [$name::identity(); 16]; + arr[1] = (*self).into(); + for i in 2 .. 
16 { + arr[i] = if (i % 2) == 0 { + arr[i / 2].double() + } else { + arr[i - 1] + arr[1] + }; + } + + let mut res = $name::identity(); + let mut first = true; + // Iterate from most significant byte to least significant byte + for byte in other.to_repr().iter().rev() { + // Shift the result over 4 bits + if !first { + for _ in 0 .. 4 { + res = res.double(); + } + } + first = false; + + // Add the top-nibble from this byte into the result + res += arr[usize::from(byte >> 4)]; + // Shift the result over + for _ in 0 .. 4 { + res = res.double(); + } + // Add the bottom-nibble from this byte into the result + res += arr[usize::from(byte & 0b1111)]; } - acc + res } } From a46b5be95cacbff54d06aad8d3bbcba42e05d616 Mon Sep 17 00:00:00 2001 From: Luke Parker Date: Thu, 18 May 2023 18:26:06 -0400 Subject: [PATCH 2/2] Add zeroize --- Cargo.toml | 1 + src/curves.rs | 2 +- src/fields/fp.rs | 2 +- src/fields/fq.rs | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 93091a5..60771f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,7 @@ group = { version = "0.13", default-features = false } rand = { version = "0.8", default-features = false } static_assertions = "1.1.0" subtle = { version = "2.3", default-features = false } +zeroize = { version = "^1.5", default-features = false, features = ["derive"] } # alloc dependencies blake2b_simd = { version = "1", optional = true, default-features = false } diff --git a/src/curves.rs b/src/curves.rs index e15975b..3b03441 100644 --- a/src/curves.rs +++ b/src/curves.rs @@ -30,7 +30,7 @@ macro_rules! new_curve_impl { (($($privacy:tt)*), $name:ident, $name_affine:ident, $iso:ident, $base:ident, $scalar:ident, $curve_id:literal, $a_raw:expr, $b_raw:expr, $curve_type:ident) => { /// Represents a point in the projective coordinate space. 
- #[derive(Copy, Clone, Debug)] + #[derive(Copy, Clone, Debug, zeroize::Zeroize)] #[cfg_attr(feature = "repr-c", repr(C))] $($privacy)* struct $name { x: $base, diff --git a/src/fields/fp.rs b/src/fields/fp.rs index 3264cba..35ab1ff 100644 --- a/src/fields/fp.rs +++ b/src/fields/fp.rs @@ -24,7 +24,7 @@ use crate::arithmetic::SqrtTables; // The internal representation of this type is four 64-bit unsigned // integers in little-endian order. `Fp` values are always in // Montgomery form; i.e., Fp(a) = aR mod p, with R = 2^256. -#[derive(Clone, Copy, Eq)] +#[derive(Clone, Copy, Eq, zeroize::Zeroize)] #[repr(transparent)] pub struct Fp(pub(crate) [u64; 4]); diff --git a/src/fields/fq.rs b/src/fields/fq.rs index 8177fa4..2ccabe8 100644 --- a/src/fields/fq.rs +++ b/src/fields/fq.rs @@ -24,7 +24,7 @@ use crate::arithmetic::SqrtTables; // The internal representation of this type is four 64-bit unsigned // integers in little-endian order. `Fq` values are always in // Montgomery form; i.e., Fq(a) = aR mod q, with R = 2^256. -#[derive(Clone, Copy, Eq)] +#[derive(Clone, Copy, Eq, zeroize::Zeroize)] #[repr(transparent)] pub struct Fq(pub(crate) [u64; 4]);