From 277d4d1579f4a1ae861ab5725c5cf3262af1cb38 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 16:10:55 +0100
Subject: [PATCH 01/25] add booth encoding file

---
 cryptography/bls12_381/src/booth_encoding.rs | 90 ++++++++++++++++++++
 cryptography/bls12_381/src/lib.rs            |  1 +
 2 files changed, 91 insertions(+)
 create mode 100644 cryptography/bls12_381/src/booth_encoding.rs

diff --git a/cryptography/bls12_381/src/booth_encoding.rs b/cryptography/bls12_381/src/booth_encoding.rs
new file mode 100644
index 00000000..ef5df325
--- /dev/null
+++ b/cryptography/bls12_381/src/booth_encoding.rs
@@ -0,0 +1,90 @@
+use std::ops::Neg;
+
+use blstrs::{G1Projective, Scalar};
+use ff::PrimeField;
+
+use crate::G1Point;
+// TODO: Link to halo2 file + docs + comments
+pub fn get_booth_index(window_index: usize, window_size: usize, el: &[u8]) -> i32 {
+    // Booth encoding:
+    // * step by `window` size
+    // * slice by size of `window + 1`
+    // * each window overlap by 1 bit
+    // * append a zero bit to the least significant end
+    // Indexing rule for example window size 3 where we slice by 4 bits:
+    // `[0, +1, +1, +2, +2, +3, +3, +4, -4, -3, -3, -2, -2, -1, -1, 0]`
+    // So we can reduce the bucket size without preprocessing scalars
+    // and remembering them as in classic signed digit encoding
+
+    let skip_bits = (window_index * window_size).saturating_sub(1);
+    let skip_bytes = skip_bits / 8;
+
+    // fill into a u32
+    let mut v: [u8; 4] = [0; 4];
+    for (dst, src) in v.iter_mut().zip(el.iter().skip(skip_bytes)) {
+        *dst = *src
+    }
+    let mut tmp = u32::from_le_bytes(v);
+
+    // pad with one 0 if slicing the least significant window
+    if window_index == 0 {
+        tmp <<= 1;
+    }
+
+    // remove further bits
+    tmp >>= skip_bits - (skip_bytes * 8);
+    // apply the booth window
+    tmp &= (1 << (window_size + 1)) - 1;
+
+    let sign = tmp & (1 << window_size) == 0;
+
+    // div ceil by 2
+    tmp = (tmp + 1) >> 1;
+
+    // find the booth action index
+    if sign {
+        tmp as i32
+    } else {
+        ((!(tmp - 1) & ((1 << window_size) - 1)) as i32).neg()
+    }
+}
+
+#[test]
+fn smoke_scalar_mul() {
+    use group::prime::PrimeCurveAffine;
+    let gen = G1Point::generator();
+    let s = -Scalar::ONE;
+
+    let res = gen * s;
+
+    let got = mul(&s, &gen, 4);
+
+    assert_eq!(G1Point::from(res), got)
+}
+
+fn mul(scalar: &Scalar, point: &G1Point, window: usize) -> G1Point {
+    let u = scalar.to_bytes_le();
+    let n = Scalar::NUM_BITS as usize / window + 1;
+
+    let table = (0..=1 << (window - 1))
+        .map(|i| point * Scalar::from(i as u64))
+        .collect::<Vec<_>>();
+
+    let mut acc: G1Projective = G1Point::default().into();
+    for i in (0..n).rev() {
+        for _ in 0..window {
+            acc = acc + acc;
+        }
+
+        let idx = get_booth_index(i as usize, window, u.as_ref());
+
+        if idx.is_negative() {
+            acc += table[idx.unsigned_abs() as usize].neg();
+        }
+        if idx.is_positive() {
+            acc += table[idx.unsigned_abs() as usize];
+        }
+    }
+
+    acc.into()
+}
diff --git a/cryptography/bls12_381/src/lib.rs b/cryptography/bls12_381/src/lib.rs
index 2d9095ff..e21a5a26 100644
--- a/cryptography/bls12_381/src/lib.rs
+++ b/cryptography/bls12_381/src/lib.rs
@@ -1,4 +1,5 @@
 pub mod batch_inversion;
+mod booth_encoding;
 pub mod fixed_base_msm;
 pub mod lincomb;
 

From 5806a861cd62fd46ec731637852d080a2a3e39b8 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 16:12:18 +0100
Subject: [PATCH 02/25] temp: fix

---
 cryptography/bls12_381/src/booth_encoding.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cryptography/bls12_381/src/booth_encoding.rs b/cryptography/bls12_381/src/booth_encoding.rs
index ef5df325..51a90f70 100644
--- a/cryptography/bls12_381/src/booth_encoding.rs
+++ b/cryptography/bls12_381/src/booth_encoding.rs
@@ -1,7 +1,7 @@
 use std::ops::Neg;
 
 use blstrs::{G1Projective, Scalar};
-use ff::PrimeField;
+use ff::{Field, PrimeField};
 
 use crate::G1Point;
 // TODO: Link to halo2 file + docs + comments

From 231b79d30d8a42c3a17a02afabfbffd04f28c003 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 16:14:47 +0100
Subject: [PATCH 03/25] add batch_add module

---
 cryptography/bls12_381/src/batch_add.rs | 243 ++++++++++++++++++++++++
 cryptography/bls12_381/src/lib.rs       |   1 +
 2 files changed, 244 insertions(+)
 create mode 100644 cryptography/bls12_381/src/batch_add.rs

diff --git a/cryptography/bls12_381/src/batch_add.rs b/cryptography/bls12_381/src/batch_add.rs
new file mode 100644
index 00000000..f79be827
--- /dev/null
+++ b/cryptography/bls12_381/src/batch_add.rs
@@ -0,0 +1,243 @@
+use crate::batch_inversion::batch_inverse;
+use blstrs::{Fp, G1Affine};
+
+/// Adds multiple points together in affine representation, batching the inversions
+pub fn batch_addition(mut points: Vec<G1Affine>) -> G1Affine {
+    #[inline(always)]
+    fn point_add(p1: G1Affine, p2: G1Affine, inv: &blstrs::Fp) -> G1Affine {
+        use ff::Field;
+
+        let lambda = (p2.y() - p1.y()) * inv;
+        let x = lambda.square() - p1.x() - p2.x();
+        let y = lambda * (p1.x() - x) - p1.y();
+        G1Affine::from_raw_unchecked(x, y, false)
+    }
+
+    if points.is_empty() {
+        use group::prime::PrimeCurveAffine;
+        return G1Affine::identity();
+    }
+
+    let mut stride = 1;
+
+    let mut new_differences = Vec::with_capacity(points.len());
+
+    while stride < points.len() {
+        new_differences.clear();
+
+        for i in (0..points.len()).step_by(stride * 2) {
+            if i + stride < points.len() {
+                new_differences.push(points[i + stride].x() - points[i].x());
+            }
+        }
+        batch_inverse(&mut new_differences);
+        for (i, inv) in new_differences.iter().enumerate() {
+            let p1 = points[i * stride * 2];
+            let p2 = points[i * stride * 2 + stride];
+            points[i * stride * 2] = point_add(p1, p2, inv);
+        }
+
+        stride *= 2;
+    }
+
+    points[0]
+}
+// This method assumes that two points being added never share an x-coordinate
+// (e.g. P + P or P + -P); otherwise the x-difference is zero and we would invert zero
+pub fn batch_addition_mut(points: &mut [G1Affine]) -> G1Affine {
+    fn point_add(p1: G1Affine, p2: G1Affine, inv: &blstrs::Fp) -> G1Affine {
+        use ff::Field;
+
+        let lambda = (p2.y() - p1.y()) * inv;
+        let x = lambda.square() - p1.x() - p2.x();
+        let y = lambda * (p1.x() - x) - p1.y();
+        G1Affine::from_raw_unchecked(x, y, false)
+    }
+
+    if points.is_empty() {
+        use group::prime::PrimeCurveAffine;
+        return G1Affine::identity();
+    }
+
+    let mut stride = 1;
+
+    let mut new_differences = Vec::with_capacity(points.len());
+    while stride < points.len() {
+        new_differences.clear();
+
+        for i in (0..points.len()).step_by(stride * 2) {
+            if i + stride < points.len() {
+                new_differences.push(points[i + stride].x() - points[i].x());
+            }
+        }
+        batch_inverse(&mut new_differences);
+        for (i, inv) in new_differences.iter().enumerate() {
+            let p1 = points[i * stride * 2];
+            let p2 = points[i * stride * 2 + stride];
+            points[i * stride * 2] = point_add(p1, p2, inv);
+        }
+
+        stride *= 2;
+    }
+
+    points[0]
+}
+
+/// Sums every inner vector on its own, returning one point per input vector.
+/// Each inner vector goes through a separate `batch_addition` call, so no
+/// inversion work is shared between vectors.
+pub fn naive_multi_batch_addition(multi_points: Vec<Vec<G1Affine>>) -> Vec<G1Affine> {
+    // TODO: Try using a scratch pad here
+    multi_points.into_iter().map(batch_addition).collect()
+}
+
+// Similar to batch addition, however we amortize across different batches
+// TODO: Clean up -- This has a greater complexity than the regular algorithm
+// TODO so we want to check if it makes a difference in our usecase.
+pub fn multi_batch_addition(mut multi_points: Vec<Vec<G1Affine>>) -> Vec<G1Affine> {
+    #[inline(always)]
+    fn point_add_double(p1: G1Affine, p2: G1Affine, inv: &blstrs::Fp) -> G1Affine {
+        use ff::Field;
+
+        let lambda = if p1 == p2 {
+            p1.x().square().mul3() * inv
+        } else {
+            (p2.y() - p1.y()) * inv
+        };
+
+        let x = lambda.square() - p1.x() - p2.x();
+        let y = lambda * (p1.x() - x) - p1.y();
+        G1Affine::from_raw_unchecked(x, y, false)
+    }
+    #[inline(always)]
+    // Note: We do not handle the case where p1 == -p2
+    fn choose_add_or_double(p1: G1Affine, p2: G1Affine) -> Fp {
+        use ff::Field;
+
+        if p1 == p2 {
+            p2.y().double()
+        } else {
+            p1.x() - p2.x()
+        }
+    }
+    let total_num_points: usize = multi_points.iter().map(|p| p.len()).sum();
+    // let mut scratchpad = Vec::with_capacity(total_num_points);
+
+    // Find the largest buckets, this will be the bottleneck for the number of iterations
+    let mut max_bucket_length = 0;
+    for i in 0..multi_points.len() {
+        max_bucket_length = std::cmp::max(max_bucket_length, multi_points[i].len());
+    }
+
+    let mut new_differences = Vec::with_capacity(max_bucket_length);
+    // (a, b): a is the length of `new_differences` before this bucket pushed its differences and b is the length after, i.e. the range a..b
+    let mut collected_points = vec![(0, 0); multi_points.len()]; // We want to know how many points each bucket has accumulated
+    let mut multi_strides = vec![1; multi_points.len()]; // We want to know the stride for each bucket
+    let mut bucket_complete = vec![false; multi_points.len()]; // We want to know if a bucket is complete
+                                                               // Iterate over each bucket
+    let max_num_iterations = max_bucket_length.next_power_of_two().ilog2();
+    for _ in 0..max_num_iterations {
+        new_differences.clear();
+        // Iterate over each bucket
+        for i in 0..multi_points.len() {
+            if bucket_complete[i] {
+                continue;
+            }
+            let points = &multi_points[i];
+            let stride = multi_strides[i];
+            let old_diff_len = new_differences.len();
+
+            // Skip the bucket if the stride is too long,
+            // This happens if the buckets are not evenly distributed
+            // in terms of points.
+            if stride < points.len() {
+                for k in (0..points.len()).step_by(stride * 2) {
+                    if k + stride < points.len() {
+                        new_differences.push(choose_add_or_double(points[k + stride], points[k]));
+                        // new_differences.push(points[k + stride].x() - points[k].x());
+                    }
+                }
+
+                // Save the number of points going into this bucket for the batch inversion
+                collected_points[i] = (old_diff_len, new_differences.len());
+            } else {
+                collected_points[i] = (old_diff_len, new_differences.len());
+                bucket_complete[i] = true;
+            }
+        }
+
+        // We have iterated over each bucket, so now we need to do a batch inversion
+        batch_inverse(&mut new_differences);
+        // Now we update each bucket using the batch inversion we have computed and the collected points
+        for i in 0..multi_points.len() {
+            if bucket_complete[i] {
+                continue;
+            }
+            let points = &mut multi_points[i];
+            let stride = multi_strides[i];
+            let (start, end) = collected_points[i];
+            for (k, new_difference_offset) in (start..end).enumerate() {
+                let inv = &new_differences[new_difference_offset];
+                let p1 = points[k * stride * 2];
+                let p2 = points[k * stride * 2 + stride];
+                points[k * stride * 2] = point_add_double(p1, p2, inv);
+            }
+
+            // Update the stride for this bucket
+            multi_strides[i] *= 2;
+        }
+    }
+
+    // multi_points.into_iter().map(|points| points[0]).collect()
+    // TODO: using this in case we have empty vectors passed in
+    multi_points
+        .into_iter()
+        .map(|points| points.get(0).copied().unwrap_or(G1Affine::default()))
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::{batch_addition, multi_batch_addition};
+    use blstrs::{G1Affine, G1Projective};
+    use group::Group;
+
+    #[test]
+    fn test_batch_addition() {
+        let num_points = 100;
+        let points: Vec<G1Affine> = (0..num_points)
+            .map(|_| G1Projective::random(&mut rand::thread_rng()).into())
+            .collect();
+
+        let expected_result: G1Affine = points
+            .iter()
+            .fold(G1Projective::identity(), |acc, p| acc + p)
+            .into();
+
+        let got_result = batch_addition(points.clone());
+        assert_eq!(expected_result, got_result);
+    }
+
+    #[test]
+    fn test_multi_batch_addition() {
+        let num_points = 100;
+        let num_sets = 5;
+        let random_sets_of_points: Vec<Vec<G1Affine>> = (0..num_sets)
+            .map(|_| {
+                (0..num_points)
+                    .map(|_| G1Projective::random(&mut rand::thread_rng()).into())
+                    .collect()
+            })
+            .collect();
+        let random_sets_of_points_clone = random_sets_of_points.clone();
+
+        let expected_results: Vec<_> = random_sets_of_points
+            .into_iter()
+            .map(|points| batch_addition(points))
+            .collect();
+
+        let got_results = multi_batch_addition(random_sets_of_points_clone);
+        assert_eq!(got_results, expected_results);
+    }
+}
diff --git a/cryptography/bls12_381/src/lib.rs b/cryptography/bls12_381/src/lib.rs
index e21a5a26..0214b9b6 100644
--- a/cryptography/bls12_381/src/lib.rs
+++ b/cryptography/bls12_381/src/lib.rs
@@ -1,3 +1,4 @@
+mod batch_add;
 pub mod batch_inversion;
 mod booth_encoding;
 pub mod fixed_base_msm;

From 296cf21e1ea059c57ab848d57ca52f6d8be1e2c3 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 16:32:21 +0100
Subject: [PATCH 04/25] put booth testing code under cfg

---
 cryptography/bls12_381/src/booth_encoding.rs | 70 +++++++++++---------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/cryptography/bls12_381/src/booth_encoding.rs b/cryptography/bls12_381/src/booth_encoding.rs
index 51a90f70..67f4e12c 100644
--- a/cryptography/bls12_381/src/booth_encoding.rs
+++ b/cryptography/bls12_381/src/booth_encoding.rs
@@ -1,9 +1,5 @@
 use std::ops::Neg;
 
-use blstrs::{G1Projective, Scalar};
-use ff::{Field, PrimeField};
-
-use crate::G1Point;
 // TODO: Link to halo2 file + docs + comments
 pub fn get_booth_index(window_index: usize, window_size: usize, el: &[u8]) -> i32 {
     // Booth encoding:
@@ -49,42 +45,52 @@ pub fn get_booth_index(window_index: usize, window_size: usize, el: &[u8]) -> i3
     }
 }
 
-#[test]
-fn smoke_scalar_mul() {
-    use group::prime::PrimeCurveAffine;
-    let gen = G1Point::generator();
-    let s = -Scalar::ONE;
+#[cfg(test)]
+mod tests {
+    use std::ops::Neg;
 
-    let res = gen * s;
+    use super::get_booth_index;
+    use crate::G1Point;
+    use blstrs::{G1Projective, Scalar};
+    use ff::{Field, PrimeField};
 
-    let got = mul(&s, &gen, 4);
+    #[test]
+    fn smoke_scalar_mul() {
+        use group::prime::PrimeCurveAffine;
+        let gen = G1Point::generator();
+        let s = -Scalar::ONE;
 
-    assert_eq!(G1Point::from(res), got)
-}
+        let res = gen * s;
 
-fn mul(scalar: &Scalar, point: &G1Point, window: usize) -> G1Point {
-    let u = scalar.to_bytes_le();
-    let n = Scalar::NUM_BITS as usize / window + 1;
+        let got = mul(&s, &gen, 4);
 
-    let table = (0..=1 << (window - 1))
-        .map(|i| point * Scalar::from(i as u64))
-        .collect::<Vec<_>>();
+        assert_eq!(G1Point::from(res), got)
+    }
 
-    let mut acc: G1Projective = G1Point::default().into();
-    for i in (0..n).rev() {
-        for _ in 0..window {
-            acc = acc + acc;
-        }
+    fn mul(scalar: &Scalar, point: &G1Point, window: usize) -> G1Point {
+        let u = scalar.to_bytes_le();
+        let n = Scalar::NUM_BITS as usize / window + 1;
 
-        let idx = get_booth_index(i as usize, window, u.as_ref());
+        let table = (0..=1 << (window - 1))
+            .map(|i| point * Scalar::from(i as u64))
+            .collect::<Vec<_>>();
 
-        if idx.is_negative() {
-            acc += table[idx.unsigned_abs() as usize].neg();
-        }
-        if idx.is_positive() {
-            acc += table[idx.unsigned_abs() as usize];
+        let mut acc: G1Projective = G1Point::default().into();
+        for i in (0..n).rev() {
+            for _ in 0..window {
+                acc = acc + acc;
+            }
+
+            let idx = get_booth_index(i as usize, window, u.as_ref());
+
+            if idx.is_negative() {
+                acc += table[idx.unsigned_abs() as usize].neg();
+            }
+            if idx.is_positive() {
+                acc += table[idx.unsigned_abs() as usize];
+            }
         }
-    }
 
-    acc.into()
+        acc.into()
+    }
 }

From d72fd3e5a5f2b08ba18a315470bded5568ccd809 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 16:32:38 +0100
Subject: [PATCH 05/25] initial msm code

---
 cryptography/bls12_381/src/lib.rs |   2 +-
 cryptography/bls12_381/src/msm.rs | 445 ++++++++++++++++++++++++++++++
 2 files changed, 446 insertions(+), 1 deletion(-)
 create mode 100644 cryptography/bls12_381/src/msm.rs

diff --git a/cryptography/bls12_381/src/lib.rs b/cryptography/bls12_381/src/lib.rs
index 0214b9b6..aa33901b 100644
--- a/cryptography/bls12_381/src/lib.rs
+++ b/cryptography/bls12_381/src/lib.rs
@@ -3,7 +3,7 @@ pub mod batch_inversion;
 mod booth_encoding;
 pub mod fixed_base_msm;
 pub mod lincomb;
-
+mod msm;
 // Re-exporting the blstrs crate
 
 // Re-export ff and group, so other crates do not need to directly import(and independently version) them
diff --git a/cryptography/bls12_381/src/msm.rs b/cryptography/bls12_381/src/msm.rs
new file mode 100644
index 00000000..7f30422e
--- /dev/null
+++ b/cryptography/bls12_381/src/msm.rs
@@ -0,0 +1,445 @@
+use blstrs::G1Affine;
+use blstrs::G1Projective;
+use blstrs::Scalar;
+use ff::PrimeField;
+use group::Group;
+
+use crate::booth_encoding::get_booth_index;
+use crate::g1_batch_normalize;
+use crate::G1Point;
+
+#[derive(Debug, Clone, Copy)]
+pub struct Info {
+    base_idx: u64,
+    bucket_idx: u64,
+    // We use precomputations which removes the window_idx
+    // window_idx: u64,
+    sign: bool,
+}
+
+pub fn precompute(
+    window_size: usize,
+    number_of_windows: usize,
+    points: &[G1Point],
+) -> Vec<G1Point> {
+    // For each point, we compute number_of_windows-1 points
+    let mut results = Vec::new();
+    for point in points {
+        // First add the original point
+        results.push(point.into());
+
+        // Then scale each successive point by 2^window_size
+        for _ in 0..number_of_windows - 1 {
+            let mut last_point_scaled_window_size: G1Projective = *results.last().unwrap();
+            for _ in 0..window_size {
+                last_point_scaled_window_size = last_point_scaled_window_size.double()
+            }
+            results.push(last_point_scaled_window_size)
+        }
+    }
+    g1_batch_normalize(&results)
+}
+
+// Note: This does not work if the input points are [P, -P], for example.
+// We could iterate for that case, but it's unlikely given the points are random
+pub fn msm_best2(
+    coeffs: &[Scalar],
+    bases_precomputed: &[G1Point],
+    window_size: usize,
+) -> G1Projective {
+    // assert_eq!(coeffs.len(), bases.len());
+
+    let c = window_size;
+
+    // coeffs to byte representation
+    let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_bytes_le()).collect();
+
+    // Information on the points we want to add
+    let mut all_information = vec![vec![]; 1 << (c - 1)];
+
+    // number of windows
+    let number_of_windows = Scalar::NUM_BITS as usize / c + 1;
+
+    for window_idx in 0..number_of_windows {
+        for (base_idx, coeff) in coeffs.iter().enumerate() {
+            let buck_idx = get_booth_index(window_idx, c, coeff.as_ref());
+
+            if buck_idx != 0 {
+                // parse bucket index
+                let sign = buck_idx.is_positive();
+                let buck_idx = buck_idx.unsigned_abs() as usize - 1;
+                //
+                // Since we are using precomputed points, the base_idx is augmented
+                //
+                // We need to modify the base index to take into account:
+                // - The window, so we fetch the precomputed base for that window
+                // - The position of the point in the precomputed bases,
+                // relative to the original bases vector
+                //
+                // If you imagine we had:
+                // [P1, P2, P3]
+                // precomp = [P1, 2^c*P1,..., 2^((num_windows-1)*c)*P1, P2,...]
+                //
+                // The index of P1, P2, etc can be computed by:
+                // augmented_base_idx = base_idx * num_windows
+                // Then in order to get the correct point, we do:
+                // augmented_base_idx += window_idx
+                let base_idx = (base_idx * number_of_windows) + window_idx;
+
+                let info = Info {
+                    bucket_idx: buck_idx as u64,
+                    sign,
+                    base_idx: base_idx as u64,
+                };
+
+                all_information[buck_idx].push(info);
+            }
+        }
+    }
+
+    // All of the above costs about 200 microseconds on 64 points.
+    // Using a vector is about 3 times faster, but the points are not ordered by bucket index
+    // so we could try and do a second pass on the vector to see if thats quicker for small numPoints
+    //
+    // Note: for duplicate points, we could either put them in the running sum
+    // or use the optimized formulas
+    let mut all_points = Vec::new();
+    let mut bucket_indices = Vec::new();
+    for (bucket_idx, points) in all_information.into_iter().enumerate() {
+        if points.is_empty() {
+            continue;
+        }
+
+        // batch add each bucket
+        let res: Vec<_> = points
+            .into_iter()
+            .map(|point_info| {
+                let mut p = bases_precomputed[point_info.base_idx as usize];
+                if !point_info.sign {
+                    p = -p;
+                }
+                p
+            })
+            .collect();
+        // TODO: We should make sure that we cannot get two points being added together or
+        // TODO: have the formula deal with it
+        all_points.push(res);
+        bucket_indices.push((bucket_idx + 1) as u64); // Add one here since the zeroth bucket will bucket_1, bucket_K eventually translates to K * sum_of_bucket
+    }
+
+    let buckets_added = crate::batch_add::multi_batch_addition(all_points);
+
+    subsum_accumulation(&bucket_indices, &buckets_added)
+    // Now we have all of the information needed
+    // The precomputations that we did, effectively allowed us
+    // to remove the notion of a "window" -- there is only
+    // one window, effectively.
+    //
+    // Note: For 64 points, this is about 3200 elements.
+    //
+    // Do some more preprocessing to reduce the work needed
+}
+
+pub fn msm_best2_noinfo(
+    coeffs: &[Scalar],
+    bases_precomputed: &[G1Point],
+    window_size: usize,
+) -> G1Projective {
+    // assert_eq!(coeffs.len(), bases.len());
+
+    let c = window_size;
+
+    // coeffs to byte representation
+    let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_bytes_le()).collect();
+
+    // Information on the points we want to add
+    let mut all_information = vec![vec![]; 1 << (c - 1)];
+
+    // number of windows
+    let number_of_windows = Scalar::NUM_BITS as usize / c + 1;
+
+    for window_idx in 0..number_of_windows {
+        for (base_idx, coeff) in coeffs.iter().enumerate() {
+            let buck_idx = get_booth_index(window_idx, c, coeff.as_ref());
+
+            if buck_idx != 0 {
+                // parse bucket index
+                let sign = buck_idx.is_positive();
+                let buck_idx = buck_idx.unsigned_abs() as usize - 1;
+                //
+                // Since we are using precomputed points, the base_idx is augmented
+                //
+                // We need to modify the base index to take into account:
+                // - The window, so we fetch the precomputed base for that window
+                // - The position of the point in the precomputed bases,
+                // relative to the original bases vector
+                //
+                // If you imagine we had:
+                // [P1, P2, P3]
+                // precomp = [P1, 2^c*P1,..., 2^((num_windows-1)*c)*P1, P2,...]
+                //
+                // The index of P1, P2, etc can be computed by:
+                // augmented_base_idx = base_idx * num_windows
+                // Then in order to get the correct point, we do:
+                // augmented_base_idx += window_idx
+                let base_idx = (base_idx * number_of_windows) + window_idx;
+
+                let point = if sign {
+                    bases_precomputed[base_idx as usize]
+                } else {
+                    -bases_precomputed[base_idx as usize]
+                };
+
+                all_information[buck_idx].push(point);
+            }
+        }
+    }
+
+    // All of the above costs about 200 microseconds on 64 points.
+    // Using a vector is about 3 times faster, but the points are not ordered by bucket index
+    // so we could try and do a second pass on the vector to see if thats quicker for small numPoints
+    //
+    // Note: for duplicate points, we could either put them in the running sum
+    // or use the optimized formulas
+    // let mut all_points = Vec::new();
+    // let mut bucket_indices = Vec::new();
+    // TODO: This should return the points too, ie skip the empty buckets
+    let bucket_indices: Vec<_> = all_information
+        .iter()
+        .enumerate()
+        .filter(|(_, points)| !points.is_empty())
+        .map(|(index, _)| (index + 1) as u64)
+        .collect();
+
+    let buckets_added = crate::batch_add::multi_batch_addition(all_information);
+
+    subsum_accumulation(&bucket_indices, &buckets_added)
+}
+
+// Algorithm1 from the LFG paper
+// TODO: Fix later, this algorithm is broken in the POC and the paper
+// fn subsum_accumulation(b: &[u64], s: &[G1Affine]) -> G1Projective {
+//     assert_eq!(b.len(), s.len(), "Input arrays must have the same length");
+//     let d = *b.iter().max().unwrap() as usize;
+
+//     // Define a length-(d + 1) array tmp = [0] × (d + 1)
+//     let mut tmp_d = vec![G1Projective::identity(); d + 1];
+//     let mut tmp = G1Projective::identity();
+
+//     // Iterate from |B| to 1 by -1
+//     for i in (1..b.len()).rev() {
+//         // tmp[0] = tmp[0] + S_i
+//         tmp += s[i];
+
+//         // k = b_i - b_{i-1}
+//         let k = (b[i] - b[i - 1]) as usize;
+
+//         // if k >= 1 then tmp[k] = tmp[k] + tmp[0]
+//         // if k >= 1 {
+//         //     let t0 = tmp_d[0];
+//         //     tmp_d[k] += t0;
+//         // }
+//         tmp_d[k] += tmp;
+//     }
+
+//     // The original paper has a bug and does not deal with the case
+//     // when there is only 1 point
+//     if b.len() == 1 {
+//         tmp_d[(b[0] - 1) as usize] = s[0].into()
+//     }
+
+//     // Now do running sum stuff
+//     // summation by parts
+//     // e.g. 3a + 2b + 1c = a +
+//     //                    (a) + b +
+//     //                    ((a) + b) + c
+//     let mut running_sum = G1Projective::identity();
+//     let mut res = G1Projective::identity();
+//     // for i in (0..d).rev() {
+//     //     running_sum += &tmp_d[i];
+//     //     res += &running_sum;
+//     // }
+//     // We can use d to skip top buckets that are empty (done above)
+//     tmp_d.into_iter().rev().for_each(|b| {
+//         running_sum += &b;
+//         res += &running_sum;
+//     });
+//     res
+// }
+
+// Poor man's version of Algorithm 1 from the LFG paper: returns sum_i b[i] * s[i].
+//
+// `b` holds the bucket multipliers and must be sorted in ascending order;
+// `s[i]` is the point paired with `b[i]`, so both slices must have the same length.
+fn subsum_accumulation(b: &[u64], s: &[G1Affine]) -> G1Projective {
+    // Nothing to accumulate.
+    if b.is_empty() {
+        return G1Projective::identity();
+    }
+    // A single bucket is just a scalar multiplication (LFG assumes this too).
+    if b.len() == 1 {
+        return s[0] * Scalar::from(b[0]);
+    }
+
+    // Summation by parts; the multipliers do not need to be contiguous.
+    // Walking from the last bucket to the first, `running_sum` holds
+    // s[index] + ... + s[last]; it is added to `res` once per unit of
+    // difference between b[index] and the next lower multiplier.
+    let mut running_sum = G1Projective::identity();
+    let mut res = G1Projective::identity();
+
+    for (index, point) in s.iter().enumerate().rev() {
+        running_sum += point;
+
+        // Multiplier of the next lower bucket; below the first bucket it is 0,
+        // so b[0] is honoured in full (b = [2, 3] with s = [P, P] must give 5 * P,
+        // not 3 * P as it would if b[0] were assumed to be 1).
+        let gap_floor = if index > 0 { b[index - 1] } else { 0 };
+
+        // e.g. for 3 * a + 1 * b the running sum `a` is added twice here
+        // before `b` joins it at the next index.
+        for _ in gap_floor..b[index] {
+            res += running_sum;
+        }
+    }
+    res
+}
+
+// summation by parts
+// e.g. 3a + 2b + 1c = a +
+//                    (a) + b +
+//                    ((a) + b) + c
+//
+// Note: This assumes the points are in ascending order.
+// ie 1 * points[0] + 2 * points[1] + ... + n * points[n-1]
+#[inline(always)]
+fn horners_rule_sum(points: &[G1Point]) -> G1Projective {
+    let mut running_sum = G1Projective::identity();
+    let mut res = G1Projective::identity();
+    points.into_iter().rev().for_each(|b| {
+        running_sum += b;
+        res += &running_sum;
+    });
+    res
+}
+
+#[cfg(test)]
+mod test {
+
+    use crate::{
+        msm::{horners_rule_sum, msm_best2, precompute},
+        G1Point, G1Projective, Scalar,
+    };
+
+    use blstrs::G1Affine;
+    use group::{prime::PrimeCurveAffine, Group};
+
+    use super::subsum_accumulation;
+
+    #[test]
+    fn subsum_smoke_test() {
+        let result = subsum_accumulation(&[1], &[G1Affine::generator()]);
+        assert_eq!(G1Projective::generator(), result);
+
+        let result = subsum_accumulation(&[2], &[G1Affine::generator()]);
+        assert_eq!(G1Projective::generator() * Scalar::from(2u64), result);
+
+        let result = subsum_accumulation(&[1, 2], &[G1Affine::generator(), G1Affine::generator()]);
+        assert_eq!(G1Projective::generator() * Scalar::from(3u64), result);
+
+        let result = subsum_accumulation(&[1, 3], &[G1Affine::generator(), G1Affine::generator()]);
+        assert_eq!(G1Projective::generator() * Scalar::from(4u64), result);
+
+        let result =
+            subsum_accumulation(&[1, 300], &[-G1Affine::generator(), G1Affine::generator()]);
+        assert_eq!(G1Projective::generator() * Scalar::from(299u64), result);
+
+        let result = subsum_accumulation(
+            &[1, 2, 3, 4, 10, 22, 100],
+            &[
+                G1Affine::generator(),
+                G1Affine::generator(),
+                G1Affine::generator(),
+                G1Affine::generator(),
+                G1Affine::generator(),
+                G1Affine::generator(),
+                G1Affine::generator(),
+            ],
+        );
+        assert_eq!(
+            G1Projective::generator() * Scalar::from(1 + 2 + 3 + 4 + 10 + 22 + 100),
+            result
+        );
+    }
+
+    #[test]
+    fn horners_sum_smoke_test() {
+        let result = horners_rule_sum(&[G1Affine::generator()]);
+        assert_eq!(G1Projective::generator(), result);
+
+        let result = horners_rule_sum(&[
+            -G1Affine::generator(),
+            G1Affine::generator(),
+            G1Affine::generator(),
+        ]);
+        assert_eq!(
+            G1Projective::generator() * Scalar::from(3u64)
+                + G1Projective::generator() * Scalar::from(2u64)
+                + -G1Projective::generator(),
+            result
+        );
+    }
+
+    #[test]
+    fn smoke_test_msm_best2() {
+        use crate::ff::PrimeField;
+        let window_size = 7;
+        let number_of_windows = Scalar::NUM_BITS as usize / window_size + 1;
+
+        let precomp_bases = precompute(window_size, number_of_windows, &[G1Point::generator()]);
+        let scalar = -Scalar::from(2);
+
+        let res = msm_best2(&[scalar], &precomp_bases, window_size);
+        assert_eq!(res, G1Projective::generator() * scalar);
+    }
+
+    #[test]
+    fn smoke_test_msm_best2_neg() {
+        use crate::ff::PrimeField;
+        let window_size = 7;
+        let number_of_windows = Scalar::NUM_BITS as usize / window_size + 1;
+
+        let input_points = vec![G1Point::generator(), G1Point::generator()];
+        let input_scalars = vec![-Scalar::from(1), -Scalar::from(2)];
+        let precomp_bases = precompute(window_size, number_of_windows, &input_points);
+
+        let res = msm_best2(&input_scalars, &precomp_bases, window_size);
+        assert_eq!(res, naive_msm(&input_points, &input_scalars));
+    }
+
+    #[test]
+    fn smoke_test_msm_best2_double_scalar() {
+        use crate::ff::PrimeField;
+        let window_size = 7;
+        let number_of_windows = Scalar::NUM_BITS as usize / window_size + 1;
+
+        let point_b: G1Affine = (G1Projective::generator() + G1Projective::generator()).into();
+        let point_c: G1Affine =
+            (G1Projective::generator().double() + G1Projective::generator().double()).into();
+        let input_points = vec![G1Point::generator(), point_b, point_c];
+        let input_scalars = vec![Scalar::from(1), Scalar::from(2), Scalar::from(3u64)];
+        let precomp_bases = precompute(window_size, number_of_windows, &input_points);
+
+        let res = msm_best2(&input_scalars, &precomp_bases, window_size);
+        assert_eq!(res, naive_msm(&input_points, &input_scalars));
+    }
+
+    fn naive_msm(points: &[G1Point], scalars: &[Scalar]) -> G1Projective {
+        assert!(points.len() == scalars.len());
+        let mut result = G1Projective::identity();
+        for (scalar, point) in scalars.into_iter().zip(points) {
+            result += point * scalar
+        }
+        result
+    }
+}

From 3eca854200419c4ef789b4f4a5170633cc32b015 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 16:51:24 +0100
Subject: [PATCH 06/25] add benchmarks

---
 cryptography/bls12_381/benches/benchmark.rs | 16 +++++++++++++++-
 cryptography/bls12_381/src/lib.rs           |  4 ++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/cryptography/bls12_381/benches/benchmark.rs b/cryptography/bls12_381/benches/benchmark.rs
index 7db62c3b..4ab34db5 100644
--- a/cryptography/bls12_381/benches/benchmark.rs
+++ b/cryptography/bls12_381/benches/benchmark.rs
@@ -5,6 +5,7 @@ use crate_crypto_internal_eth_kzg_bls12_381::{
     g1_batch_normalize, g2_batch_normalize,
     group::Group,
     lincomb::{g1_lincomb, g1_lincomb_unsafe, g2_lincomb, g2_lincomb_unsafe},
+    msm::{msm_best2, msm_best2_noinfo, precompute},
     G1Projective, G2Projective, Scalar,
 };
 use criterion::{criterion_group, criterion_main, Criterion};
@@ -28,12 +29,25 @@ pub fn fixed_base_msm(c: &mut Criterion) {
         .into_iter()
         .map(|p| p.into())
         .collect();
-    let fbm = FixedBaseMSM::new(generators, UsePrecomp::Yes { width: 8 });
+    let fbm = FixedBaseMSM::new(generators.clone(), UsePrecomp::Yes { width: 8 });
     let scalars: Vec<_> = random_scalars(length);
 
     c.bench_function("bls12_381 fixed_base_msm length=64 width=8", |b| {
         b.iter(|| fbm.msm(scalars.clone()))
     });
+
+    use crate_crypto_internal_eth_kzg_bls12_381::ff::PrimeField;
+    let window_size = (f64::from(length as u32)).ln().ceil() as usize + 2;
+    let number_of_windows = Scalar::NUM_BITS as usize / window_size + 1;
+
+    let precomp_bases = precompute(window_size, number_of_windows, &generators);
+
+    c.bench_function("bls12_381 fixed_base_msm best2 no info", |b| {
+        b.iter(|| msm_best2_noinfo(&scalars, &precomp_bases, window_size))
+    });
+    c.bench_function("bls12_381 fixed_base_msm using pippenger bes32", |b| {
+        b.iter(|| msm_best2(&scalars, &precomp_bases, window_size))
+    });
 }
 
 pub fn bench_msm(c: &mut Criterion) {
diff --git a/cryptography/bls12_381/src/lib.rs b/cryptography/bls12_381/src/lib.rs
index aa33901b..f0527015 100644
--- a/cryptography/bls12_381/src/lib.rs
+++ b/cryptography/bls12_381/src/lib.rs
@@ -3,8 +3,8 @@ pub mod batch_inversion;
 mod booth_encoding;
 pub mod fixed_base_msm;
 pub mod lincomb;
-mod msm;
-// Re-exporting the blstrs crate
+pub mod msm; // TODO: rename since msm is ambiguous
+             // Re-exporting the blstrs crate
 
 // Re-export ff and group, so other crates do not need to directly import(and independently version) them
 pub use ff;

From d1265879f06c8abfc89ce56988a2940142375acc Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 17:07:53 +0100
Subject: [PATCH 07/25] add scratchpad

---
 cryptography/bls12_381/src/batch_add.rs       |  6 ++--
 cryptography/bls12_381/src/batch_inversion.rs | 32 +++++++++++++------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/cryptography/bls12_381/src/batch_add.rs b/cryptography/bls12_381/src/batch_add.rs
index f79be827..1009b7d4 100644
--- a/cryptography/bls12_381/src/batch_add.rs
+++ b/cryptography/bls12_381/src/batch_add.rs
@@ -1,4 +1,4 @@
-use crate::batch_inversion::batch_inverse;
+use crate::batch_inversion::{batch_inverse, batch_inverse_scratch_pad};
 use blstrs::{Fp, G1Affine};
 
 /// Adds multiple points together in affine representation, batching the inversions
@@ -121,7 +121,7 @@ pub fn multi_batch_addition(mut multi_points: Vec<Vec<G1Affine>>) -> Vec<G1Affin
         }
     }
     let total_num_points: usize = multi_points.iter().map(|p| p.len()).sum();
-    // let mut scratchpad = Vec::with_capacity(total_num_points);
+    let mut scratchpad = Vec::with_capacity(total_num_points);
 
     // Find the largest buckets, this will be the bottleneck for the number of iterations
     let mut max_bucket_length = 0;
@@ -167,7 +167,7 @@ pub fn multi_batch_addition(mut multi_points: Vec<Vec<G1Affine>>) -> Vec<G1Affin
         }
 
         // We have iterated over each bucket, so now we need to do a batch inversion
-        batch_inverse(&mut new_differences);
+        batch_inverse_scratch_pad(&mut new_differences, &mut scratchpad);
         // Now we update each bucket using the batch inversion we have computed and the collected points
         for i in 0..multi_points.len() {
             if bucket_complete[i] {
diff --git a/cryptography/bls12_381/src/batch_inversion.rs b/cryptography/bls12_381/src/batch_inversion.rs
index 3ba32321..8d67de98 100644
--- a/cryptography/bls12_381/src/batch_inversion.rs
+++ b/cryptography/bls12_381/src/batch_inversion.rs
@@ -1,20 +1,34 @@
 /// Given a vector of field elements {v_i}, compute the vector {v_i^(-1)}
+///
+/// Panics if any of the elements are zero
 pub fn batch_inverse<F: ff::Field>(v: &mut [F]) {
-    // Montgomery’s Trick and Fast Implementation of Masked AES
+    let mut scratch_pad = Vec::with_capacity(v.len());
+    batch_inverse_scratch_pad(v, &mut scratch_pad);
+}
+
+/// Given a vector of field elements {v_i}, compute the vector {v_i^(-1)}
+///
+/// A scratchpad is used to avoid excessive allocations in the case that this method is
+/// called repeatedly.
+///
+/// Panics if any of the elements are zero
+pub fn batch_inverse_scratch_pad<F: ff::Field>(v: &mut [F], scratchpad: &mut Vec<F>) {
+    // Montgomery's Trick and Fast Implementation of Masked AES
     // Genelle, Prouff and Quisquater
     // Section 3.2
     // but with an optimization to multiply every element in the returned vector by coeff
 
+    // Clear the scratchpad and ensure it has enough capacity
+    scratchpad.clear();
+    scratchpad.reserve(v.len());
+
     // First pass: compute [a, ab, abc, ...]
-    let mut prod = Vec::with_capacity(v.len());
     let mut tmp = F::ONE;
-    for f in v.iter().filter(|f| !f.is_zero_vartime()) {
+    for f in v.iter() {
         tmp.mul_assign(f);
-        prod.push(tmp);
+        scratchpad.push(tmp);
     }
 
-    assert_eq!(prod.len(), v.len(), "inversion by zero is not allowed");
-
     // Invert `tmp`.
     tmp = tmp
         .invert()
@@ -25,14 +39,12 @@ pub fn batch_inverse<F: ff::Field>(v: &mut [F]) {
         .iter_mut()
         // Backwards
         .rev()
-        // Ignore normalized elements
-        .filter(|f| !f.is_zero_vartime())
         // Backwards, skip last element, fill in one for last term.
-        .zip(prod.into_iter().rev().skip(1).chain(Some(F::ONE)))
+        .zip(scratchpad.iter().rev().skip(1).chain(Some(&F::ONE)))
     {
         // tmp := tmp * f; f := tmp * s = 1/f
         let new_tmp = tmp * *f;
-        *f = tmp * s;
+        *f = tmp * *s;
         tmp = new_tmp;
     }
 }

From 7589402fb772c596b9af5c8e5e5e4beb042c561a Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 17:18:17 +0100
Subject: [PATCH 08/25] remove now unused method: scratchpad does not make it
 faster than multi_batch method

---
 cryptography/bls12_381/src/batch_add.rs | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/cryptography/bls12_381/src/batch_add.rs b/cryptography/bls12_381/src/batch_add.rs
index 1009b7d4..6a26d636 100644
--- a/cryptography/bls12_381/src/batch_add.rs
+++ b/cryptography/bls12_381/src/batch_add.rs
@@ -83,14 +83,6 @@ pub fn batch_addition_mut(points: &mut [G1Affine]) -> G1Affine {
     points[0]
 }
 
-pub fn naive_multi_batch_addition(mut multi_points: Vec<Vec<G1Affine>>) -> Vec<G1Affine> {
-    // TODO: Try using a scratch pad here
-    multi_points
-        .into_iter()
-        .map(|points| batch_addition(points))
-        .collect()
-}
-
 // Similar to batch addition, however we amortize across different batches
 // TODO: Clean up -- This has a greater complexity than the regular algorithm
 // TODO so we want to check if it makes a difference in our usecase.

From a4b66f3bf35693637f438217a4978b285f6f4911 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 17:47:44 +0100
Subject: [PATCH 09/25] fix: remove empty buckets

---
 cryptography/bls12_381/src/msm.rs | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cryptography/bls12_381/src/msm.rs b/cryptography/bls12_381/src/msm.rs
index 7f30422e..9a646b96 100644
--- a/cryptography/bls12_381/src/msm.rs
+++ b/cryptography/bls12_381/src/msm.rs
@@ -203,12 +203,11 @@ pub fn msm_best2_noinfo(
     // or use the optimized formulas
     // let mut all_points = Vec::new();
     // let mut bucket_indices = Vec::new();
-    // TODO: This should return the points too, ie skip the empty buckets
-    let bucket_indices: Vec<_> = all_information
-        .iter()
+    let (bucket_indices, all_information): (Vec<_>, Vec<_>) = all_information
+        .into_iter()
         .enumerate()
         .filter(|(_, points)| !points.is_empty())
-        .map(|(index, _)| (index + 1) as u64)
+        .map(|(index, points)| (((index + 1) as u64), points))
         .collect();
 
     let buckets_added = crate::batch_add::multi_batch_addition(all_information);

From a8eb053d179ff7dc9c4e691341a91deef8edfdf6 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 19:24:49 +0100
Subject: [PATCH 10/25] use iterator

---
 cryptography/bls12_381/src/msm.rs | 45 +++++++++++++++----------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/cryptography/bls12_381/src/msm.rs b/cryptography/bls12_381/src/msm.rs
index 9a646b96..8cd32f63 100644
--- a/cryptography/bls12_381/src/msm.rs
+++ b/cryptography/bls12_381/src/msm.rs
@@ -103,29 +103,28 @@ pub fn msm_best2(
     //
     // Note: for duplicate points, we could either put them in the running sum
     // or use the optimized formulas
-    let mut all_points = Vec::new();
-    let mut bucket_indices = Vec::new();
-    for (bucket_idx, points) in all_information.into_iter().enumerate() {
-        if points.is_empty() {
-            continue;
-        }
-
-        // batch add each bucket
-        let res: Vec<_> = points
-            .into_iter()
-            .map(|point_info| {
-                let mut p = bases_precomputed[point_info.base_idx as usize];
-                if !point_info.sign {
-                    p = -p;
-                }
-                p
-            })
-            .collect();
-        // TODO: We should make sure that we cannot get two points being added together or
-        // TODO: have the formula deal with it
-        all_points.push(res);
-        bucket_indices.push((bucket_idx + 1) as u64); // Add one here since the zeroth bucket will bucket_1, bucket_K eventually translates to K * sum_of_bucket
-    }
+    let (all_points, bucket_indices): (Vec<Vec<_>>, Vec<u64>) = all_information
+        .into_iter()
+        .enumerate()
+        .filter_map(|(bucket_idx, points)| {
+            if points.is_empty() {
+                None
+            } else {
+                let res: Vec<_> = points
+                    .into_iter()
+                    .map(|point_info| {
+                        let mut p = bases_precomputed[point_info.base_idx as usize];
+                        if !point_info.sign {
+                            p = -p;
+                        }
+                        p
+                    })
+                    .collect();
+
+                Some((res, (bucket_idx + 1) as u64))
+            }
+        })
+        .unzip();
 
     let buckets_added = crate::batch_add::multi_batch_addition(all_points);
 

From 6ca6224839651531c2e10eda9fb1bad9003eb523 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 19:29:03 +0100
Subject: [PATCH 11/25] rename module

---
 .../{msm.rs => fixed_base_msm_pippenger.rs}   | 110 +-----------------
 cryptography/bls12_381/src/lib.rs             |   4 +-
 2 files changed, 3 insertions(+), 111 deletions(-)
 rename cryptography/bls12_381/src/{msm.rs => fixed_base_msm_pippenger.rs} (74%)

diff --git a/cryptography/bls12_381/src/msm.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
similarity index 74%
rename from cryptography/bls12_381/src/msm.rs
rename to cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index 8cd32f63..b6734779 100644
--- a/cryptography/bls12_381/src/msm.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -8,15 +8,6 @@ use crate::booth_encoding::get_booth_index;
 use crate::g1_batch_normalize;
 use crate::G1Point;
 
-#[derive(Debug, Clone, Copy)]
-pub struct Info {
-    base_idx: u64,
-    bucket_idx: u64,
-    // We use precomputations which removes the window_idx
-    // window_idx: u64,
-    sign: bool,
-}
-
 pub fn precompute(
     window_size: usize,
     number_of_windows: usize,
@@ -40,105 +31,6 @@ pub fn precompute(
     g1_batch_normalize(&results)
 }
 
-// Note: This does not work if the input points are [P, -P] for example
-// We could iterate for that case, but its unlikely given the points are random
-pub fn msm_best2(
-    coeffs: &[Scalar],
-    bases_precomputed: &[G1Point],
-    window_size: usize,
-) -> G1Projective {
-    // assert_eq!(coeffs.len(), bases.len());
-
-    let c = window_size;
-
-    // coeffs to byte representation
-    let coeffs: Vec<_> = coeffs.iter().map(|a| a.to_bytes_le()).collect();
-
-    // Information on the points we want to add
-    let mut all_information = vec![vec![]; 1 << (c - 1)];
-
-    // number of windows
-    let number_of_windows = Scalar::NUM_BITS as usize / c + 1;
-
-    for window_idx in 0..number_of_windows {
-        for (base_idx, coeff) in coeffs.iter().enumerate() {
-            let buck_idx = get_booth_index(window_idx, c, coeff.as_ref());
-
-            if buck_idx != 0 {
-                // parse bucket index
-                let sign = buck_idx.is_positive();
-                let buck_idx = buck_idx.unsigned_abs() as usize - 1;
-                //
-                // Since we are using precomputed points, the base_idx is augmented
-                //
-                // We need to modify the base index to take into account:
-                // - The window, so we fetch the precomputed base for that window
-                // - The position of the point in the precomputed bases,
-                // relative to the original bases vector
-                //
-                // If you imagine we had:
-                // [P1, P2, P3]
-                // precomp = [P1, c*P1,..., (num_window-1)*c*P1, P2,...]
-                //
-                // The index of P1, P2, etc can be computed by:
-                // augmented_base_idx = base_idx * num_windows
-                // Then in order to get the correct point, we do:
-                // augmented_base_idx += window_idx
-                let base_idx = (base_idx * number_of_windows) + window_idx;
-
-                let info = Info {
-                    bucket_idx: buck_idx as u64,
-                    sign,
-                    base_idx: base_idx as u64,
-                };
-
-                all_information[buck_idx].push(info);
-            }
-        }
-    }
-
-    // All of the above costs about 200 microseconds on 64 points.
-    // Using a vector is about 3 times faster, but the points are not ordered by bucket index
-    // so we could try and do a second pass on the vector to see if thats quicker for small numPoints
-    //
-    // Note: for duplicate points, we could either put them in the running sum
-    // or use the optimized formulas
-    let (all_points, bucket_indices): (Vec<Vec<_>>, Vec<u64>) = all_information
-        .into_iter()
-        .enumerate()
-        .filter_map(|(bucket_idx, points)| {
-            if points.is_empty() {
-                None
-            } else {
-                let res: Vec<_> = points
-                    .into_iter()
-                    .map(|point_info| {
-                        let mut p = bases_precomputed[point_info.base_idx as usize];
-                        if !point_info.sign {
-                            p = -p;
-                        }
-                        p
-                    })
-                    .collect();
-
-                Some((res, (bucket_idx + 1) as u64))
-            }
-        })
-        .unzip();
-
-    let buckets_added = crate::batch_add::multi_batch_addition(all_points);
-
-    subsum_accumulation(&bucket_indices, &buckets_added)
-    // Now we have all of the information needed
-    // The precomputations that we did, effectively allowed us
-    // to remove the notion of a "window" -- there is only
-    // one window, effectively.
-    //
-    // Note: For 64 points, this is about 3200 elements.
-    //
-    // Do some more preprocessing to reduce the work needed
-}
-
 pub fn msm_best2_noinfo(
     coeffs: &[Scalar],
     bases_precomputed: &[G1Point],
@@ -325,7 +217,7 @@ fn horners_rule_sum(points: &[G1Point]) -> G1Projective {
 mod test {
 
     use crate::{
-        msm::{horners_rule_sum, msm_best2, precompute},
+        fixed_base_msm_pippenger::{horners_rule_sum, msm_best2_noinfo as msm_best2, precompute},
         G1Point, G1Projective, Scalar,
     };
 
diff --git a/cryptography/bls12_381/src/lib.rs b/cryptography/bls12_381/src/lib.rs
index f0527015..7d349584 100644
--- a/cryptography/bls12_381/src/lib.rs
+++ b/cryptography/bls12_381/src/lib.rs
@@ -2,15 +2,15 @@ mod batch_add;
 pub mod batch_inversion;
 mod booth_encoding;
 pub mod fixed_base_msm;
+pub mod fixed_base_msm_pippenger;
 pub mod lincomb;
-pub mod msm; // TODO: rename since msm is ambiguous
-             // Re-exporting the blstrs crate
 
 // Re-export ff and group, so other crates do not need to directly import(and independently version) them
 pub use ff;
 pub use group;
 use group::{prime::PrimeCurveAffine, Curve};
 
+// Re-exporting the blstrs crate
 pub type G1Point = blstrs::G1Affine;
 pub type G1Projective = blstrs::G1Projective;
 

From 1b308bb7817190235cde43b2a89de9d1c7438765 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 19:34:50 +0100
Subject: [PATCH 12/25] add struct for fixed base pippenger

---
 .../bls12_381/src/fixed_base_msm_pippenger.rs | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index b6734779..8bd5ad8b 100644
--- a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -8,6 +8,31 @@ use crate::booth_encoding::get_booth_index;
 use crate::g1_batch_normalize;
 use crate::G1Point;
 
+pub struct FixedBaseMSMPippenger {
+    precomputed_points: Vec<G1Affine>,
+    number_of_windows: usize,
+    window_size: usize,
+}
+
+impl FixedBaseMSMPippenger {
+    pub fn new(points: &[G1Affine]) -> FixedBaseMSMPippenger {
+        // The +2 was empirically seen to give better results
+        let window_size = (f64::from(points.len() as u32)).ln().ceil() as usize + 2;
+        let number_of_windows = Scalar::NUM_BITS as usize / window_size + 1;
+        let precomputed_points = precompute(window_size, number_of_windows, points);
+
+        FixedBaseMSMPippenger {
+            precomputed_points,
+            number_of_windows,
+            window_size,
+        }
+    }
+
+    pub fn msm(&self, scalars: &[Scalar]) -> G1Projective {
+        msm_best2_noinfo(scalars, &self.precomputed_points, self.window_size)
+    }
+}
+
 pub fn precompute(
     window_size: usize,
     number_of_windows: usize,

From 350d3f4de7b6950104ccffc03136a1c3e744b435 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 19:52:10 +0100
Subject: [PATCH 13/25] remove commented line

---
 cryptography/bls12_381/src/batch_add.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cryptography/bls12_381/src/batch_add.rs b/cryptography/bls12_381/src/batch_add.rs
index 6a26d636..e962e986 100644
--- a/cryptography/bls12_381/src/batch_add.rs
+++ b/cryptography/bls12_381/src/batch_add.rs
@@ -146,7 +146,6 @@ pub fn multi_batch_addition(mut multi_points: Vec<Vec<G1Affine>>) -> Vec<G1Affin
                 for k in (0..points.len()).step_by(stride * 2) {
                     if k + stride < points.len() {
                         new_differences.push(choose_add_or_double(points[k + stride], points[k]));
-                        // new_differences.push(points[k + stride].x() - points[k].x());
                     }
                 }
 

From 9a8fd9abbd4e0ff0563fbbd6c1c679717ffac429 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Mon, 2 Sep 2024 19:53:09 +0100
Subject: [PATCH 14/25] update benchmarks

---
 cryptography/bls12_381/benches/benchmark.rs | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/cryptography/bls12_381/benches/benchmark.rs b/cryptography/bls12_381/benches/benchmark.rs
index 4ab34db5..36a29a7d 100644
--- a/cryptography/bls12_381/benches/benchmark.rs
+++ b/cryptography/bls12_381/benches/benchmark.rs
@@ -2,10 +2,10 @@ use crate_crypto_internal_eth_kzg_bls12_381::{
     batch_inversion,
     ff::Field,
     fixed_base_msm::{FixedBaseMSM, UsePrecomp},
+    fixed_base_msm_pippenger::FixedBaseMSMPippenger,
     g1_batch_normalize, g2_batch_normalize,
     group::Group,
     lincomb::{g1_lincomb, g1_lincomb_unsafe, g2_lincomb, g2_lincomb_unsafe},
-    msm::{msm_best2, msm_best2_noinfo, precompute},
     G1Projective, G2Projective, Scalar,
 };
 use criterion::{criterion_group, criterion_main, Criterion};
@@ -36,18 +36,12 @@ pub fn fixed_base_msm(c: &mut Criterion) {
         b.iter(|| fbm.msm(scalars.clone()))
     });
 
-    use crate_crypto_internal_eth_kzg_bls12_381::ff::PrimeField;
-    let window_size = (f64::from(length as u32)).ln().ceil() as usize + 2;
-    let number_of_windows = Scalar::NUM_BITS as usize / window_size + 1;
+    let fixed_base_pip = FixedBaseMSMPippenger::new(&generators);
 
-    let precomp_bases = precompute(window_size, number_of_windows, &generators);
-
-    c.bench_function("bls12_381 fixed_base_msm best2 no info", |b| {
-        b.iter(|| msm_best2_noinfo(&scalars, &precomp_bases, window_size))
-    });
-    c.bench_function("bls12_381 fixed_base_msm using pippenger bes32", |b| {
-        b.iter(|| msm_best2(&scalars, &precomp_bases, window_size))
-    });
+    c.bench_function(
+        "bls12_381 fixed_base_msm best2 no info (fixed base pip)",
+        |b| b.iter(|| fixed_base_pip.msm(&scalars)),
+    );
 }
 
 pub fn bench_msm(c: &mut Criterion) {

From aa2a8bccda22dc0a421ee816f56a1ebb914aa0f2 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 11:04:05 +0100
Subject: [PATCH 15/25] commit bos-coster

---
 .../bls12_381/src/fixed_base_msm_pippenger.rs | 208 +++++++++++++++++-
 1 file changed, 206 insertions(+), 2 deletions(-)

diff --git a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index 8bd5ad8b..a917bda5 100644
--- a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -1,6 +1,7 @@
 use blstrs::G1Affine;
 use blstrs::G1Projective;
 use blstrs::Scalar;
+use ff::Field;
 use ff::PrimeField;
 use group::Group;
 
@@ -127,8 +128,195 @@ pub fn msm_best2_noinfo(
         .collect();
 
     let buckets_added = crate::batch_add::multi_batch_addition(all_information);
+    let res = subsum_accumulation(&bucket_indices, &buckets_added);
+    res
+}
 
-    subsum_accumulation(&bucket_indices, &buckets_added)
+/// Computes several fixed-base MSMs against the same precomputed bases in one pass.
+///
+/// Each row of `matrix_coeffs` is an independent scalar vector and entry `i` of the
+/// returned vector is the MSM for row `i`. The precomputed point for base `b` and
+/// window `w` is read from `bases_precomputed[b * number_of_windows + w]`, where
+/// `number_of_windows = Scalar::NUM_BITS / window_size + 1`.
+///
+/// The non-empty buckets of every row are summed by a single call to
+/// `multi_batch_addition`, which amortizes its work across all of the batches.
+///
+/// Panics if `window_size` is zero, or if a needed index is outside of
+/// `bases_precomputed`.
+pub fn multi_msm(
+    matrix_coeffs: &[&[Scalar]],
+    bases_precomputed: &[G1Point],
+    window_size: usize,
+) -> Vec<G1Projective> {
+    let c = window_size;
+    // Signed (booth) digits mean each MSM only needs 2^(c-1) buckets.
+    let buckets_per_msm = 1 << (c - 1);
+    // number of windows
+    let number_of_windows = Scalar::NUM_BITS as usize / c + 1;
+
+    // coeffs to byte representation
+    let matrix_coeffs: Vec<_> = matrix_coeffs
+        .iter()
+        .map(|a| {
+            a.iter()
+                .map(|coeff| coeff.to_bytes_le())
+                .collect::<Vec<_>>()
+        })
+        .collect();
+
+    // One group of buckets per MSM, laid out back to back:
+    // bucket `b` of MSM `m` lives at index `m * buckets_per_msm + b`.
+    let mut all_information = vec![vec![]; buckets_per_msm * matrix_coeffs.len()];
+
+    for window_idx in 0..number_of_windows {
+        for (msm_index, coeffs) in matrix_coeffs.iter().enumerate() {
+            for (base_idx, coeff) in coeffs.iter().enumerate() {
+                let buck_idx = get_booth_index(window_idx, c, coeff.as_ref());
+                if buck_idx == 0 {
+                    continue;
+                }
+
+                // parse bucket index
+                let sign = buck_idx.is_positive();
+                let buck_idx = buck_idx.unsigned_abs() as usize - 1;
+
+                // Since we are using precomputed points, the base_idx is augmented
+                // to take into account:
+                // - The window, so we fetch the precomputed base for that window
+                // - The position of the point in the precomputed bases,
+                //   relative to the original bases vector
+                //
+                // If you imagine we had:
+                // [P1, P2, P3]
+                // precomp = [P1, c*P1,..., (num_window-1)*c*P1, P2,...]
+                //
+                // then the point for (base_idx, window_idx) is found at:
+                // base_idx * num_windows + window_idx
+                let base_idx = (base_idx * number_of_windows) + window_idx;
+
+                let point = if sign {
+                    bases_precomputed[base_idx]
+                } else {
+                    -bases_precomputed[base_idx]
+                };
+
+                all_information[msm_index * buckets_per_msm + buck_idx].push(point);
+            }
+        }
+    }
+
+    // Drop the empty buckets, but keep every non-empty bucket as its own batch so
+    // that there is one summed point per bucket. For each MSM we remember which
+    // (1-based) bucket indices survived; `subsum_accumulation` pairs each of them
+    // with the corresponding bucket sum.
+    //
+    // Merging all points of an MSM into a single batch would collapse the buckets
+    // into one sum and no longer line up with the per-bucket indices.
+    let mut chunked_bucket_indices: Vec<Vec<u64>> = vec![Vec::new(); matrix_coeffs.len()];
+    let mut non_empty_buckets = Vec::new();
+    for (flat_idx, points) in all_information.into_iter().enumerate() {
+        if points.is_empty() {
+            continue;
+        }
+        let bucket_idx = flat_idx % buckets_per_msm;
+        chunked_bucket_indices[flat_idx / buckets_per_msm].push((bucket_idx + 1) as u64);
+        non_empty_buckets.push(points);
+    }
+
+    let buckets_added = crate::batch_add::multi_batch_addition(non_empty_buckets);
+
+    // Sums are expected back in input order (MSM by MSM), so each MSM gets its own slice.
+    let mut result = Vec::with_capacity(chunked_bucket_indices.len());
+    let mut start = 0;
+    for bucket_indices in chunked_bucket_indices {
+        let end = start + bucket_indices.len();
+        result.push(subsum_accumulation(&bucket_indices, &buckets_added[start..end]));
+        start = end;
+    }
+    result
+}
+
+use ruint::aliases::*;
+use ruint::Uint;
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
+
+#[derive(Debug, Clone, Copy)]
+struct PointScalar {
+    scalar: U256,
+    point: G1Projective,
+}
+
+impl PartialEq for PointScalar {
+    fn eq(&self, other: &Self) -> bool {
+        self.scalar == other.scalar && self.point == other.point
+    }
+}
+
+impl Eq for PointScalar {}
+
+impl PartialOrd for PointScalar {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for PointScalar {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.scalar.cmp(&other.scalar)
+    }
+}
+
+pub fn bos_coster(scalars: &[U256], points: &[G1Projective]) -> G1Projective {
+    if scalars.len() != points.len() {
+        panic!("Mismatch between number of scalars and points");
+    }
+
+    let mut heap = BinaryHeap::new();
+    for (scalar, point) in scalars.into_iter().zip(points.iter()) {
+        if *scalar != U256::ZERO {
+            heap.push(PointScalar {
+                scalar: *scalar,
+                point: *point,
+            });
+        }
+    }
+
+    while heap.len() > 1 {
+        let PointScalar {
+            scalar: n1,
+            point: p1,
+        } = heap.pop().unwrap();
+        let PointScalar {
+            scalar: n2,
+            point: p2,
+        } = heap.pop().unwrap();
+
+        let p_sum = p1 + &p2;
+        let n_diff = n1 - n2;
+
+        if n_diff > U256::ZERO {
+            heap.push(PointScalar {
+                scalar: n_diff,
+                point: p1.clone(),
+            });
+        }
+        heap.push(PointScalar {
+            scalar: n2,
+            point: p_sum,
+        });
+    }
+
+    if let Some(PointScalar {
+        scalar: n,
+        point: p,
+    }) = heap.pop()
+    {
+        p * Scalar::from_bytes_be(&n.to_be_bytes()).unwrap()
+    } else {
+        G1Projective::identity() // Identity point
+    }
 }
 
 // Algorithm1 from the LFG paper
@@ -248,8 +436,9 @@ mod test {
 
     use blstrs::G1Affine;
     use group::{prime::PrimeCurveAffine, Group};
+    use ruint::aliases::U256;
 
-    use super::subsum_accumulation;
+    use super::{bos_coster, subsum_accumulation, DebugPoint};
 
     #[test]
     fn subsum_smoke_test() {
@@ -332,6 +521,21 @@ mod test {
         assert_eq!(res, naive_msm(&input_points, &input_scalars));
     }
 
+    #[test]
+    fn smoke_test_bos_coster() {
+        let input_points = vec![G1Point::generator(), G1Point::generator()];
+
+        // let input_scalars = vec![Scalar::from(1), Scalar::from(2)];
+        let input_scalars = vec![U256::from(100), U256::from(200)];
+
+        let res = crate::fixed_base_msm_pippenger::bos_coster(
+            &input_scalars,
+            &[G1Projective::generator(), G1Projective::generator()],
+        );
+        let input_scalars = vec![Scalar::from(100), Scalar::from(200)];
+        assert_eq!(res, naive_msm(&input_points, &input_scalars));
+    }
+
     #[test]
     fn smoke_test_msm_best2_double_scalar() {
         use crate::ff::PrimeField;

From e0b44d05da067d99a088fcd71cd320bcff5f0b59 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 11:09:11 +0100
Subject: [PATCH 16/25] bos-coster benchmarks

---
 cryptography/bls12_381/benches/benchmark.rs | 23 ++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/cryptography/bls12_381/benches/benchmark.rs b/cryptography/bls12_381/benches/benchmark.rs
index 36a29a7d..2e000af8 100644
--- a/cryptography/bls12_381/benches/benchmark.rs
+++ b/cryptography/bls12_381/benches/benchmark.rs
@@ -1,3 +1,4 @@
+use blstrs::Scalar;
 use crate_crypto_internal_eth_kzg_bls12_381::{
     batch_inversion,
     ff::Field,
@@ -6,9 +7,10 @@ use crate_crypto_internal_eth_kzg_bls12_381::{
     g1_batch_normalize, g2_batch_normalize,
     group::Group,
     lincomb::{g1_lincomb, g1_lincomb_unsafe, g2_lincomb, g2_lincomb_unsafe},
-    G1Projective, G2Projective, Scalar,
+    G1Projective, G2Projective,
 };
 use criterion::{criterion_group, criterion_main, Criterion};
+use ruint::aliases::U256;
 
 pub fn batch_inversion(c: &mut Criterion) {
     const NUM_ELEMENTS: usize = 8192;
@@ -45,15 +47,26 @@ pub fn fixed_base_msm(c: &mut Criterion) {
 }
 
 pub fn bench_msm(c: &mut Criterion) {
-    const NUM_G1_ELEMENTS: usize = 4096;
+    const NUM_G1_ELEMENTS: usize = 64;
 
     let polynomial_4096 = random_scalars(NUM_G1_ELEMENTS);
-    let g1_elements = random_g1_points(NUM_G1_ELEMENTS);
-    let g1_elements = g1_batch_normalize(&g1_elements);
+    let g1_elements_proj = random_g1_points(NUM_G1_ELEMENTS);
+    let g1_elements = g1_batch_normalize(&g1_elements_proj);
 
     c.bench_function(&format!("g1 msm of size {}", NUM_G1_ELEMENTS), |b| {
         b.iter(|| g1_lincomb_unsafe(&g1_elements, &polynomial_4096))
     });
+
+    let scalars_u256: Vec<_> = polynomial_4096
+        .iter()
+        .map(|p| U256::from_be_bytes(p.to_bytes_be()))
+        .collect();
+
+    c.bench_function(
+        &format!("bos-coster msm of size {}", NUM_G1_ELEMENTS),
+        |b| b.iter(|| bos_coster(&scalars_u256, &g1_elements_proj)),
+    );
+
     c.bench_function(&format!("g1 (safe) msm of size {}", NUM_G1_ELEMENTS), |b| {
         b.iter(|| g1_lincomb(&g1_elements, &polynomial_4096))
     });
@@ -94,5 +107,5 @@ fn random_g2_points(size: usize) -> Vec<G2Projective> {
     points
 }
 
-criterion_group!(benches, batch_inversion, fixed_base_msm, bench_msm);
+criterion_group!(benches, /*batch_inversion, fixed_base_msm, fixed_base_msm */ bench_msm);
 criterion_main!(benches);

From 67964cc28ca6cfb4d81ea4e437733644f2e46062 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 11:15:07 +0100
Subject: [PATCH 17/25] remove bos-coster

---
 cryptography/bls12_381/benches/benchmark.rs   |  11 --
 .../bls12_381/src/fixed_base_msm_pippenger.rs | 100 +-----------------
 2 files changed, 1 insertion(+), 110 deletions(-)

diff --git a/cryptography/bls12_381/benches/benchmark.rs b/cryptography/bls12_381/benches/benchmark.rs
index 2e000af8..cf4a90fa 100644
--- a/cryptography/bls12_381/benches/benchmark.rs
+++ b/cryptography/bls12_381/benches/benchmark.rs
@@ -10,7 +10,6 @@ use crate_crypto_internal_eth_kzg_bls12_381::{
     G1Projective, G2Projective,
 };
 use criterion::{criterion_group, criterion_main, Criterion};
-use ruint::aliases::U256;
 
 pub fn batch_inversion(c: &mut Criterion) {
     const NUM_ELEMENTS: usize = 8192;
@@ -57,16 +56,6 @@ pub fn bench_msm(c: &mut Criterion) {
         b.iter(|| g1_lincomb_unsafe(&g1_elements, &polynomial_4096))
     });
 
-    let scalars_u256: Vec<_> = polynomial_4096
-        .iter()
-        .map(|p| U256::from_be_bytes(p.to_bytes_be()))
-        .collect();
-
-    c.bench_function(
-        &format!("bos-coster msm of size {}", NUM_G1_ELEMENTS),
-        |b| b.iter(|| bos_coster(&scalars_u256, &g1_elements_proj)),
-    );
-
     c.bench_function(&format!("g1 (safe) msm of size {}", NUM_G1_ELEMENTS), |b| {
         b.iter(|| g1_lincomb(&g1_elements, &polynomial_4096))
     });
diff --git a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index a917bda5..8df83bcb 100644
--- a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -237,88 +237,6 @@ pub fn multi_msm(
     result
 }
 
-use ruint::aliases::*;
-use ruint::Uint;
-use std::cmp::Reverse;
-use std::collections::BinaryHeap;
-
-#[derive(Debug, Clone, Copy)]
-struct PointScalar {
-    scalar: U256,
-    point: G1Projective,
-}
-
-impl PartialEq for PointScalar {
-    fn eq(&self, other: &Self) -> bool {
-        self.scalar == other.scalar && self.point == other.point
-    }
-}
-
-impl Eq for PointScalar {}
-
-impl PartialOrd for PointScalar {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl Ord for PointScalar {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        self.scalar.cmp(&other.scalar)
-    }
-}
-
-pub fn bos_coster(scalars: &[U256], points: &[G1Projective]) -> G1Projective {
-    if scalars.len() != points.len() {
-        panic!("Mismatch between number of scalars and points");
-    }
-
-    let mut heap = BinaryHeap::new();
-    for (scalar, point) in scalars.into_iter().zip(points.iter()) {
-        if *scalar != U256::ZERO {
-            heap.push(PointScalar {
-                scalar: *scalar,
-                point: *point,
-            });
-        }
-    }
-
-    while heap.len() > 1 {
-        let PointScalar {
-            scalar: n1,
-            point: p1,
-        } = heap.pop().unwrap();
-        let PointScalar {
-            scalar: n2,
-            point: p2,
-        } = heap.pop().unwrap();
-
-        let p_sum = p1 + &p2;
-        let n_diff = n1 - n2;
-
-        if n_diff > U256::ZERO {
-            heap.push(PointScalar {
-                scalar: n_diff,
-                point: p1.clone(),
-            });
-        }
-        heap.push(PointScalar {
-            scalar: n2,
-            point: p_sum,
-        });
-    }
-
-    if let Some(PointScalar {
-        scalar: n,
-        point: p,
-    }) = heap.pop()
-    {
-        p * Scalar::from_bytes_be(&n.to_be_bytes()).unwrap()
-    } else {
-        G1Projective::identity() // Identity point
-    }
-}
-
 // Algorithm1 from the LFG paper
 // TODO: Fix later, this algorithm is broken in the POC and the paper
 // fn subsum_accumulation(b: &[u64], s: &[G1Affine]) -> G1Projective {
@@ -436,9 +354,8 @@ mod test {
 
     use blstrs::G1Affine;
     use group::{prime::PrimeCurveAffine, Group};
-    use ruint::aliases::U256;
 
-    use super::{bos_coster, subsum_accumulation, DebugPoint};
+    use super::subsum_accumulation;
 
     #[test]
     fn subsum_smoke_test() {
@@ -521,21 +438,6 @@ mod test {
         assert_eq!(res, naive_msm(&input_points, &input_scalars));
     }
 
-    #[test]
-    fn smoke_test_bos_coster() {
-        let input_points = vec![G1Point::generator(), G1Point::generator()];
-
-        // let input_scalars = vec![Scalar::from(1), Scalar::from(2)];
-        let input_scalars = vec![U256::from(100), U256::from(200)];
-
-        let res = crate::fixed_base_msm_pippenger::bos_coster(
-            &input_scalars,
-            &[G1Projective::generator(), G1Projective::generator()],
-        );
-        let input_scalars = vec![Scalar::from(100), Scalar::from(200)];
-        assert_eq!(res, naive_msm(&input_points, &input_scalars));
-    }
-
     #[test]
     fn smoke_test_msm_best2_double_scalar() {
         use crate::ff::PrimeField;

From 0e7ec3ff68ce1e170edd5a603b297dadcc8af5e5 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 11:51:44 +0100
Subject: [PATCH 18/25] rename msm method

---
 cryptography/bls12_381/src/fixed_base_msm_pippenger.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index 8df83bcb..73f68a12 100644
--- a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -30,7 +30,7 @@ impl FixedBaseMSMPippenger {
     }
 
     pub fn msm(&self, scalars: &[Scalar]) -> G1Projective {
-        msm_best2_noinfo(scalars, &self.precomputed_points, self.window_size)
+        pippenger_fixed_base_msm(scalars, &self.precomputed_points, self.window_size)
     }
 }
 
@@ -57,7 +57,7 @@ pub fn precompute(
     g1_batch_normalize(&results)
 }
 
-pub fn msm_best2_noinfo(
+pub fn pippenger_fixed_base_msm(
     coeffs: &[Scalar],
     bases_precomputed: &[G1Point],
     window_size: usize,
@@ -348,7 +348,9 @@ fn horners_rule_sum(points: &[G1Point]) -> G1Projective {
 mod test {
 
     use crate::{
-        fixed_base_msm_pippenger::{horners_rule_sum, msm_best2_noinfo as msm_best2, precompute},
+        fixed_base_msm_pippenger::{
+            horners_rule_sum, pippenger_fixed_base_msm as msm_best2, precompute,
+        },
         G1Point, G1Projective, Scalar,
     };
 

From cb4dbbd76714a328aa0f189a387da1b86a551a08 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 15:00:28 +0100
Subject: [PATCH 19/25] fix: subsum calculation was not taking into account
 when the last bucket was not b_1

---
 .../bls12_381/src/fixed_base_msm_pippenger.rs | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index 73f68a12..7068244e 100644
--- a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -8,7 +8,7 @@ use group::Group;
 use crate::booth_encoding::get_booth_index;
 use crate::g1_batch_normalize;
 use crate::G1Point;
-
+#[derive(Debug, Clone)]
 pub struct FixedBaseMSMPippenger {
     precomputed_points: Vec<G1Affine>,
     number_of_windows: usize,
@@ -293,6 +293,8 @@ pub fn multi_msm(
 // It seems to be faster, but thats likely because the actual one is not implemented
 // correctly and does not have the short cuts for bucket sizes 0 and 1
 fn subsum_accumulation(b: &[u64], s: &[G1Affine]) -> G1Projective {
+    assert_eq!(b.len(), s.len());
+
     // If we only have one, then we can return the scalar multiplication
     // This is an assumption that LFG was making too.
     if b.len() == 0 {
@@ -321,6 +323,14 @@ fn subsum_accumulation(b: &[u64], s: &[G1Affine]) -> G1Projective {
             for _ in 0..diff {
                 res += running_sum
             }
+        } else {
+            // Check the difference between the last scalar and 1.
+            // Adding `running_sum` that many more times "finishes" the Horner sum.
+
+            let diff = b[index] - 1;
+            for _ in 0..diff {
+                res += running_sum
+            }
         }
     });
     res
@@ -395,6 +405,23 @@ mod test {
         );
     }
 
+    /// Reference result: sums `b[i] * s[i]` one scalar multiplication at a time.
+    fn naive_subsum_accumulation(b: &[u64], s: &[G1Affine]) -> G1Projective {
+        b.iter().zip(s).fold(G1Projective::identity(), |acc, (k, base)| {
+            let term = G1Projective::from(base) * Scalar::from(*k);
+            acc + term
+        })
+    }
+
+    #[test]
+    fn subsum_regression_test() {
+        // Regression case: none of the bucket indices equals 1 (the smallest one is 2).
+        let bucket_indices: [u64; 2] = [2, 3];
+        let bases = [G1Affine::generator(); 2];
+        let want = naive_subsum_accumulation(&bucket_indices, &bases);
+        assert_eq!(subsum_accumulation(&bucket_indices, &bases), want);
+    }
+
     #[test]
     fn horners_sum_smoke_test() {
         let result = horners_rule_sum(&[G1Affine::generator()]);

From 3574e14ecc298b73bd5cac3cdc9ec7316af0ec3e Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 17:18:14 +0100
Subject: [PATCH 20/25] use fixed base msm

---
 cryptography/bls12_381/src/fixed_base_msm.rs | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/cryptography/bls12_381/src/fixed_base_msm.rs b/cryptography/bls12_381/src/fixed_base_msm.rs
index 491ffc01..e3e77150 100644
--- a/cryptography/bls12_381/src/fixed_base_msm.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm.rs
@@ -1,4 +1,4 @@
-use crate::{G1Projective, Scalar};
+use crate::{fixed_base_msm_pippenger::FixedBaseMSMPippenger, G1Projective, Scalar};
 use blstrs::{Fp, G1Affine};
 
 /// FixedBaseMSMPrecomp computes a multi scalar multiplication using pre-computations.
@@ -28,7 +28,9 @@ pub enum UsePrecomp {
 #[derive(Debug)]
 pub enum FixedBaseMSM {
     Precomp(FixedBaseMSMPrecomp),
-    NoPrecomp(Vec<G1Affine>),
+    // TODO: We are hijacking the `NoPrecomp` variant to store the new
+    //       Pippenger algorithm, even though it does precompute points.
+    NoPrecomp(FixedBaseMSMPippenger),
 }
 
 impl FixedBaseMSM {
@@ -37,18 +39,14 @@ impl FixedBaseMSM {
             UsePrecomp::Yes { width } => {
                 FixedBaseMSM::Precomp(FixedBaseMSMPrecomp::new(generators, width))
             }
-            UsePrecomp::No => FixedBaseMSM::NoPrecomp(generators),
+            UsePrecomp::No => FixedBaseMSM::NoPrecomp(FixedBaseMSMPippenger::new(&generators)),
         }
     }
 
     pub fn msm(&self, scalars: Vec<Scalar>) -> G1Projective {
         match self {
             FixedBaseMSM::Precomp(precomp) => precomp.msm(scalars),
-            FixedBaseMSM::NoPrecomp(generators) => {
-                use crate::lincomb::g1_lincomb;
-                g1_lincomb(generators, &scalars)
-                    .expect("number of generators and scalars should be equal")
-            }
+            FixedBaseMSM::NoPrecomp(precomp) => precomp.msm(&scalars),
         }
     }
 }

From bd4cd72206483bf486870f10034f1ff1d66d1332 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 17:20:06 +0100
Subject: [PATCH 21/25] update benchmarks

---
 cryptography/bls12_381/benches/benchmark.rs | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/cryptography/bls12_381/benches/benchmark.rs b/cryptography/bls12_381/benches/benchmark.rs
index cf4a90fa..25803359 100644
--- a/cryptography/bls12_381/benches/benchmark.rs
+++ b/cryptography/bls12_381/benches/benchmark.rs
@@ -39,10 +39,9 @@ pub fn fixed_base_msm(c: &mut Criterion) {
 
     let fixed_base_pip = FixedBaseMSMPippenger::new(&generators);
 
-    c.bench_function(
-        "bls12_381 fixed_base_msm best2 no info (fixed base pip)",
-        |b| b.iter(|| fixed_base_pip.msm(&scalars)),
-    );
+    c.bench_function("bls12_381 fixed based pippenger algorithm", |b| {
+        b.iter(|| fixed_base_pip.msm(&scalars))
+    });
 }
 
 pub fn bench_msm(c: &mut Criterion) {
@@ -96,5 +95,11 @@ fn random_g2_points(size: usize) -> Vec<G2Projective> {
     points
 }
 
-criterion_group!(benches, /*batch_inversion, fixed_base_msm, fixed_base_msm */ bench_msm);
+criterion_group!(
+    benches,
+    batch_inversion,
+    fixed_base_msm,
+    // Each target is registered once; a repeated entry only re-runs the same benchmarks.
+    bench_msm
+);
 criterion_main!(benches);

From f8c3276634b24be650d51e90c26fcb3a5f2131d7 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 17:21:00 +0100
Subject: [PATCH 22/25] small cleanup

---
 cryptography/bls12_381/src/fixed_base_msm_pippenger.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index 7068244e..81b94874 100644
--- a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -1,7 +1,6 @@
 use blstrs::G1Affine;
 use blstrs::G1Projective;
 use blstrs::Scalar;
-use ff::Field;
 use ff::PrimeField;
 use group::Group;
 
@@ -11,7 +10,6 @@ use crate::G1Point;
 #[derive(Debug, Clone)]
 pub struct FixedBaseMSMPippenger {
     precomputed_points: Vec<G1Affine>,
-    number_of_windows: usize,
     window_size: usize,
 }
 
@@ -24,7 +22,6 @@ impl FixedBaseMSMPippenger {
 
         FixedBaseMSMPippenger {
             precomputed_points,
-            number_of_windows,
             window_size,
         }
     }

From 6850fec5d5f9031ca2aabaa7a10286e7bc2357d2 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Tue, 3 Sep 2024 17:23:12 +0100
Subject: [PATCH 23/25] remove TODO: We can assume that users will pass in
 empty vectors

---
 cryptography/bls12_381/src/batch_add.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cryptography/bls12_381/src/batch_add.rs b/cryptography/bls12_381/src/batch_add.rs
index e962e986..ecf5a3a0 100644
--- a/cryptography/bls12_381/src/batch_add.rs
+++ b/cryptography/bls12_381/src/batch_add.rs
@@ -179,8 +179,6 @@ pub fn multi_batch_addition(mut multi_points: Vec<Vec<G1Affine>>) -> Vec<G1Affin
         }
     }
 
-    // multi_points.into_iter().map(|points| points[0]).collect()
-    // TODO: using this incase we have empty vectors passed in
     multi_points
         .into_iter()
         .map(|points| points.get(0).copied().unwrap_or(G1Affine::default()))

From 6cd81e2383bcd415c8f420601f8beaf97328fed4 Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Wed, 4 Sep 2024 18:48:50 +0100
Subject: [PATCH 24/25] use -3: batch_addition does not allow us to have
 duplicate points or points that are negations of each other

---
 cryptography/kzg_multi_open/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cryptography/kzg_multi_open/src/lib.rs b/cryptography/kzg_multi_open/src/lib.rs
index 6fac861d..68aa2211 100644
--- a/cryptography/kzg_multi_open/src/lib.rs
+++ b/cryptography/kzg_multi_open/src/lib.rs
@@ -28,7 +28,7 @@ pub(crate) fn create_insecure_commit_opening_keys(
     let g1_gen = G1Projective::generator();
 
     let mut g1_points = Vec::new();
-    let secret = -Scalar::from(1 as u64);
+    let secret = -Scalar::from(3 as u64);
     let mut current_secret_pow = Scalar::ONE;
     for _ in 0..num_coefficients_in_polynomial {
         g1_points.push(g1_gen * current_secret_pow);
@@ -39,7 +39,7 @@ pub(crate) fn create_insecure_commit_opening_keys(
     let ck = CommitKey::new(g1_points.clone());
 
     let mut g2_points = Vec::new();
-    let secret = -Scalar::from(1 as u64);
+    let secret = -Scalar::from(3 as u64);
     let mut current_secret_pow = Scalar::ONE;
     let g2_gen = G2Projective::generator();
     // The setup needs 65 g1 elements for the opening key, in order

From 328fe47468145403220edc2b39340d15b0b5165e Mon Sep 17 00:00:00 2001
From: Kevaundray Wedderburn <kevtheappdev@gmail.com>
Date: Wed, 4 Sep 2024 18:57:27 +0100
Subject: [PATCH 25/25] check for invariants in the constructor

---
 .../bls12_381/src/fixed_base_msm_pippenger.rs | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
index 81b94874..8e59e653 100644
--- a/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
+++ b/cryptography/bls12_381/src/fixed_base_msm_pippenger.rs
@@ -1,3 +1,5 @@
+use std::collections::HashSet;
+
 use blstrs::G1Affine;
 use blstrs::G1Projective;
 use blstrs::Scalar;
@@ -20,6 +22,34 @@ impl FixedBaseMSMPippenger {
         let number_of_windows = Scalar::NUM_BITS as usize / window_size + 1;
         let precomputed_points = precompute(window_size, number_of_windows, points);
 
+        // Check that the points are not duplicated since the batch addition formula
+        // assumes that all points are distinct.
+        let points_set: HashSet<_> = precomputed_points
+            .iter()
+            .map(|p| p.to_uncompressed())
+            .collect();
+        assert_eq!(
+            points_set.len(),
+            precomputed_points.len(),
+            "precomputed points should not contain any duplicates. Each input point is scaled by 2^k, so if points were generated by going s^k*G, please ensure s is not a power of 2"
+        );
+
+        // Check that no point is the negation of another point,
+        // as the batch addition formula assumes that distinct points have
+        // distinct `x` coordinates.
+        for i in 0..precomputed_points.len() {
+            for k in 0..precomputed_points.len() {
+                if i == k {
+                    continue;
+                }
+                // Check that point is not the negation of other point
+                assert!(
+                    precomputed_points[i] != -precomputed_points[k],
+                    "point and its negation are not allowed in the same MSM. Each input point is scaled by 2^k, so if points were generated by going s^k*G, please ensure s is not a power of 2"
+                )
+            }
+        }
+
         FixedBaseMSMPippenger {
             precomputed_points,
             window_size,