Attempt proper SIMD unweave

Beinsezii · Beinsezii · commit ad60e5d76614 · 2024-06-03T16:42:26.000-07:00
Fail. 1000x slower
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,7 +15,7 @@ publish = true
 
 [features]
 nightly = []
-default = []
+default = ["nightly"]
 
 [dependencies]
 
diff --git a/benches/conversions.rs b/benches/conversions.rs
@@ -27,7 +27,20 @@ pub fn conversions(c: &mut Criterion) {
     } ));
 
     c.bench_function("lrgb_to_xyz_simd", |b| b.iter(|| {
-        black_box(pixels.clone().as_simd_mut::<32>().1.chunks_exact_mut(3).for_each(|pixel| colcon::lrgb_to_xyz(pixel.try_into().unwrap())));
+        black_box({
+            // Deinterleave a copy of the pixels into per-channel SIMD buffers.
+            let pixels_simd = pixels.clone();
+            let [mut rs, mut gs, mut bs] = colcon::unweave_simd::<3, 8>(&pixels_simd);
+
+            for (r, (g, b)) in rs.1.iter_mut().zip(gs.1.iter_mut().zip(bs.1.iter_mut())) {
+                let mut arr = [*r, *g, *b];
+                colcon::lrgb_to_xyz(&mut arr);
+                // Store the converted vectors back; `arr` is only a copy.
+                [*r, *g, *b] = arr;
+            }
+            // TODO: weave the channels back into an interleaved buffer (`colcon::weave`).
+            [rs, gs, bs]
+        });
     } ));
 
     c.bench_function("xyz_to_cielab", |b| b.iter(|| {
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,4 +1,5 @@
 #![cfg_attr(feature = "nightly", feature(portable_simd))]
+#![cfg_attr(feature = "nightly", feature(slice_as_chunks))]
 #![warn(missing_docs)]
 
 //! Simple colorspace conversions in pure Rust.
@@ -133,6 +134,41 @@ where
     }
 }
 
+#[cfg(feature = "nightly")]
+/// Create an array of separate channel buffers from a single interwoven buffer.
+/// Copies the data.
+///
+/// `C` is the number of interleaved channels and `L` the SIMD lane count.
+/// Each channel is returned as `(remainder, vectors)`: `vectors` holds one
+/// `Simd` per complete group of `L` pixels, and `remainder` holds that
+/// channel's scalar samples from the trailing elements that do not fill a
+/// whole group, in their original order.
+///
+/// # Panics
+///
+/// Panics if `C` is zero.
+pub fn unweave_simd<'a, const C: usize, const L: usize>(slice: &[f32]) -> [(Box<[f32]>, Box<[Simd<f32, L>]>); C]
+where
+    LaneCount<L>: SupportedLaneCount,
+{
+    // `chunks_exact` yields only complete groups of `L` pixels; the rest is the tail.
+    let groups = slice.chunks_exact(C * L);
+    let tail = groups.remainder();
+
+    core::array::from_fn(|c| {
+        // Channel `c` of pixel `l` sits at `l * C + c` within a group. Lanes are
+        // gathered into a stack array, avoiding a heap allocation per vector.
+        let vectors: Box<[Simd<f32, L>]> = groups
+            .clone()
+            .map(|group| Simd::from_array(core::array::from_fn(|l| group[l * C + c])))
+            .collect();
+        // The tail is still interleaved: take every `C`-th element starting at `c`.
+        let remainder: Box<[f32]> = tail.iter().skip(c).step_by(C).copied().collect();
+
+        (remainder, vectors)
+    })
+}
+
 /// Create an array of separate channel buffers from a single interwoven buffer.
 /// Copies the data.
 pub fn unweave<const N: usize>(slice: &[f32]) -> [Box<[f32]>; N] {