Convert lrgb_to_xyz to DType

Beinsezii · Beinsezii · commit 34d3a8649a05 · 2024-06-02T23:31:28.000-07:00
Should be a best-case scenario. Literally just element-wise FMA.

Almost 30% less time: 107µs down to 77µs on arch=native

It's *cool*, yes, but the code quality degrades so much I wonder if it's
even worth it. Then when you factor in the complex 3-dimension
deinterleave that'll be needed to use it properly...

I still have to test it of course, but I just feel the deinterleave will
eat what little perf I gain. I have AVX512 as well, so narrower targets
(AVX≤2) will probably end up hurting even more.
diff --git a/benches/conversions.rs b/benches/conversions.rs
@@ -1,6 +1,7 @@
 #![feature(portable_simd)]
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use colcon::{Space, convert_space};
+//use std::simd::prelude::*;
 
 fn pixels() -> Box<[f32]> {
     let size = 512;
@@ -25,6 +26,10 @@ pub fn conversions(c: &mut Criterion) {
         black_box(pixels.clone().chunks_exact_mut(3).for_each(|pixel| colcon::lrgb_to_xyz(pixel.try_into().unwrap())));
     } ));
 
+    c.bench_function("lrgb_to_xyz_simd", |b| b.iter(|| {
+        black_box(pixels.clone().as_simd_mut::<32>().1.chunks_exact_mut(3).for_each(|pixel| colcon::lrgb_to_xyz(pixel.try_into().unwrap())));
+    } ));
+
     c.bench_function("xyz_to_cielab", |b| b.iter(|| {
         black_box(pixels.clone().chunks_exact_mut(3).for_each(|pixel| colcon::xyz_to_cielab(pixel.try_into().unwrap())));
     } ));
diff --git a/src/lib.rs b/src/lib.rs
@@ -288,11 +288,11 @@ fn matmul3t(pixel: [f32; 3], matrix: [[f32; 3]; 3]) -> [f32; 3] {
 }
 
 /// Transposed 3 * 3x3 matrix multiply, ie matrix @ pixel
-fn matmul3(matrix: [[f32; 3]; 3], pixel: [f32; 3]) -> [f32; 3] {
+fn matmul3<T: DType>(matrix: [[f32; 3]; 3], pixel: [T; 3]) -> [T; 3] {
     [
-        pixel[0] * matrix[0][0] + pixel[1] * matrix[0][1] + pixel[2] * matrix[0][2],
-        pixel[0] * matrix[1][0] + pixel[1] * matrix[1][1] + pixel[2] * matrix[1][2],
-        pixel[0] * matrix[2][0] + pixel[1] * matrix[2][1] + pixel[2] * matrix[2][2],
+        pixel[0].fma(DType::f32(matrix[0][0]), pixel[1].fma(DType::f32(matrix[0][1]), pixel[2] * DType::f32(matrix[0][2]))),
+        pixel[0].fma(DType::f32(matrix[1][0]), pixel[1].fma(DType::f32(matrix[1][1]), pixel[2] * DType::f32(matrix[1][2]))),
+        pixel[0].fma(DType::f32(matrix[2][0]), pixel[1].fma(DType::f32(matrix[2][1]), pixel[2] * DType::f32(matrix[2][2]))),
     ]
 }
 // ### MATRICES ### }}}
@@ -1048,11 +1048,15 @@ pub extern "C" fn srgb_to_lrgb(pixel: &mut [f32; 3]) {
 /// Convert from Linear Light RGB to CIE XYZ, D65 standard illuminant
 ///
 /// <https://en.wikipedia.org/wiki/SRGB#From_sRGB_to_CIE_XYZ>
-#[no_mangle]
-pub extern "C" fn lrgb_to_xyz(pixel: &mut [f32; 3]) {
+pub fn lrgb_to_xyz<T: DType>(pixel: &mut [T; 3]) {
     *pixel = matmul3(XYZ65_MAT, *pixel)
 }
 
+#[no_mangle]
+extern "C" fn lrgb_to_xyz_f32(pixel: &mut [f32; 3]) {
+    lrgb_to_xyz(pixel)
+}
+
 /// Convert from CIE XYZ to CIE LAB.
 ///
 /// <https://en.wikipedia.org/wiki/CIELAB_color_space#From_CIEXYZ_to_CIELAB>
@@ -1601,7 +1605,7 @@ mod tests {
 
     #[test]
     fn xyz_forwards() {
-        func_cmp(LRGB, XYZ, lrgb_to_xyz)
+        func_cmp(LRGB, XYZ, lrgb_to_xyz_f32)
     }
     #[test]
     fn xyz_backwards() {
@@ -1752,7 +1756,7 @@ mod tests {
             ("hsv_backwards", hsv_to_srgb),
             ("lrgb_forwards", srgb_to_lrgb),
             ("lrgb_backwards", lrgb_to_srgb),
-            ("xyz_forwards", lrgb_to_xyz),
+            ("xyz_forwards", lrgb_to_xyz_f32),
             ("xyz_backwards", xyz_to_lrgb),
             ("lab_forwards", xyz_to_cielab),
             ("lab_backwards", cielab_to_xyz),