Skip to content

Commit 34d3a86

Browse files
committed
Convert lrgb_to_xyz to DType
Should be a best-case scenario. Literally just element-wise FMA. Almost +30%: 107µs to 77µs on arch=native. It's *cool*, yes, but the code quality degrades so much I wonder if it's even worth it. Then when you factor in the complex 3-dimension deinterleave that'll be needed to use it properly... I still have to test it, of course, but I just feel it'll eat what little perf I get. I have AVX512 as well, so AVX≤2 will probably end up hurting even more.
1 parent 138a072 commit 34d3a86

File tree

2 files changed

+17
-8
lines changed

2 files changed

+17
-8
lines changed

benches/conversions.rs

+5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#![feature(portable_simd)]
22
use criterion::{black_box, criterion_group, criterion_main, Criterion};
33
use colcon::{Space, convert_space};
4+
//use std::simd::prelude::*;
45

56
fn pixels() -> Box<[f32]> {
67
let size = 512;
@@ -25,6 +26,10 @@ pub fn conversions(c: &mut Criterion) {
2526
black_box(pixels.clone().chunks_exact_mut(3).for_each(|pixel| colcon::lrgb_to_xyz(pixel.try_into().unwrap())));
2627
} ));
2728

29+
c.bench_function("lrgb_to_xyz_simd", |b| b.iter(|| {
30+
black_box(pixels.clone().as_simd_mut::<32>().1.chunks_exact_mut(3).for_each(|pixel| colcon::lrgb_to_xyz(pixel.try_into().unwrap())));
31+
} ));
32+
2833
c.bench_function("xyz_to_cielab", |b| b.iter(|| {
2934
black_box(pixels.clone().chunks_exact_mut(3).for_each(|pixel| colcon::xyz_to_cielab(pixel.try_into().unwrap())));
3035
} ));

src/lib.rs

+12-8
Original file line numberDiff line numberDiff line change
@@ -288,11 +288,11 @@ fn matmul3t(pixel: [f32; 3], matrix: [[f32; 3]; 3]) -> [f32; 3] {
288288
}
289289

290290
/// Transposed 3 * 3x3 matrix multiply, ie matrix @ pixel
291-
fn matmul3(matrix: [[f32; 3]; 3], pixel: [f32; 3]) -> [f32; 3] {
291+
fn matmul3<T: DType>(matrix: [[f32; 3]; 3], pixel: [T; 3]) -> [T; 3] {
292292
[
293-
pixel[0] * matrix[0][0] + pixel[1] * matrix[0][1] + pixel[2] * matrix[0][2],
294-
pixel[0] * matrix[1][0] + pixel[1] * matrix[1][1] + pixel[2] * matrix[1][2],
295-
pixel[0] * matrix[2][0] + pixel[1] * matrix[2][1] + pixel[2] * matrix[2][2],
293+
pixel[0].fma(DType::f32(matrix[0][0]), pixel[1].fma(DType::f32(matrix[0][1]), pixel[2] * DType::f32(matrix[0][2]))),
294+
pixel[0].fma(DType::f32(matrix[1][0]), pixel[1].fma(DType::f32(matrix[1][1]), pixel[2] * DType::f32(matrix[1][2]))),
295+
pixel[0].fma(DType::f32(matrix[2][0]), pixel[1].fma(DType::f32(matrix[2][1]), pixel[2] * DType::f32(matrix[2][2]))),
296296
]
297297
}
298298
// ### MATRICES ### }}}
@@ -1048,11 +1048,15 @@ pub extern "C" fn srgb_to_lrgb(pixel: &mut [f32; 3]) {
10481048
/// Convert from Linear Light RGB to CIE XYZ, D65 standard illuminant
10491049
///
10501050
/// <https://en.wikipedia.org/wiki/SRGB#From_sRGB_to_CIE_XYZ>
1051-
#[no_mangle]
1052-
pub extern "C" fn lrgb_to_xyz(pixel: &mut [f32; 3]) {
1051+
pub fn lrgb_to_xyz<T: DType>(pixel: &mut [T; 3]) {
10531052
*pixel = matmul3(XYZ65_MAT, *pixel)
10541053
}
10551054

1055+
#[no_mangle]
1056+
extern "C" fn lrgb_to_xyz_f32(pixel: &mut [f32; 3]) {
1057+
lrgb_to_xyz(pixel)
1058+
}
1059+
10561060
/// Convert from CIE XYZ to CIE LAB.
10571061
///
10581062
/// <https://en.wikipedia.org/wiki/CIELAB_color_space#From_CIEXYZ_to_CIELAB>
@@ -1601,7 +1605,7 @@ mod tests {
16011605

16021606
#[test]
16031607
fn xyz_forwards() {
1604-
func_cmp(LRGB, XYZ, lrgb_to_xyz)
1608+
func_cmp(LRGB, XYZ, lrgb_to_xyz_f32)
16051609
}
16061610
#[test]
16071611
fn xyz_backwards() {
@@ -1752,7 +1756,7 @@ mod tests {
17521756
("hsv_backwards", hsv_to_srgb),
17531757
("lrgb_forwards", srgb_to_lrgb),
17541758
("lrgb_backwards", lrgb_to_srgb),
1755-
("xyz_forwards", lrgb_to_xyz),
1759+
("xyz_forwards", lrgb_to_xyz_f32),
17561760
("xyz_backwards", xyz_to_lrgb),
17571761
("lab_forwards", xyz_to_cielab),
17581762
("lab_backwards", cielab_to_xyz),

0 commit comments

Comments
 (0)