Skip to content

Commit 2e576c1

Browse files
committed
Add F64 support and FMA3 ops. Squashed commit:
commit be480be Author: Beinsezii <beinsezii@gmail.com> Date: Fri Jun 7 14:39:37 2024 -0700 More FMA3 ops Average 2-4% perf gain, should also be more accurate. Code little less readable but with all the formulae being open source documents it shouldn't be too bad commit 0f14713 Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 18:28:20 2024 -0700 TESTS use f64 for everything except NaN checks Lowered epsilons to accommodate this. commit cd658e2 Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 17:47:59 2024 -0700 Fold commit 5733c9d Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 17:46:29 2024 -0700 Move UTs to separate file commit efb769d Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 17:42:41 2024 -0700 F64 Part 5: convert_space Leaving FFI as-is commit 83d8153 Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 17:37:03 2024 -0700 F64 Part 4: More C FFI commit b09da2b Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 17:28:07 2024 -0700 F64 Part 3: Backward functions commit 75c7df7 Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 17:07:55 2024 -0700 F64 Part 2: Forward functions commit 6990067 Author: Beinsezii <beinsezii@gmail.com> Date: Thu Jun 6 16:27:36 2024 -0700 F64 Part 1: Transfer and util functions commit b28529f Author: Beinsezii <beinsezii@gmail.com> Date: Mon Jun 3 18:12:15 2024 -0700 Use macros to quickly define external C fns commit 7a31d4a Author: Beinsezii <beinsezii@gmail.com> Date: Mon Jun 3 17:44:19 2024 -0700 Use custom traits instead of Into<f32> commit e96880b Author: Beinsezii <beinsezii@gmail.com> Date: Mon Jun 3 17:18:25 2024 -0700 DT with only f64 Looks clean. Though I might want slices and arrays too? .into() won't work for that. New trait? commit ad60e5d Author: Beinsezii <beinsezii@gmail.com> Date: Mon Jun 3 16:42:26 2024 -0700 Attempt proper SIMD unweave Fail. 
1000x slower commit 20e5927 Author: Beinsezii <beinsezii@gmail.com> Date: Mon Jun 3 03:58:35 2024 -0700 Gate mul_add() behind FMA3 check It's crazy slow without FMA3 and the compiler won't auto change between FMA or not because *technically* it changes the results and that's a sin in the Rust bible commit f51e096 Author: Beinsezii <beinsezii@gmail.com> Date: Mon Jun 3 03:20:23 2024 -0700 Rustfmt commit bf2c194 Merge: 34d3a86 f9d3e65 Author: Beinsezii <beinsezii@gmail.com> Date: Mon Jun 3 03:11:52 2024 -0700 Merge branch 'master' into portable_simd commit 34d3a86 Author: Beinsezii <beinsezii@gmail.com> Date: Sun Jun 2 23:31:28 2024 -0700 Convert lrgb_to_xyz to DType Should be a best case scenario. Literally just element-wise FMA. Almost +30%: 107µs to 77µs on arch=native It's *cool* yes but the code quality degrades so much I wonder if it's even worth it. Then when you factor in the complex 3-dimension deinterleave that'll be needed to use it properly... I still have to test it of course, but I just feel it'll eat what little perf I get. I have AVX512 as well, so AVX≤2 will probably end up hurting even more. commit 138a072 Author: Beinsezii <beinsezii@gmail.com> Date: Sun Jun 2 22:29:14 2024 -0700 Attempt using `portable_simd` It's actually no faster than the autovectorized version? commit 37882b9 Author: Beinsezii <beinsezii@gmail.com> Date: Sun Jun 2 20:55:42 2024 -0700 Initial F64 + Autovectorize test disappointing. Like 10% faster at most. Probably from the branch
1 parent f9d3e65 commit 2e576c1

File tree

5 files changed

+1028
-823
lines changed

5 files changed

+1028
-823
lines changed

Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ publish = true
1313

1414
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
1515

16+
[features]
17+
1618
[dependencies]
1719

1820
[dev-dependencies]

benches/conversions.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ pub fn conversions(c: &mut Criterion) {
7777
} ));
7878

7979
c.bench_function("srgb_eotf_inverse", |b| b.iter(|| {
80-
black_box(pixels.clone().iter_mut().for_each(|n| *n = colcon::srgb_eotf_inverse(*n)));
80+
black_box(pixels.clone().iter_mut().for_each(|n| *n = colcon::srgb_oetf(*n)));
8181
} ));
8282

8383
c.bench_function("full_to", |b| b.iter(|| {

scripts/test_ctypes.py

+42-34
Original file line numberDiff line numberDiff line change
@@ -17,30 +17,38 @@
1717
colcon.convert_space_ffi.restype = ctypes.c_int32
1818

1919
# up
20-
colcon.srgb_to_hsv.argtypes = [cpixel]
21-
colcon.srgb_to_lrgb.argtypes = [cpixel]
22-
colcon.lrgb_to_xyz.argtypes = [cpixel]
23-
colcon.xyz_to_lab.argtypes = [cpixel]
24-
colcon.xyz_to_oklab.argtypes = [cpixel]
25-
colcon.xyz_to_jzazbz.argtypes = [cpixel]
26-
colcon.lab_to_lch.argtypes = [cpixel]
20+
colcon.srgb_to_hsv_f32.argtypes = [cpixel]
21+
colcon.srgb_to_lrgb_f32.argtypes = [cpixel]
22+
colcon.lrgb_to_xyz_f32.argtypes = [cpixel]
23+
colcon.xyz_to_cielab_f32.argtypes = [cpixel]
24+
colcon.xyz_to_oklab_f32.argtypes = [cpixel]
25+
colcon.xyz_to_jzazbz_f32.argtypes = [cpixel]
26+
colcon.lab_to_lch_f32.argtypes = [cpixel]
2727

2828
# down
29-
colcon.lch_to_lab.argtypes = [cpixel]
30-
colcon.jzazbz_to_xyz.argtypes = [cpixel]
31-
colcon.oklab_to_xyz.argtypes = [cpixel]
32-
colcon.lab_to_xyz.argtypes = [cpixel]
33-
colcon.xyz_to_lrgb.argtypes = [cpixel]
34-
colcon.lrgb_to_srgb.argtypes = [cpixel]
35-
colcon.srgb_to_hsv.argtypes = [cpixel]
29+
colcon.lch_to_lab_f32.argtypes = [cpixel]
30+
colcon.jzazbz_to_xyz_f32.argtypes = [cpixel]
31+
colcon.oklab_to_xyz_f32.argtypes = [cpixel]
32+
colcon.cielab_to_xyz_f32.argtypes = [cpixel]
33+
colcon.xyz_to_lrgb_f32.argtypes = [cpixel]
34+
colcon.lrgb_to_srgb_f32.argtypes = [cpixel]
35+
colcon.srgb_to_hsv_f32.argtypes = [cpixel]
3636

3737
# extra
38-
colcon.srgb_eotf.argtypes = [ctypes.c_float]
39-
colcon.srgb_eotf.restype = ctypes.c_float
40-
colcon.srgb_eotf_inverse.argtypes = [ctypes.c_float]
41-
colcon.srgb_eotf_inverse.restype = ctypes.c_float
42-
colcon.hk_high2023.argtypes = [cpixel]
43-
colcon.hk_high2023_comp.argtypes = [cpixel]
38+
colcon.srgb_eotf_f32.argtypes = [ctypes.c_float]
39+
colcon.srgb_eotf_f32.restype = ctypes.c_float
40+
colcon.srgb_oetf_f32.argtypes = [ctypes.c_float]
41+
colcon.srgb_oetf_f32.restype = ctypes.c_float
42+
colcon.pq_eotf_f32.argtypes = [ctypes.c_float]
43+
colcon.pq_eotf_f32.restype = ctypes.c_float
44+
colcon.pqz_eotf_f32.argtypes = [ctypes.c_float]
45+
colcon.pqz_eotf_f32.restype = ctypes.c_float
46+
colcon.pq_oetf_f32.argtypes = [ctypes.c_float]
47+
colcon.pq_oetf_f32.restype = ctypes.c_float
48+
colcon.pqz_oetf_f32.argtypes = [ctypes.c_float]
49+
colcon.pqz_oetf_f32.restype = ctypes.c_float
50+
colcon.hk_high2023_f32.argtypes = [cpixel]
51+
colcon.hk_high2023_comp_f32.argtypes = [cpixel]
4452

4553
SRGB = [0.20000000, 0.35000000, 0.95000000]
4654
LRGB = [0.03310477, 0.10048151, 0.89000541]
@@ -60,60 +68,60 @@ def pixcmp(a, b):
6068

6169
# up
6270
pix = cpixel(*SRGB)
63-
colcon.srgb_to_hsv(pix)
71+
colcon.srgb_to_hsv_f32(pix)
6472
pixcmp(list(pix), HSV)
6573

6674
pix = cpixel(*SRGB)
67-
colcon.srgb_to_lrgb(pix)
75+
colcon.srgb_to_lrgb_f32(pix)
6876
pixcmp(list(pix), LRGB)
6977

7078
pix = cpixel(*LRGB)
71-
colcon.lrgb_to_xyz(pix)
79+
colcon.lrgb_to_xyz_f32(pix)
7280
pixcmp(list(pix), XYZ)
7381

7482
pix = cpixel(*XYZ)
75-
colcon.xyz_to_lab(pix)
83+
colcon.xyz_to_cielab_f32(pix)
7684
pixcmp(list(pix), LAB)
7785

7886
pix = cpixel(*XYZ)
79-
colcon.xyz_to_oklab(pix)
87+
colcon.xyz_to_oklab_f32(pix)
8088
pixcmp(list(pix), OKLAB)
8189

8290
pix = cpixel(*XYZ)
83-
colcon.xyz_to_jzazbz(pix)
91+
colcon.xyz_to_jzazbz_f32(pix)
8492
pixcmp(list(pix), JZAZBZ)
8593

8694
pix = cpixel(*LAB)
87-
colcon.lab_to_lch(pix)
95+
colcon.lab_to_lch_f32(pix)
8896
pixcmp(list(pix), LCH)
8997

9098
# down
9199
pix = cpixel(*LCH)
92-
colcon.lch_to_lab(pix)
100+
colcon.lch_to_lab_f32(pix)
93101
pixcmp(list(pix), LAB)
94102

95103
pix = cpixel(*LAB)
96-
colcon.lab_to_xyz(pix)
104+
colcon.cielab_to_xyz_f32(pix)
97105
pixcmp(list(pix), XYZ)
98106

99107
pix = cpixel(*JZAZBZ)
100-
colcon.jzazbz_to_xyz(pix)
108+
colcon.jzazbz_to_xyz_f32(pix)
101109
pixcmp(list(pix), XYZ)
102110

103111
pix = cpixel(*OKLAB)
104-
colcon.oklab_to_xyz(pix)
112+
colcon.oklab_to_xyz_f32(pix)
105113
pixcmp(list(pix), XYZ)
106114

107115
pix = cpixel(*XYZ)
108-
colcon.xyz_to_lrgb(pix)
116+
colcon.xyz_to_lrgb_f32(pix)
109117
pixcmp(list(pix), LRGB)
110118

111119
pix = cpixel(*LRGB)
112-
colcon.lrgb_to_srgb(pix)
120+
colcon.lrgb_to_srgb_f32(pix)
113121
pixcmp(list(pix), SRGB)
114122

115123
pix = cpixel(*SRGB)
116-
colcon.srgb_to_hsv(pix)
124+
colcon.srgb_to_hsv_f32(pix)
117125
pixcmp(list(pix), HSV)
118126

119127
pix = (ctypes.c_float * len(SRGB))(*SRGB)

0 commit comments

Comments
 (0)