Skip to content

Commit be480be

Browse files
committed
More FMA3 ops
Average 2-4% performance gain; results should also be more accurate. The code is a little less readable, but since all the formulae come from openly published documents it shouldn't be too bad.
1 parent 0f14713 commit be480be

File tree

1 file changed

+11
-12
lines changed

1 file changed

+11
-12
lines changed

src/lib.rs

+11-12
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ mod tests;
1313

1414
use core::cmp::PartialOrd;
1515
use core::ffi::{c_char, CStr};
16-
use core::ops::{Add, Div, Mul, Rem, Sub};
16+
use core::ops::{Add, Div, Mul, Neg, Rem, Sub};
1717

1818
// DType {{{
1919

@@ -56,8 +56,9 @@ pub trait DType:
5656
+ Add<Output = Self>
5757
+ Div<Output = Self>
5858
+ Mul<Output = Self>
59-
+ Sub<Output = Self>
59+
+ Neg<Output = Self>
6060
+ Rem<Output = Self>
61+
+ Sub<Output = Self>
6162
+ PartialOrd
6263
+ FromF32
6364
{
@@ -356,7 +357,7 @@ fn pq_eotf_common<T: DType>(e: T, m2: T) -> T {
356357
let ep_pow_1divm2 = e.spowf(T::ff32(1.0) / m2);
357358

358359
let numerator: T = (ep_pow_1divm2 - PQEOTF_C1.to_dt()).max(0.0.to_dt());
359-
let denominator: T = T::ff32(PQEOTF_C2) - T::ff32(PQEOTF_C3) * ep_pow_1divm2;
360+
let denominator: T = ep_pow_1divm2.fma(T::ff32(-PQEOTF_C3), PQEOTF_C2.to_dt());
360361

361362
let y = (numerator / denominator).spowf((1.0 / PQEOTF_M1).to_dt());
362363

@@ -1018,9 +1019,7 @@ pub fn srgb_to_hsv<T: DType>(pixel: &mut [T; 3]) {
10181019
} else {
10191020
let s = dmax / vmax;
10201021

1021-
let dr = (((vmax - pixel[0]) / 6.0.to_dt()) + (dmax / 2.0.to_dt())) / dmax;
1022-
let dg = (((vmax - pixel[1]) / 6.0.to_dt()) + (dmax / 2.0.to_dt())) / dmax;
1023-
let db = (((vmax - pixel[2]) / 6.0.to_dt()) + (dmax / 2.0.to_dt())) / dmax;
1022+
let [dr, dg, db] = pixel.map(|c| (((vmax - c) / 6.0.to_dt()) + (dmax / 2.0.to_dt())) / dmax);
10241023

10251024
let h = if pixel[0] == vmax {
10261025
db - dg
@@ -1065,7 +1064,7 @@ pub fn xyz_to_cielab<T: DType>(pixel: &mut [T; 3]) {
10651064
});
10661065

10671066
*pixel = [
1068-
(T::ff32(116.0) * pixel[1]) - 16.0.to_dt(),
1067+
T::ff32(116.0).fma(pixel[1], T::ff32(-16.0)),
10691068
T::ff32(500.0) * (pixel[0] - pixel[1]),
10701069
T::ff32(200.0) * (pixel[1] - pixel[2]),
10711070
]
@@ -1087,8 +1086,8 @@ pub fn xyz_to_jzazbz<T: DType>(pixel: &mut [T; 3]) {
10871086
let mut lms = matmul3(
10881087
JZAZBZ_M1,
10891088
[
1090-
pixel[0] * JZAZBZ_B.to_dt() - T::ff32(JZAZBZ_B - 1.0) * pixel[2],
1091-
pixel[1] * JZAZBZ_G.to_dt() - T::ff32(JZAZBZ_G - 1.0) * pixel[0],
1089+
pixel[0].fma(JZAZBZ_B.to_dt(), T::ff32(-JZAZBZ_B + 1.0) * pixel[2]),
1090+
pixel[1].fma(JZAZBZ_G.to_dt(), T::ff32(-JZAZBZ_G + 1.0) * pixel[0]),
10921091
pixel[2],
10931092
],
10941093
);
@@ -1198,8 +1197,8 @@ pub fn hsv_to_srgb<T: DType>(pixel: &mut [T; 3]) {
11981197
}
11991198
let var_i = var_h.trunc();
12001199
let var_1 = pixel[2] * (T::ff32(1.0) - pixel[1]);
1201-
let var_2 = pixel[2] * (T::ff32(1.0) - pixel[1] * (var_h - var_i));
1202-
let var_3 = pixel[2] * (T::ff32(1.0) - pixel[1] * (T::ff32(1.0) - (var_h - var_i)));
1200+
let var_2 = pixel[2] * (-var_h + var_i).fma(pixel[1], 1.0.to_dt());
1201+
let var_3 = pixel[2] * (T::ff32(-1.0) + (var_h - var_i)).fma(pixel[1], T::ff32(1.0));
12031202

12041203
*pixel = if var_i == 0.0.to_dt() {
12051204
[pixel[2], var_3, var_1]
@@ -1269,7 +1268,7 @@ pub fn jzazbz_to_xyz<T: DType>(pixel: &mut [T; 3]) {
12691268
JZAZBZ_M2_INV,
12701269
[
12711270
(pixel[0] + JZAZBZ_D0.to_dt())
1272-
/ (T::ff32(1.0 + JZAZBZ_D) - T::ff32(JZAZBZ_D) * (pixel[0] + JZAZBZ_D0.to_dt())),
1271+
/ (pixel[0] + JZAZBZ_D0.to_dt()).fma(T::ff32(-JZAZBZ_D), T::ff32(1.0 + JZAZBZ_D)),
12731272
pixel[1],
12741273
pixel[2],
12751274
],

0 commit comments

Comments
 (0)