Skip to content

Commit bf735f1

Browse files
added inverse cdfnorm function
1 parent 02eada9 commit bf735f1

11 files changed

+480
-2
lines changed

CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ set(PROJECT_SOURCES
4444
src/rvvlm_cbrtDI.c
4545
src/rvvlm_cdfnormD.c
4646
src/rvvlm_cdfnormDI.c
47+
src/rvvlm_cdfnorminvD.c
48+
src/rvvlm_cdfnorminvDI.c
4749
src/rvvlm_erfD.c
4850
src/rvvlm_erfDI.c
4951
src/rvvlm_erfcD.c

include/rvvlm.h

+11
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,13 @@ union sui64_fp64 {
297297
#define RVVLM_CDFNORMDI_VSET_CONFIG "rvvlm_fp64m1.h"
298298
#define RVVLM_CDFNORMDI_STD rvvlm_cdfnormI
299299

300+
// FP64 cdfnorminv function configuration
301+
#define RVVLM_CDFNORMINVD_VSET_CONFIG "rvvlm_fp64m1.h"
302+
#define RVVLM_CDFNORMINVD_STD rvvlm_cdfnorminv
303+
304+
#define RVVLM_CDFNORMINVDI_VSET_CONFIG "rvvlm_fp64m1.h"
305+
#define RVVLM_CDFNORMINVDI_STD rvvlm_cdfnorminvI
306+
300307
// FP64 erf function configuration
301308
#define RVVLM_ERFD_VSET_CONFIG "rvvlm_fp64m2.h"
302309
#define RVVLM_ERFD_STD rvvlm_erf
@@ -550,6 +557,10 @@ void RVVLM_CDFNORMD_STD(size_t x_len, const double *x, double *y);
550557
void RVVLM_CDFNORMDI_STD(size_t x_len, const double *x, size_t stride_x,
551558
double *y, size_t stride_y);
552559

560+
void RVVLM_CDFNORMINVD_STD(size_t x_len, const double *x, double *y);
561+
void RVVLM_CDFNORMINVDI_STD(size_t x_len, const double *x, size_t stride_x,
562+
double *y, size_t stride_y);
563+
553564
void RVVLM_ERFD_STD(size_t x_len, const double *x, double *y);
554565
void RVVLM_ERFDI_STD(size_t x_len, const double *x, size_t stride_x, double *y,
555566
size_t stride_y);

include/rvvlm_cdfnorminvD.inc.h

+234
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
// SPDX-FileCopyrightText: 2023 Rivos Inc.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#if (STRIDE == UNIT_STRIDE)
#define F_VER1 RVVLM_CDFNORMINVD_STD
#else
#define F_VER1 RVVLM_CDFNORMINVDI_STD
#endif

// cdfnorminv is defined on (0, 1). Suffices to consider (0, 1/2]
// (the other half follows by the symmetry cdfnorminv(1-x) = -cdfnorminv(x)).
// Two regions of approximation: left is [0, 0x1.2p-3) and right is [0x1.2p-3,
// 1/2). Both are done with rational functions.
// For right: t*P(t)/Q(t), t = 1/2-x; x in [0x1.2p-3, 1/2)
// For left:  y*P(t)/Q(t), y = sqrt(-log(2x)); and t = 1/y

// P_coefficients in ascending order, all in Q79 fixed point.
// p0_delta is in floating point, scale 79 (a low-order correction to P_right_0).
#define P_right_0 -0x6709ca23d4199a8L
#define P_right_1 -0xfd998fbae8eb3c8L
#define P_right_2 0x48ca86036ae6e955L
#define P_right_3 -0x278f4a98238f8c27L
#define P_right_4 -0x40132208941e6a5aL
#define P_right_5 0x402e2635719a3914L
#define P_right_6 0x31c67fdc7e5073fL
#define P_right_7 -0x12d1e1d375fb5d31L
#define P_right_8 0x4232daca563749dL
#define P_right_9 0xb02a8971665c0dL
#define P_right_10 -0x2a7ae4292a6a4fL
#define DELTA_P0_right 0x1.6c4b0b32778d0p-3

// Q_coefficients in ascending order, all in Q79 fixed point.
// q0_delta is in floating point, scale 79 (a low-order correction to Q_right_0).
#define Q_right_0 -0x52366e5b14c0970L
#define Q_right_1 -0xca57e95abcc599bL
#define Q_right_2 0x3b6c91ec67f5759cL
#define Q_right_3 -0x1c40d5daa3be22bcL
#define Q_right_4 -0x41f11eb5d837386cL
#define Q_right_5 0x3c6ce478fcd75c9aL
#define Q_right_6 0xbb1cd7270cfba1dL
#define Q_right_7 -0x1988a4116498f1afL
#define Q_right_8 0x44dc3042f103d20L
#define Q_right_9 0x2390e683d02edf3L
#define Q_right_10 -0x8ec66f2a7e410cL
#define DELTA_Q0_right -0x1.29a0161e99446p-3

// P_coefficients in ascending order, all in Q67. p0_delta is in floating point
#define P_left_0 0x216a32ed581bfL
#define P_left_1 0x5ac486106d127fL
#define P_left_2 0x3a9f84d231c6131L
#define P_left_3 0xb54f6ab23cca5a3L
#define P_left_4 0xecc53db7ed5eccbL
#define P_left_5 0x194382b2de726d58L
#define P_left_6 0x166fc6bd87b1b0b6L
#define P_left_7 0xfd7bc0d477f41a9L
#define P_left_8 0x7fc186088d7ad8cL
#define P_left_9 0x18d6aeeb448b50aL
#define P_left_10 -0x8fb330020a5bL
#define DELTA_P0_left 0x1.b81f6f45914f0p-2

// Q_coefficients in ascending order, all in Q67. q0_delta is in floating point
#define Q_left_0 0x17a09aabf9ceeL
#define Q_left_1 0x4030b9059ffcadL
#define Q_left_2 0x29b26b0d87f7855L
#define Q_left_3 0x87572a13d3fa2ddL
#define Q_left_4 0xd7a728b5620ac3cL
#define Q_left_5 0x1754392b473fd439L
#define Q_left_6 0x1791b9a091a816c2L
#define Q_left_7 0x167f71db9e13b075L
#define Q_left_8 0xcb9f5f3e5e618a4L
#define Q_left_9 0x68271fae767c68eL
#define Q_left_10 0x13745c4fa224b25L
#define DELTA_Q0_left 0x1.f7e7557a34ae6p-2
// cdfnorminv(x) = -sqrt(2)*erfcinv(2x)
// The approximation rational functions are based on those for erfcinv
// hence you will see a doubling of arguments here and there so that
// "2x" is created
//
// Vectorized kernel: strip-mines over the input array, reduces each lane's
// argument to (0, 1/2], evaluates a fixed-point rational approximation
// P(T)/Q(T) (coefficients above), and restores the sign at the end.
// Special arguments (0, 1, out-of-range, Inf, NaN) are handled up front by
// EXCEPTION_HANDLING_CDFNORMINV and merged back in before the store.
void F_VER1(API) {
  size_t vlen;
  VFLOAT vx, vx_sign, vy, vy_special;
  VBOOL special_args;

  SET_ROUNDTONEAREST;
  // stripmining over input arguments
  for (; _inarg_n > 0; _inarg_n -= vlen) {
    vlen = VSET(_inarg_n);
    vx = VFLOAD_INARG1(vlen);

    // Handle Inf and NaN
    EXCEPTION_HANDLING_CDFNORMINV(vx, special_args, vy_special, vlen);

    // vx_sign = x - 1/2 carries the sign of the final result:
    // cdfnorminv(x) < 0 for x < 1/2, > 0 for x > 1/2.
    vx_sign = __riscv_vfsub(vx, 0x1.0p-1, vlen);
    VFLOAT one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen);
    VBOOL x_gt_half = __riscv_vmfgt(vx_sign, fp_posZero, vlen);
    // Fold x > 1/2 onto 1-x so only (0, 1/2] needs approximating.
    vx = __riscv_vmerge(vx, one_minus_x, x_gt_half, vlen);
    // vx is now in (0, 1/2]
    VBOOL x_in_left = __riscv_vmfle(vx, 0x1.2p-3, vlen);

    VFLOAT w_hi, w_lo, w_hi_left, w_lo_left, y_hi, y_lo;
    VINT T, T_left, T_tiny;
    VBOOL x_is_tiny;
    // Clear x_is_tiny (mask XOR with itself); it is only set inside the
    // left-branch block below.
    x_is_tiny = __riscv_vmxor(x_is_tiny, x_is_tiny, vlen);

    if (__riscv_vcpop(x_in_left, vlen) > 0) {
      // Left region: work with y = sqrt(-log(2x)).  Lanes not in the left
      // region get a harmless placeholder value 2^-4.
      VFLOAT x_left = VFMV_VF(0x1.0p-4, vlen);
      x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen);
      x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-53, vlen);
      INT n_adjust = 59;
      x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen);
      // adjusting only 59 instead of 60 essentially doubles x
      NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen);

      // T_left is y in fixed point, scale 63; (w_hi_left, w_lo_left) is y in
      // extended-precision floating point.
      SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left, 0x1.0p63,
                        0x1.0p-63, vlen);
      if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
        // Tiny arguments need a different fixed-point scale (2^64); only the
        // integer result T_tiny is used (see ERFCINV_PQ_HILO_TINY below).
        VFLOAT w_hi_dummy, w_lo_dummy;
        SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p64,
                          0x1.0p-64, vlen);
      }
    }
    // Right region: t = 1 - 2x computed exactly in two pieces
    // (w_hi + w_lo), then converted to Q63 fixed point in T.
    vx = __riscv_vfadd(vx, vx, vlen);
    w_hi = VFMV_VF(fp_posOne, vlen);
    w_hi = __riscv_vfsub(w_hi, vx, vlen);
    w_lo = __riscv_vfrsub(w_hi, fp_posOne, vlen);
    w_lo = __riscv_vfsub(w_lo, vx, vlen);
    T = __riscv_vfcvt_x(__riscv_vfmul(w_hi, 0x1.0p63, vlen), vlen);
    VFLOAT delta_t = __riscv_vfmul(w_lo, 0x1.0p63, vlen);
    T = __riscv_vadd(T, __riscv_vfcvt_x(delta_t, vlen), vlen);
    // Select the per-lane argument: T_left for the left region, t for right.
    T = __riscv_vmerge(T, T_left, x_in_left, vlen);

    w_hi = __riscv_vmerge(w_hi, w_hi_left, x_in_left, vlen);
    w_lo = __riscv_vmerge(w_lo, w_lo_left, x_in_left, vlen);

    // For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T)
    // PSTEP_I_ab is a project macro (defined elsewhere): one Horner step in
    // integer arithmetic, selecting the _left or _right coefficient per lane
    // by the x_in_left mask.  The polynomial is split into a high half
    // (degrees 6..10) and a final full evaluation (degrees 0..5 folded in).
    VINT P, Q;

    P = __riscv_vmerge(VMVI_VX(P_right_10, vlen), P_left_10, x_in_left, vlen);
    P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T,
                   PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T,
                              PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T,
                                         PSTEP_I_ab(x_in_left, P_left_9,
                                                    P_right_9, T, P, vlen),
                                         vlen),
                              vlen),
                   vlen);

    Q = __riscv_vmerge(VMVI_VX(Q_right_10, vlen), Q_left_10, x_in_left, vlen);
    Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T,
                   PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T,
                              PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T,
                                         PSTEP_I_ab(x_in_left, Q_left_9,
                                                    Q_right_9, T, Q, vlen),
                                         vlen),
                              vlen),
                   vlen);

    P = PSTEP_I_ab(
        x_in_left, P_left_0, P_right_0, T,
        PSTEP_I_ab(
            x_in_left, P_left_1, P_right_1, T,
            PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T,
                       PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T,
                                  PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T,
                                             PSTEP_I_ab(x_in_left, P_left_5,
                                                        P_right_5, T, P, vlen),
                                             vlen),
                                  vlen),
                       vlen),
            vlen),
        vlen);

    Q = PSTEP_I_ab(
        x_in_left, Q_left_0, Q_right_0, T,
        PSTEP_I_ab(
            x_in_left, Q_left_1, Q_right_1, T,
            PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T,
                       PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T,
                                  PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T,
                                             PSTEP_I_ab(x_in_left, Q_left_5,
                                                        Q_right_5, T, Q, vlen),
                                             vlen),
                                  vlen),
                       vlen),
            vlen),
        vlen);

    // Convert P and Q back to extended-precision floating point:
    // p_hi + p_lo == P exactly (p_lo is the rounding residual), then add the
    // per-region low-order coefficient correction delta_p0.
    VFLOAT p_hi, p_lo;
    p_hi = __riscv_vfcvt_f(P, vlen);

    p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen),
                           vlen);
    VFLOAT delta_p0 = VFMV_VF(DELTA_P0_right, vlen);
    delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_left, x_in_left, vlen);
    p_lo = __riscv_vfadd(p_lo, delta_p0, vlen);

    VFLOAT q_hi, q_lo;
    q_hi = __riscv_vfcvt_f(Q, vlen);
    q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen),
                           vlen);
    VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_right, vlen);
    delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_left, x_in_left, vlen);
    q_lo = __riscv_vfadd(q_lo, delta_q0, vlen);

    if (__riscv_vcpop(x_is_tiny, vlen) > 0) {
      // Tiny-argument lanes use dedicated coefficient tables evaluated by a
      // project macro; overwrite those lanes' P/Q values.
      VFLOAT p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny;
      ERFCINV_PQ_HILO_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny,
                           vlen);
      p_hi = __riscv_vmerge(p_hi, p_hi_tiny, x_is_tiny, vlen);
      p_lo = __riscv_vmerge(p_lo, p_lo_tiny, x_is_tiny, vlen);
      q_hi = __riscv_vmerge(q_hi, q_hi_tiny, x_is_tiny, vlen);
      q_lo = __riscv_vmerge(q_lo, q_lo_tiny, x_is_tiny, vlen);
    }

    // (y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo)
    // FMA-based double-double product: y_lo captures the rounding error of
    // the leading multiply plus the cross terms.
    y_hi = __riscv_vfmul(w_hi, p_hi, vlen);
    y_lo = __riscv_vfmsub(w_hi, p_hi, y_hi, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_hi, p_lo, vlen);
    y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen);

    // Extended-precision divide (project macro): w_hi ~= (y_hi+y_lo)/(q_hi+q_lo)
    DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen);

    vy = w_hi;

    // Restore the sign saved before argument folding, then patch in the
    // special-argument results.
    vy = __riscv_vfsgnj(vy, vx_sign, vlen);
    vy = __riscv_vmerge(vy, vy_special, special_args, vlen);

    // copy vy into y and increment addr pointers
    VFSTORE_OUTARG1(vy, vlen);

    INCREMENT_INARG1(vlen);
    INCREMENT_OUTARG1(vlen);
  }
  RESTORE_FRM;
}

include/rvvlm_inverrorfuncsD.h

+75-2
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,8 @@
3535
#define Q_tiny_9 0x5e4a26a7c1415755 // sacle 57
3636
#define DELTA_Q0_tiny 0x1.8a7adad44d65ap-4 // scale 66
3737

38-
#define Q50_84
38+
#if defined(COMPILE_FOR_ERFCINV)
3939
// Using [P,Q]_tiny_[HI,LO]_k, HI in Q50, LO in Q84
40-
#if defined(Q50_84)
4140
#define P_tiny_HI_0 -0x8593442eL
4241
#define P_tiny_LO_0 -0x4e7245b3L
4342
#define P_tiny_HI_1 -0x7f3dc156b1L
@@ -81,6 +80,51 @@
8180
#define Q_tiny_LO_9 -0x155b50b48L
8281
#endif
8382

83+
#if defined(COMPILE_FOR_CDFNORMINV)
84+
// Using [P,Q]_tiny_[HI,LO]_k, HI in Q50, LO in Q84
85+
#define P_tiny_HI_0 -0xbce768cfL
86+
#define P_tiny_LO_0 -0x6824d442L
87+
#define P_tiny_HI_1 -0xb3f23f158aL
88+
#define P_tiny_LO_1 0x120e225b6L
89+
#define P_tiny_HI_2 -0x2e77fdb703eaL
90+
#define P_tiny_LO_2 -0x1e1d72461L
91+
#define P_tiny_HI_3 -0x44fbca4f8507eL
92+
#define P_tiny_LO_3 -0xd2fb9bf1L
93+
#define P_tiny_HI_4 -0x25be85812224dcL
94+
#define P_tiny_LO_4 -0x14663c6d2L
95+
#define P_tiny_HI_5 -0x56d9a544fd76f0L
96+
#define P_tiny_LO_5 -0x1e3fd12d9L
97+
#define P_tiny_HI_6 0xb44c46b00008ccL
98+
#define P_tiny_LO_6 0x123f14b79L
99+
#define P_tiny_HI_7 0x22eb3f29425cc2dL
100+
#define P_tiny_LO_7 -0x1f47840b1L
101+
#define P_tiny_HI_8 0x6b5068e2aa0bc1L
102+
#define P_tiny_LO_8 -0xd830044aL
103+
#define P_tiny_HI_9 0x1e496a7253435eL
104+
#define P_tiny_LO_9 -0xf06a1c9L
105+
106+
#define Q_tiny_HI_0 -0x85933cdaL
107+
#define Q_tiny_LO_0 -0xb5b39d61L
108+
#define Q_tiny_HI_1 -0x7f3de4b69fL
109+
#define Q_tiny_LO_1 -0x151d1cd35L
110+
#define Q_tiny_HI_2 -0x20dd8dc1da27L
111+
#define Q_tiny_LO_2 -0x1706945d7L
112+
#define Q_tiny_HI_3 -0x30dc92d1cd231L
113+
#define Q_tiny_LO_3 0xabde03f9L
114+
#define Q_tiny_HI_4 -0x1af5fcee397d58L
115+
#define Q_tiny_LO_4 -0xc3530d28L
116+
#define Q_tiny_HI_5 -0x42639eeec1d051L
117+
#define Q_tiny_LO_5 0x662b41ecL
118+
#define Q_tiny_HI_6 0x6182b99f6ca998L
119+
#define Q_tiny_LO_6 0x938a5e35L
120+
#define Q_tiny_HI_7 0x17a6848dc07624aL
121+
#define Q_tiny_LO_7 0x8a0484b7L
122+
#define Q_tiny_HI_8 0x105ecd6aac52b12L
123+
#define Q_tiny_LO_8 0x1d1e38258L
124+
#define Q_tiny_HI_9 0xbc944d4f8282afL
125+
#define Q_tiny_LO_9 -0x155b50b48L
126+
#endif
127+
84128
// erfinv(+-1) = +-Inf with divide by zero
85129
// erfinv(x) |x| > 1, real is NaN with invalid
86130
// erfinv(NaN) is NaN, invalid if input is signalling NaN
@@ -140,6 +184,35 @@
140184
} \
141185
} while (0)
142186

187+
// cdfnorminv(0) = -Inf, cdfnorminv(1) = +Inf with divide by zero
// cdfnorminv(x) x outside [0, 1], real is NaN with invalid
// cdfnorminv(NaN) is NaN, invalid if input is signalling NaN
//
// On exit: special_args marks the lanes handled here, vy_special holds their
// results, and vx has those lanes replaced by the benign value 1/2 so the
// main path can evaluate them without raising spurious exceptions.
#define EXCEPTION_HANDLING_CDFNORMINV(vx, special_args, vy_special, vlen)      \
  do {                                                                         \
    VUINT vclass = __riscv_vfclass((vx), (vlen));                              \
    /* 0x39F: -Inf, negatives, +/-0, +Inf, sNaN, qNaN (vfclass bit mask) */    \
    IDENTIFY(vclass, 0x39F, (special_args), (vlen));                           \
    VBOOL x_ge_1 = __riscv_vmfge((vx), fp_posOne, (vlen));                     \
    (special_args) = __riscv_vmor((special_args), x_ge_1, (vlen));             \
    if (__riscv_vcpop((special_args), (vlen)) > 0) {                           \
      VBOOL x_gt_1 = __riscv_vmfgt((vx), fp_posOne, (vlen));                   \
      VBOOL x_lt_0 = __riscv_vmflt((vx), fp_posZero, (vlen));                  \
      /* substitute x > 1 or x < 0 with sNaN */                                \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_gt_1, (vlen));                   \
      (vx) = __riscv_vfmerge((vx), fp_sNaN, x_lt_0, (vlen));                   \
      /* substitute x = 0 or 1 with +/-Inf and generate div-by-zero signal */  \
      /* tmp is +/-0 with the sign of x - 1/2; vfrec7 of signed zero then  */  \
      /* yields the correctly signed infinity on the masked lanes.         */  \
      VFLOAT tmp = VFMV_VF(fp_posZero, (vlen));                                \
      VFLOAT x_tmp = __riscv_vfsub((vx), 0x1.0p-1, (vlen));                    \
      tmp = __riscv_vfsgnj(tmp, x_tmp, (vlen));                                \
      VBOOL x_eq_1 = __riscv_vmfeq((vx), fp_posOne, (vlen));                   \
      VBOOL x_eq_0 = __riscv_vmfeq((vx), fp_posZero, (vlen));                  \
      VBOOL pm_Inf = __riscv_vmor(x_eq_1, x_eq_0, (vlen));                     \
      tmp = __riscv_vfrec7(pm_Inf, tmp, (vlen));                               \
      /* x - x propagates NaN (and raises invalid for sNaN) on masked lanes */ \
      (vy_special) = __riscv_vfsub((special_args), (vx), (vx), (vlen));        \
      (vy_special) = __riscv_vmerge((vy_special), tmp, pm_Inf, (vlen));        \
      /* neutralize special lanes for the main computation */                  \
      (vx) = __riscv_vfmerge((vx), 0x1.0p-1, (special_args), (vlen));          \
    }                                                                          \
  } while (0)
215+
143216
// Compute -log(2^(-n_adjust) * x), where x < 1
144217
#define NEG_LOGX_4_TRANSFORM(vx, n_adjust, y_hi, y_lo, vlen) \
145218
do { \

src/rvvlm_cdfnorminvD.c

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Rivos Inc.
//
// SPDX-License-Identifier: Apache-2.0

// Driver translation unit for the unit-stride double-precision cdfnorminv
// entry point.  It configures the shared kernel template and then includes
// the implementation body.

#include <riscv_vector.h>
#include <stdio.h>

#include "rvvlm.h"
// API_SIGNATURE_11: one input array, one output array (see rvvlm.h).
#define API_SIGNATURE API_SIGNATURE_11
#define STRIDE UNIT_STRIDE

// Selects the LMUL/vector-type configuration for this kernel.
#include RVVLM_CDFNORMINVD_VSET_CONFIG

// Selects the cdfnorminv coefficient tables and exception-handling macro in
// the shared inverse-error-function header.
#define COMPILE_FOR_CDFNORMINV
#include "rvvlm_inverrorfuncsD.h"

// Kernel template; expands to the F_VER1 function for this configuration.
#include "rvvlm_cdfnorminvD.inc.h"

0 commit comments

Comments
 (0)