|
| 1 | +// SPDX-FileCopyrightText: 2023 Rivos Inc. |
| 2 | +// |
| 3 | +// SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +#if (STRIDE == UNIT_STRIDE) |
| 6 | +#define F_VER1 RVVLM_CDFNORMINVD_STD |
| 7 | +#else |
| 8 | +#define F_VER1 RVVLM_CDFNORMINVDI_STD |
| 9 | +#endif |
| 10 | + |
| 11 | +// cdfnorminv is defined on (0, 1). Suffices to consider (0, 1/2] |
| 12 | +// Two regions of approximation: left is [0, 0x1.2p-3) and right is [0x1.2p-3, |
| 13 | +// 1/2) Both are done with rational functions. For right, t*P(t)/Q(t) t = 1/2-x; |
| 14 | +// x in [0x1.2p-3, 1/2) For left, y*P(t)/Q(t), y = sqrt(-log(2x)); and t = 1/y |
| 15 | + |
| 16 | +// P_coefficients in asending order, all in Q79. |
| 17 | +// p0_delta is in floating point, scale 79 |
| 18 | +#define P_right_0 -0x6709ca23d4199a8L |
| 19 | +#define P_right_1 -0xfd998fbae8eb3c8L |
| 20 | +#define P_right_2 0x48ca86036ae6e955L |
| 21 | +#define P_right_3 -0x278f4a98238f8c27L |
| 22 | +#define P_right_4 -0x40132208941e6a5aL |
| 23 | +#define P_right_5 0x402e2635719a3914L |
| 24 | +#define P_right_6 0x31c67fdc7e5073fL |
| 25 | +#define P_right_7 -0x12d1e1d375fb5d31L |
| 26 | +#define P_right_8 0x4232daca563749dL |
| 27 | +#define P_right_9 0xb02a8971665c0dL |
| 28 | +#define P_right_10 -0x2a7ae4292a6a4fL |
| 29 | +#define DELTA_P0_right 0x1.6c4b0b32778d0p-3 |
| 30 | + |
| 31 | +// Q_coefficients in asending order, all in Q79. |
| 32 | +// q0_delta is in floating point, scale 79 |
| 33 | +#define Q_right_0 -0x52366e5b14c0970L |
| 34 | +#define Q_right_1 -0xca57e95abcc599bL |
| 35 | +#define Q_right_2 0x3b6c91ec67f5759cL |
| 36 | +#define Q_right_3 -0x1c40d5daa3be22bcL |
| 37 | +#define Q_right_4 -0x41f11eb5d837386cL |
| 38 | +#define Q_right_5 0x3c6ce478fcd75c9aL |
| 39 | +#define Q_right_6 0xbb1cd7270cfba1dL |
| 40 | +#define Q_right_7 -0x1988a4116498f1afL |
| 41 | +#define Q_right_8 0x44dc3042f103d20L |
| 42 | +#define Q_right_9 0x2390e683d02edf3L |
| 43 | +#define Q_right_10 -0x8ec66f2a7e410cL |
| 44 | +#define DELTA_Q0_right -0x1.29a0161e99446p-3 |
| 45 | + |
| 46 | +// P_coefficients in asending order, all in Q67. p0_delta is in floating point |
| 47 | +#define P_left_0 0x216a32ed581bfL |
| 48 | +#define P_left_1 0x5ac486106d127fL |
| 49 | +#define P_left_2 0x3a9f84d231c6131L |
| 50 | +#define P_left_3 0xb54f6ab23cca5a3L |
| 51 | +#define P_left_4 0xecc53db7ed5eccbL |
| 52 | +#define P_left_5 0x194382b2de726d58L |
| 53 | +#define P_left_6 0x166fc6bd87b1b0b6L |
| 54 | +#define P_left_7 0xfd7bc0d477f41a9L |
| 55 | +#define P_left_8 0x7fc186088d7ad8cL |
| 56 | +#define P_left_9 0x18d6aeeb448b50aL |
| 57 | +#define P_left_10 -0x8fb330020a5bL |
| 58 | +#define DELTA_P0_left 0x1.b81f6f45914f0p-2 |
| 59 | + |
| 60 | +// Q_coefficients in asending order, all in Q67. q0_delta is in floating point |
| 61 | +#define Q_left_0 0x17a09aabf9ceeL |
| 62 | +#define Q_left_1 0x4030b9059ffcadL |
| 63 | +#define Q_left_2 0x29b26b0d87f7855L |
| 64 | +#define Q_left_3 0x87572a13d3fa2ddL |
| 65 | +#define Q_left_4 0xd7a728b5620ac3cL |
| 66 | +#define Q_left_5 0x1754392b473fd439L |
| 67 | +#define Q_left_6 0x1791b9a091a816c2L |
| 68 | +#define Q_left_7 0x167f71db9e13b075L |
| 69 | +#define Q_left_8 0xcb9f5f3e5e618a4L |
| 70 | +#define Q_left_9 0x68271fae767c68eL |
| 71 | +#define Q_left_10 0x13745c4fa224b25L |
| 72 | +#define DELTA_Q0_left 0x1.f7e7557a34ae6p-2 |
| 73 | + |
| 74 | +// cdfnorminv(x) = -sqrt(2)*erfcinv(2x) |
| 75 | +// The approximation rational functions are based on those for erfcinv |
| 76 | +// hence you will see a doubling of arguments here and there so that |
| 77 | +// "2x" is created |
| 78 | +void F_VER1(API) { |
| 79 | + size_t vlen; |
| 80 | + VFLOAT vx, vx_sign, vy, vy_special; |
| 81 | + VBOOL special_args; |
| 82 | + |
| 83 | + SET_ROUNDTONEAREST; |
| 84 | + // stripmining over input arguments |
| 85 | + for (; _inarg_n > 0; _inarg_n -= vlen) { |
| 86 | + vlen = VSET(_inarg_n); |
| 87 | + vx = VFLOAD_INARG1(vlen); |
| 88 | + |
| 89 | + // Handle Inf and NaN |
| 90 | + EXCEPTION_HANDLING_CDFNORMINV(vx, special_args, vy_special, vlen); |
| 91 | + |
| 92 | + vx_sign = __riscv_vfsub(vx, 0x1.0p-1, vlen); |
| 93 | + VFLOAT one_minus_x = __riscv_vfrsub(vx, fp_posOne, vlen); |
| 94 | + VBOOL x_gt_half = __riscv_vmfgt(vx_sign, fp_posZero, vlen); |
| 95 | + vx = __riscv_vmerge(vx, one_minus_x, x_gt_half, vlen); |
| 96 | + // vx is now in (0, 1/2] |
| 97 | + VBOOL x_in_left = __riscv_vmfle(vx, 0x1.2p-3, vlen); |
| 98 | + |
| 99 | + VFLOAT w_hi, w_lo, w_hi_left, w_lo_left, y_hi, y_lo; |
| 100 | + VINT T, T_left, T_tiny; |
| 101 | + VBOOL x_is_tiny; |
| 102 | + x_is_tiny = __riscv_vmxor(x_is_tiny, x_is_tiny, vlen); |
| 103 | + |
| 104 | + if (__riscv_vcpop(x_in_left, vlen) > 0) { |
| 105 | + VFLOAT x_left = VFMV_VF(0x1.0p-4, vlen); |
| 106 | + x_left = __riscv_vmerge(x_left, vx, x_in_left, vlen); |
| 107 | + x_is_tiny = __riscv_vmflt(x_left, 0x1.0p-53, vlen); |
| 108 | + INT n_adjust = 59; |
| 109 | + x_left = __riscv_vfmul(x_left, 0x1.0p60, vlen); |
| 110 | + // adjusting only 59 instead of 60 essentially doubles x |
| 111 | + NEG_LOGX_4_TRANSFORM(x_left, n_adjust, y_hi, y_lo, vlen); |
| 112 | + |
| 113 | + SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_left, w_lo_left, T_left, 0x1.0p63, |
| 114 | + 0x1.0p-63, vlen); |
| 115 | + if (__riscv_vcpop(x_is_tiny, vlen) > 0) { |
| 116 | + VFLOAT w_hi_dummy, w_lo_dummy; |
| 117 | + SQRTX_4_TRANSFORM(y_hi, y_lo, w_hi_dummy, w_lo_dummy, T_tiny, 0x1.0p64, |
| 118 | + 0x1.0p-64, vlen); |
| 119 | + } |
| 120 | + } |
| 121 | + vx = __riscv_vfadd(vx, vx, vlen); |
| 122 | + w_hi = VFMV_VF(fp_posOne, vlen); |
| 123 | + w_hi = __riscv_vfsub(w_hi, vx, vlen); |
| 124 | + w_lo = __riscv_vfrsub(w_hi, fp_posOne, vlen); |
| 125 | + w_lo = __riscv_vfsub(w_lo, vx, vlen); |
| 126 | + T = __riscv_vfcvt_x(__riscv_vfmul(w_hi, 0x1.0p63, vlen), vlen); |
| 127 | + VFLOAT delta_t = __riscv_vfmul(w_lo, 0x1.0p63, vlen); |
| 128 | + T = __riscv_vadd(T, __riscv_vfcvt_x(delta_t, vlen), vlen); |
| 129 | + T = __riscv_vmerge(T, T_left, x_in_left, vlen); |
| 130 | + |
| 131 | + w_hi = __riscv_vmerge(w_hi, w_hi_left, x_in_left, vlen); |
| 132 | + w_lo = __riscv_vmerge(w_lo, w_lo_left, x_in_left, vlen); |
| 133 | + |
| 134 | + // For transformed branch, compute (w_hi + w_lo) * P(T)/Q(T) |
| 135 | + VINT P, Q; |
| 136 | + |
| 137 | + P = __riscv_vmerge(VMVI_VX(P_right_10, vlen), P_left_10, x_in_left, vlen); |
| 138 | + P = PSTEP_I_ab(x_in_left, P_left_6, P_right_6, T, |
| 139 | + PSTEP_I_ab(x_in_left, P_left_7, P_right_7, T, |
| 140 | + PSTEP_I_ab(x_in_left, P_left_8, P_right_8, T, |
| 141 | + PSTEP_I_ab(x_in_left, P_left_9, |
| 142 | + P_right_9, T, P, vlen), |
| 143 | + vlen), |
| 144 | + vlen), |
| 145 | + vlen); |
| 146 | + |
| 147 | + Q = __riscv_vmerge(VMVI_VX(Q_right_10, vlen), Q_left_10, x_in_left, vlen); |
| 148 | + Q = PSTEP_I_ab(x_in_left, Q_left_6, Q_right_6, T, |
| 149 | + PSTEP_I_ab(x_in_left, Q_left_7, Q_right_7, T, |
| 150 | + PSTEP_I_ab(x_in_left, Q_left_8, Q_right_8, T, |
| 151 | + PSTEP_I_ab(x_in_left, Q_left_9, |
| 152 | + Q_right_9, T, Q, vlen), |
| 153 | + vlen), |
| 154 | + vlen), |
| 155 | + vlen); |
| 156 | + |
| 157 | + P = PSTEP_I_ab( |
| 158 | + x_in_left, P_left_0, P_right_0, T, |
| 159 | + PSTEP_I_ab( |
| 160 | + x_in_left, P_left_1, P_right_1, T, |
| 161 | + PSTEP_I_ab(x_in_left, P_left_2, P_right_2, T, |
| 162 | + PSTEP_I_ab(x_in_left, P_left_3, P_right_3, T, |
| 163 | + PSTEP_I_ab(x_in_left, P_left_4, P_right_4, T, |
| 164 | + PSTEP_I_ab(x_in_left, P_left_5, |
| 165 | + P_right_5, T, P, vlen), |
| 166 | + vlen), |
| 167 | + vlen), |
| 168 | + vlen), |
| 169 | + vlen), |
| 170 | + vlen); |
| 171 | + |
| 172 | + Q = PSTEP_I_ab( |
| 173 | + x_in_left, Q_left_0, Q_right_0, T, |
| 174 | + PSTEP_I_ab( |
| 175 | + x_in_left, Q_left_1, Q_right_1, T, |
| 176 | + PSTEP_I_ab(x_in_left, Q_left_2, Q_right_2, T, |
| 177 | + PSTEP_I_ab(x_in_left, Q_left_3, Q_right_3, T, |
| 178 | + PSTEP_I_ab(x_in_left, Q_left_4, Q_right_4, T, |
| 179 | + PSTEP_I_ab(x_in_left, Q_left_5, |
| 180 | + Q_right_5, T, Q, vlen), |
| 181 | + vlen), |
| 182 | + vlen), |
| 183 | + vlen), |
| 184 | + vlen), |
| 185 | + vlen); |
| 186 | + |
| 187 | + VFLOAT p_hi, p_lo; |
| 188 | + p_hi = __riscv_vfcvt_f(P, vlen); |
| 189 | + |
| 190 | + p_lo = __riscv_vfcvt_f(__riscv_vsub(P, __riscv_vfcvt_x(p_hi, vlen), vlen), |
| 191 | + vlen); |
| 192 | + VFLOAT delta_p0 = VFMV_VF(DELTA_P0_right, vlen); |
| 193 | + delta_p0 = __riscv_vfmerge(delta_p0, DELTA_P0_left, x_in_left, vlen); |
| 194 | + p_lo = __riscv_vfadd(p_lo, delta_p0, vlen); |
| 195 | + |
| 196 | + VFLOAT q_hi, q_lo; |
| 197 | + q_hi = __riscv_vfcvt_f(Q, vlen); |
| 198 | + q_lo = __riscv_vfcvt_f(__riscv_vsub(Q, __riscv_vfcvt_x(q_hi, vlen), vlen), |
| 199 | + vlen); |
| 200 | + VFLOAT delta_q0 = VFMV_VF(DELTA_Q0_right, vlen); |
| 201 | + delta_q0 = __riscv_vfmerge(delta_q0, DELTA_Q0_left, x_in_left, vlen); |
| 202 | + q_lo = __riscv_vfadd(q_lo, delta_q0, vlen); |
| 203 | + |
| 204 | + if (__riscv_vcpop(x_is_tiny, vlen) > 0) { |
| 205 | + VFLOAT p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny; |
| 206 | + ERFCINV_PQ_HILO_TINY(T_tiny, p_hi_tiny, p_lo_tiny, q_hi_tiny, q_lo_tiny, |
| 207 | + vlen); |
| 208 | + p_hi = __riscv_vmerge(p_hi, p_hi_tiny, x_is_tiny, vlen); |
| 209 | + p_lo = __riscv_vmerge(p_lo, p_lo_tiny, x_is_tiny, vlen); |
| 210 | + q_hi = __riscv_vmerge(q_hi, q_hi_tiny, x_is_tiny, vlen); |
| 211 | + q_lo = __riscv_vmerge(q_lo, q_lo_tiny, x_is_tiny, vlen); |
| 212 | + } |
| 213 | + |
| 214 | + // (y_hi, y_lo) <-- (w_hi + w_lo) * (p_hi + p_lo) |
| 215 | + y_hi = __riscv_vfmul(w_hi, p_hi, vlen); |
| 216 | + y_lo = __riscv_vfmsub(w_hi, p_hi, y_hi, vlen); |
| 217 | + y_lo = __riscv_vfmacc(y_lo, w_hi, p_lo, vlen); |
| 218 | + y_lo = __riscv_vfmacc(y_lo, w_lo, p_hi, vlen); |
| 219 | + |
| 220 | + DIV_N2D2(y_hi, y_lo, q_hi, q_lo, w_hi, vlen); |
| 221 | + |
| 222 | + vy = w_hi; |
| 223 | + |
| 224 | + vy = __riscv_vfsgnj(vy, vx_sign, vlen); |
| 225 | + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); |
| 226 | + |
| 227 | + // copy vy into y and increment addr pointers |
| 228 | + VFSTORE_OUTARG1(vy, vlen); |
| 229 | + |
| 230 | + INCREMENT_INARG1(vlen); |
| 231 | + INCREMENT_OUTARG1(vlen); |
| 232 | + } |
| 233 | + RESTORE_FRM; |
| 234 | +} |
0 commit comments