|
| 1 | +// SPDX-FileCopyrightText: 2023 Rivos Inc. |
| 2 | +// |
| 3 | +// SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +#if (STRIDE == UNIT_STRIDE) |
| 6 | +#define F_VER1 RVVLM_ERFD_STD |
| 7 | +#else |
| 8 | +#define F_VER1 RVVLM_ERFDI_STD |
| 9 | +#endif |
| 10 | + |
| 11 | +#include <fenv.h> |
| 12 | + |
| 13 | +// T is 2.0 |
| 14 | +#define T 0x1.0p+1 |
| 15 | + |
| 16 | +// For x in [0, T] odd-polynomial |
| 17 | +// coefficients P_1 to P_17 are in fixed-point |
| 18 | +// scaled so that they have high precision |
| 19 | +#define P_1 0x120dd750429b6d0f // Q60 |
| 20 | +#define P_3 -0x1812746b0379e00c // Q62 |
| 21 | +#define P_5 0x1ce2f21a04292b5f // Q64 |
| 22 | +#define P_7 -0x1b82ce31281b38e1 // Q66 |
| 23 | +#define P_9 0x1565bcd0dd0bcd58 // Q68 |
| 24 | +#define P_11 -0xe016d9f815a019d // Q70 |
| 25 | +#define P_13 0x7e68c976c0ebcdc // Q72 |
| 26 | +#define P_15 -0x3e9a49c76e6ee9a // Q74 |
| 27 | +#define P_17 0x1b9e64a589f8da9 // Q76 |
| 28 | +#define P_19 -0x1.5f70cd90f1878p-23 |
| 29 | +#define P_21 0x1.fca2b5f17c85ap-27 |
| 30 | +#define P_23 -0x1.514eafaeffc30p-30 |
| 31 | +#define P_25 0x1.9b3583b6b826dp-34 |
| 32 | +#define P_27 -0x1.c97ffcf4f4e22p-38 |
| 33 | +#define P_29 0x1.c2f4a46d3297dp-42 |
| 34 | +#define P_31 -0x1.6ef3c7000b58bp-46 |
| 35 | +#define P_33 0x1.ac36453182837p-51 |
| 36 | +#define P_35 -0x1.0482966738f0ep-56 |
| 37 | + |
| 38 | +// For x in (T, 6.0], general polynomial |
| 39 | +// Coefficients Q_0 through Q_8 are in fixed points |
| 40 | +#define Q_0 0xffff87b6641370f // Q60 |
| 41 | +#define Q_1 -0x9062a79f9b29022 // Q62 |
| 42 | +#define Q_2 -0x11dc7e40e4efb77d // Q64 |
| 43 | +#define Q_3 -0x1dd1004e1f59ed4 // Q66 |
| 44 | +#define Q_4 0x1980c051527d41e7 // Q68 |
| 45 | +#define Q_5 0x902cddcb829790b // Q70 |
| 46 | +#define Q_6 -0x33d6f572cdbfa228 // Q72 |
| 47 | +#define Q_7 0x425f9974bef87221 // Q74 |
| 48 | +#define Q_8 -0x5363e91dfca5d4df // Q76 |
| 49 | +#define Q_9 0x1.b5eea4ad8cdbfp-16 |
| 50 | +#define Q_10 -0x1.ded0a34468c8cp-18 |
| 51 | +#define Q_11 0x1.af4968b4d634ap-20 |
| 52 | +#define Q_12 -0x1.51de51c57f11ap-22 |
| 53 | +#define Q_13 0x1.cbbf535e64b65p-25 |
| 54 | +#define Q_14 -0x1.025a03d4fdf7bp-27 |
| 55 | +#define Q_15 0x1.c735f1e16e8cdp-31 |
| 56 | +#define Q_16 -0x1.2de00f5eeee49p-34 |
| 57 | +#define Q_17 0x1.219bdcb68d070p-38 |
| 58 | +#define Q_18 -0x1.7b5fc54357bcfp-43 |
| 59 | +#define Q_19 0x1.301ac8caec6e3p-48 |
| 60 | +#define Q_20 -0x1.c3232aa28d427p-55 |
| 61 | + |
| 62 | +// The error function erf is an odd function: erf(-x) = -erf(x) |
| 63 | +// and thus we compute erf(|x|) and restore the sign at the end. |
| 64 | +// For x >= 6, erf(x) rounds to 1.0 |
| 65 | +// The algorithm uses two approximation methods on [0, T], and |
| 66 | +// (T, 6.]. For the first region, we approximate with an odd |
| 67 | +// polynomial. For the second region, the polynomial used actually |
| 68 | +// approximates (erfc(x))^(1/8). The desired result is 1 - (poly(x))^8 |
| 69 | +// Some algorithm for erf approximates log(erfc(x)) for x large. But |
| 70 | +// this requires an evaluation of expm1(y) after the polynomial approximation. |
| 71 | +// We essentially replaced the the cost of expm1 with 3 multiplications. |
| 72 | +void F_VER1(API) { |
| 73 | + size_t vlen; |
| 74 | + VFLOAT vx, vx_orig, vy, vy_special; |
| 75 | + VBOOL special_args; |
| 76 | + |
| 77 | + SET_ROUNDTONEAREST; |
| 78 | + // stripmining over input arguments |
| 79 | + for (; _inarg_n > 0; _inarg_n -= vlen) { |
| 80 | + vlen = VSET(_inarg_n); |
| 81 | + vx = VFLOAD_INARG1(vlen); |
| 82 | + vx_orig = vx; |
| 83 | + |
| 84 | + // Handle Inf and NaN |
| 85 | + EXCEPTION_HANDLING_ERF(vx, special_args, vy_special, vlen); |
| 86 | + |
| 87 | + // At this point, vx is 0 or >= 2^(-30). Can saturate vx at 6.0 |
| 88 | + vx = __riscv_vfsgnj(vx, fp_posOne, vlen); |
| 89 | + vx = __riscv_vfmin(vx, 0x1.8p+2, vlen); |
| 90 | + |
| 91 | + VBOOL x_gt_T = __riscv_vmfgt(vx, T, vlen); |
| 92 | + VFLOAT r, delta_r, xsq; |
| 93 | + xsq = __riscv_vfmul(vx, vx, vlen); |
| 94 | + r = __riscv_vmerge(xsq, vx, x_gt_T, vlen); |
| 95 | + delta_r = I_AS_F(__riscv_vxor(F_AS_I(delta_r), F_AS_I(delta_r), vlen)); |
| 96 | + delta_r = __riscv_vmerge(__riscv_vfmsub(vx, vx, xsq, vlen), delta_r, x_gt_T, |
| 97 | + vlen); |
| 98 | + |
| 99 | + // Compute polynomial in r. |
| 100 | + // For x in [0, T], r = x*x |
| 101 | + // the polynomial in r is x*(p_1 + p_3 r + p_5 r^2 ... + p_35 r^22) |
| 102 | + // For x in (T, 6], r = x |
| 103 | + // the polynomial in r is q_0 + q_1 r + q_2 r^2 + ... + q_20 r^20 |
| 104 | + // The higher order of the polynomial is computed in floating point; |
| 105 | + // the lower order part (more significant) are then done in fixed point |
| 106 | + // Both lower parts have 17 coefficients and so can be done with the |
| 107 | + // exact instruction sequence using the corresponding coefficients |
| 108 | + |
| 109 | + VFLOAT poly = PSTEP(Q_18, r, PSTEP(Q_19, Q_20, r, vlen), vlen); |
| 110 | + |
| 111 | + VFLOAT poly_right; |
| 112 | + poly_right = |
| 113 | + I_AS_F(__riscv_vxor(F_AS_I(poly_right), F_AS_I(poly_right), vlen)); |
| 114 | + poly_right = __riscv_vmerge(poly_right, poly, x_gt_T, vlen); |
| 115 | + |
| 116 | + poly_right = PSTEP_ab(x_gt_T, Q_17, P_35, r, poly_right, vlen); |
| 117 | + poly_right = PSTEP_ab(x_gt_T, Q_16, P_33, r, poly_right, vlen); |
| 118 | + poly_right = PSTEP_ab(x_gt_T, Q_15, P_31, r, poly_right, vlen); |
| 119 | + poly_right = PSTEP_ab(x_gt_T, Q_14, P_29, r, poly_right, vlen); |
| 120 | + poly_right = PSTEP_ab(x_gt_T, Q_13, P_27, r, poly_right, vlen); |
| 121 | + |
| 122 | + VFLOAT r4 = __riscv_vfmul(r, r, vlen); |
| 123 | + r4 = __riscv_vfmul(r4, r4, vlen); |
| 124 | + |
| 125 | + VFLOAT poly_left = VFMV_VF(P_25, vlen); |
| 126 | + poly_left = __riscv_vfmerge(poly_left, Q_12, x_gt_T, vlen); |
| 127 | + poly_left = PSTEP_ab(x_gt_T, Q_11, P_23, r, poly_left, vlen); |
| 128 | + poly_left = PSTEP_ab(x_gt_T, Q_10, P_21, r, poly_left, vlen); |
| 129 | + poly_left = PSTEP_ab(x_gt_T, Q_9, P_19, r, poly_left, vlen); |
| 130 | + |
| 131 | + poly = __riscv_vfmadd(poly_right, r4, poly_left, vlen); |
| 132 | + VINT POLY = __riscv_vfcvt_x(__riscv_vfmul(poly, 0x1.0p78, vlen), vlen); |
| 133 | + |
| 134 | + VINT R = __riscv_vfcvt_x(__riscv_vfmul(r, 0x1.0p60, vlen), vlen); |
| 135 | + VINT D_R = __riscv_vfcvt_x(__riscv_vfmul(delta_r, 0x1.0p60, vlen), vlen); |
| 136 | + R = __riscv_vadd(R, D_R, vlen); |
| 137 | + // POLY is in Q78, R is in Q60 |
| 138 | + |
| 139 | + VINT COEFF = __riscv_vmerge(VMVI_VX(P_17, vlen), Q_8, x_gt_T, vlen); |
| 140 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 141 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q76 |
| 142 | + |
| 143 | + COEFF = __riscv_vmerge(VMVI_VX(P_15, vlen), Q_7, x_gt_T, vlen); |
| 144 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 145 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q74 |
| 146 | + |
| 147 | + COEFF = __riscv_vmerge(VMVI_VX(P_13, vlen), Q_6, x_gt_T, vlen); |
| 148 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 149 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q72 |
| 150 | + |
| 151 | + COEFF = __riscv_vmerge(VMVI_VX(P_11, vlen), Q_5, x_gt_T, vlen); |
| 152 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 153 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q70 |
| 154 | + |
| 155 | + COEFF = __riscv_vmerge(VMVI_VX(P_9, vlen), Q_4, x_gt_T, vlen); |
| 156 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 157 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q68 |
| 158 | + |
| 159 | + COEFF = __riscv_vmerge(VMVI_VX(P_7, vlen), Q_3, x_gt_T, vlen); |
| 160 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 161 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q66 |
| 162 | + |
| 163 | + COEFF = __riscv_vmerge(VMVI_VX(P_5, vlen), Q_2, x_gt_T, vlen); |
| 164 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 165 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q64 |
| 166 | + |
| 167 | + COEFF = __riscv_vmerge(VMVI_VX(P_3, vlen), Q_1, x_gt_T, vlen); |
| 168 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 169 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q62 |
| 170 | + |
| 171 | + COEFF = __riscv_vmerge(VMVI_VX(P_1, vlen), Q_0, x_gt_T, vlen); |
| 172 | + POLY = __riscv_vsll(__riscv_vsmul(R, POLY, 1, vlen), 1, vlen); |
| 173 | + POLY = __riscv_vadd(POLY, COEFF, vlen); // Q60 |
| 174 | + |
| 175 | + VINT POLY_RIGHT = __riscv_vsll(POLY, 3, vlen); // Q63 |
| 176 | + POLY_RIGHT = __riscv_vsmul(POLY_RIGHT, POLY_RIGHT, 1, vlen); |
| 177 | + POLY_RIGHT = __riscv_vsmul(POLY_RIGHT, POLY_RIGHT, 1, vlen); |
| 178 | + POLY_RIGHT = __riscv_vsmul(POLY_RIGHT, POLY_RIGHT, 1, vlen); |
| 179 | + // POLY_RIGHT is POLY^8 |
| 180 | + |
| 181 | + // convert x to fixed-point Q62+m, 2^m <= x < 2^(m+1) |
| 182 | + VINT e = __riscv_vsra(F_AS_I(vx), MAN_LEN, vlen); |
| 183 | + e = __riscv_vmax(e, EXP_BIAS - 40, vlen); |
| 184 | + e = __riscv_vrsub(e, 2 * EXP_BIAS + 62, vlen); |
| 185 | + VFLOAT scale = I_AS_F(__riscv_vsll(e, MAN_LEN, vlen)); |
| 186 | + // scale is 2^(62-m), X is x in Q_(62-m) |
| 187 | + VINT X = __riscv_vfcvt_x(__riscv_vfmul(vx, scale, vlen), vlen); |
| 188 | + POLY = __riscv_vsmul(X, POLY, 1, vlen); |
| 189 | + // X is Q_(62-m) POLY is now Q_(59-m) |
| 190 | + // x in [0, T], POLY is result in Q 59-m |
| 191 | + |
| 192 | + // x in (T, 6], result is 1 - 2^(-63) POLY_RIGHT |
| 193 | + // that is, 2^(-62)(2^62 - (POLY_RIGHT>>1)) |
| 194 | + INT one = (1LL << 62); |
| 195 | + POLY_RIGHT = __riscv_vsra(POLY_RIGHT, 1, vlen); |
| 196 | + POLY_RIGHT = __riscv_vrsub(POLY_RIGHT, one, vlen); |
| 197 | + |
| 198 | + POLY = __riscv_vmerge(POLY, POLY_RIGHT, x_gt_T, vlen); |
| 199 | + // POLY contains the result in fixed point |
| 200 | + // scale is 59-m for x in [0, T] and 62 for x > T |
| 201 | + |
| 202 | + e = __riscv_vrsub(e, 2 * EXP_BIAS + 3, vlen); |
| 203 | + // exponent field of 2^(-59+m) |
| 204 | + e = __riscv_vmerge(e, EXP_BIAS - 62, x_gt_T, vlen); |
| 205 | + scale = I_AS_F(__riscv_vsll(e, MAN_LEN, vlen)); |
| 206 | + |
| 207 | + vy = __riscv_vfcvt_f(POLY, vlen); |
| 208 | + vy = __riscv_vfmul(vy, scale, vlen); |
| 209 | + vy = __riscv_vfsgnj(vy, vx_orig, vlen); |
| 210 | + |
| 211 | + vy = __riscv_vmerge(vy, vy_special, special_args, vlen); |
| 212 | + // copy vy into y and increment addr pointers |
| 213 | + VFSTORE_OUTARG1(vy, vlen); |
| 214 | + |
| 215 | + INCREMENT_INARG1(vlen); |
| 216 | + INCREMENT_OUTARG1(vlen); |
| 217 | + } |
| 218 | + RESTORE_FRM; |
| 219 | +} |
0 commit comments