diff --git a/doc/header.adoc b/doc/header.adoc
index 04dfcaf7e..f55edf3a4 100644
--- a/doc/header.adoc
+++ b/doc/header.adoc
@@ -47,6 +47,8 @@ include::preface.adoc[]
 
 include::rvv-intrinsic-spec.adoc[]
 
+include::rvv-intrinsic-examples.adoc[]
+
 ifeval::["{build-type}" != "quick"]
 [appendix]
 == Explicit (Non-overloaded) intrinsics
diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
new file mode 100644
index 000000000..6de62ca69
--- /dev/null
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -0,0 +1,296 @@
+== Examples
+
+NOTE: This section is non-normative.
+
+NOTE: No claims are made about the efficiency of the examples presented in
+this section.
+
+This section presents examples that use the RVV intrinsics specified in this
+document. The examples are in C and assume `#include <riscv_vector.h>` has
+appeared earlier in the source code.
+
+=== Memory copy
+
+.An implementation of the `memcpy` function of the C Standard Library using RVV intrinsics.
+====
+[,c]
+----
+void *memcpy_rvv(void *restrict destination, const void *restrict source,
+                 size_t n) {
+  unsigned char *dst = destination;
+  const unsigned char *src = source;
+  // copy data byte by byte
+  for (size_t vl; n > 0; n -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e8m8(n);
+    // Load src[0..vl)
+    vuint8m8_t vec_src = __riscv_vle8_v_u8m8(src, vl);
+    // Store dst[0..vl)
+    __riscv_vse8_v_u8m8(dst, vec_src, vl);
+    // src is incremented by vl (bytes)
+    // dst is incremented by vl (bytes)
+    // n is decremented by vl
+  }
+  return destination;
+}
+----
+====
+
+=== SAXPY
+
+Consider the following function that implements a SAXPY-like kernel.
+
+[,c]
+----
+void saxpy_reference(size_t n, const float a, const float *x, float *y) {
+  for (size_t i = 0; i < n; ++i) {
+    y[i] = a * x[i] + y[i];
+  }
+}
+----
+
+.An implementation of SAXPY using RVV intrinsics.
+====
+[,c]
+----
+void saxpy_rvv(size_t n, const float a, const float *x, float *y) {
+  for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+    vl = __riscv_vsetvl_e32m8(n);
+    // Load x[i..i+vl)
+    vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl);
+    // Load y[i..i+vl)
+    vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl);
+    // Computes vy[0..vl) + a*vx[0..vl)
+    // and stores it in y[i..i+vl)
+    __riscv_vse32_v_f32m8(y, __riscv_vfmacc_vf_f32m8(vy, a, vx, vl), vl);
+  }
+}
+----
+====
+
+=== Matrix multiplication
+
+Consider the following function that implements a naive matrix multiplication.
+
+[,c]
+----
+// matrix multiplication
+// C[0..n)[0..m) = A[0..n)[0..p) x B[0..p)[0..m)
+void matmul_reference(double *a, double *b, double *c, int n, int m, int p) {
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < m; ++j) {
+      c[i * m + j] = 0;
+      for (int k = 0; k < p; ++k) {
+        c[i * m + j] += a[i * p + k] * b[k * m + j];
+      }
+    }
+}
+----
+
+The following example is a vectorized version of the matrix multiplication
+above. The accumulation on `c[i * m + j]` is implemented using partial
+accumulations followed by a single final accumulation.
+
+.An implementation of a naive matrix multiplication using RVV intrinsics.
+====
+[,c]
+----
+void matmul_rvv(double *a, double *b, double *c, int n, int m, int p) {
+  size_t vlmax = __riscv_vsetvlmax_e64m1();
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < m; ++j) {
+      double *ptr_a = &a[i * p];
+      double *ptr_b = &b[j];
+      int k = p;
+      // Set accumulator to zero.
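+      // The accumulator is created with vlmax (rather than vl) elements so
+      // that every element holds a defined zero value: iterations where
+      // vl < vlmax leave the tail elements untouched (tail-undisturbed
+      // policy below), and the final reduction reads all vlmax elements.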
+      vfloat64m1_t vec_s = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
+      vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
+      // Advance ptr_a by vl elements along the row and ptr_b by vl rows
+      // down the column after each iteration.
+      for (size_t vl; k > 0; k -= vl, ptr_a += vl, ptr_b += vl * m) {
+        vl = __riscv_vsetvl_e64m1(k);
+
+        // Load row a[i][k..k+vl)
+        vfloat64m1_t vec_a = __riscv_vle64_v_f64m1(ptr_a, vl);
+        // Load column b[k..k+vl)[j]
+        vfloat64m1_t vec_b =
+            __riscv_vlse64_v_f64m1(ptr_b, sizeof(double) * m, vl);
+
+        // Accumulate dot product of row and column. If vl < vlmax we need to
+        // preserve the existing values of vec_s, hence the tu policy.
+        vec_s = __riscv_vfmacc_vv_f64m1_tu(vec_s, vec_a, vec_b, vl);
+      }
+
+      // Final accumulation.
+      vfloat64m1_t vec_sum =
+          __riscv_vfredusum_vs_f64m1_f64m1(vec_s, vec_zero, vlmax);
+      double sum = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
+      c[i * m + j] = sum;
+    }
+}
+----
+====
+
+=== String copy
+
+.An implementation of the `strcpy` function of the C Standard Library using RVV intrinsics.
+====
+[,c]
+----
+char *strcpy_rvv(char *destination, const char *source) {
+  unsigned char *dst = (unsigned char *)destination;
+  const unsigned char *src = (const unsigned char *)source;
+  size_t vlmax = __riscv_vsetvlmax_e8m8();
+  long first_set_bit = -1;
+
+  // This loop stops when the null byte of the string is found among the
+  // loaded bytes, i.e., when first_set_bit >= 0.
+  for (size_t vl; first_set_bit < 0; src += vl, dst += vl) {
+    // Load up to vlmax elements if possible.
+    // vl is set to the maximum number of elements, no more than vlmax, that
+    // could be loaded without causing a memory fault.
+    vuint8m8_t vec_src = __riscv_vle8ff_v_u8m8(src, &vl, vlmax);
+
+    // Mask that states where null bytes are in the loaded bytes.
+    vbool1_t string_terminate = __riscv_vmseq_vx_u8m8_b1(vec_src, 0, vl);
+
+    // If the null byte is not among the loaded bytes the resulting mask will
+    // be all ones, otherwise only the elements up to and including the
+    // first null byte will be enabled in the resulting mask.
+    vbool1_t mask = __riscv_vmsif_m_b1(string_terminate, vl);
+
+    // Store the enabled elements as determined by the mask above.
+    __riscv_vse8_v_u8m8_m(mask, dst, vec_src, vl);
+
+    // Determine whether the null byte was found among the loaded bytes.
+    // If not found, first_set_bit is set to -1, otherwise first_set_bit
+    // will be the index of the first element enabled in string_terminate.
+    first_set_bit = __riscv_vfirst_m_b1(string_terminate, vl);
+  }
+  return destination;
+}
+----
+====
+
+=== Control flow
+
+Consider the following function that computes the elementwise division of two
+arrays but sets the result to a given value when the corresponding element of
+the divisor array is zero.
+
+[,c]
+----
+void branch_ref(double *a, double *b, double *c, int n, double constant) {
+  for (int i = 0; i < n; ++i) {
+    c[i] = (b[i] != 0.0) ? a[i] / b[i] : constant;
+  }
+}
+----
+
+The following example applies if-conversion using masks to implement the
+semantics of the conditional operator.
+
+.An implementation of `branch_ref` using RVV intrinsics.
+====
+[,c]
+----
+void branch_rvv(double *a, double *b, double *c, int n, double constant) {
+  // Set vlmax and initialize variables.
+  size_t vlmax = __riscv_vsetvlmax_e64m1();
+  // "Broadcast" the value of constant to all (vlmax) elements of
+  // vec_constant.
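+  // Broadcasting with vlmax guarantees a defined value for every element a
+  // later iteration may use (vl never exceeds vlmax); vec_constant supplies
+  // the values of the masked-off elements in the division below.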
+  vfloat64m1_t vec_constant = __riscv_vfmv_v_f_f64m1(constant, vlmax);
+  for (size_t vl; n > 0; n -= vl, a += vl, b += vl, c += vl) {
+    vl = __riscv_vsetvl_e64m1(n);
+
+    // Load a[i..i+vl)
+    vfloat64m1_t vec_a = __riscv_vle64_v_f64m1(a, vl);
+    // Load b[i..i+vl)
+    vfloat64m1_t vec_b = __riscv_vle64_v_f64m1(b, vl);
+
+    // Compute a mask whose enabled elements will correspond to the
+    // elements of b that are not zero.
+    vbool64_t mask = __riscv_vmfne_vf_f64m1_b64(vec_b, 0.0, vl);
+
+    // Use the mask-undisturbed policy to compute the division for the
+    // elements enabled in the mask; the masked-off elements keep the value
+    // of the broadcast constant (the maskedoff operand).
+    vfloat64m1_t vec_c = __riscv_vfdiv_vv_f64m1_mu(
+        mask, /*maskedoff*/ vec_constant, vec_a, vec_b, vl);
+
+    // Store into c[i..i+vl)
+    __riscv_vse64_v_f64m1(c, vec_c, vl);
+  }
+}
+----
+====
+
+=== Reduction and counting
+
+Consider the following function that computes the dot product of two arrays,
+excluding elements of the first array (along with the corresponding element
+of the second array) whose value is 42. The function also counts how many
+pairs of elements took part in the dot product.
+
+[,c]
+----
+void reduce_reference(double *a, double *b, double *result_sum,
+                      int *result_count, int n) {
+  int count = 0;
+  double s = 0.0;
+  for (int i = 0; i < n; ++i) {
+    if (a[i] != 42.0) {
+      s += a[i] * b[i];
+      count++;
+    }
+  }
+
+  *result_sum = s;
+  *result_count = count;
+}
+----
+
+The following example implements the accumulation of the `s` variable using
+several partial accumulations followed by a single final accumulation.
+
+.An implementation of `reduce_reference` using RVV intrinsics.
+====
+[,c]
+----
+void reduce_rvv(double *a, double *b, double *result_sum, int *result_count,
+                int n) {
+  int count = 0;
+  // Set vlmax and initialize variables.
+  size_t vlmax = __riscv_vsetvlmax_e64m1();
+  vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
+  vfloat64m1_t vec_s = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
+  for (size_t vl; n > 0; n -= vl, a += vl, b += vl) {
+    vl = __riscv_vsetvl_e64m1(n);
+
+    // Load a[i..i+vl)
+    vfloat64m1_t vec_a = __riscv_vle64_v_f64m1(a, vl);
+    // Load b[i..i+vl)
+    vfloat64m1_t vec_b = __riscv_vle64_v_f64m1(b, vl);
+
+    // Compute a mask whose enabled elements will correspond to the
+    // elements of a that are not 42.
+    vbool64_t mask = __riscv_vmfne_vf_f64m1_b64(vec_a, 42.0, vl);
+
+    // for all e in [0..vl)
+    //   vec_s[e] ← vec_s[e] + vec_a[e] * vec_b[e], if mask[e] is enabled
+    //   vec_s[e]                                 , otherwise (mask undisturbed)
+    vec_s = __riscv_vfmacc_vv_f64m1_tumu(mask, vec_s, vec_a, vec_b, vl);
+
+    // Add to count the number of elements enabled in the mask.
+    count += __riscv_vcpop_m_b64(mask, vl);
+  }
+
+  // Final accumulation.
+  vfloat64m1_t vec_sum =
+      __riscv_vfredusum_vs_f64m1_f64m1(vec_s, vec_zero, vlmax);
+  double sum = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
+
+  // Return values.
+  *result_sum = sum;
+  *result_count = count;
+}
+----
+====
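+
+The following snippet is a purely illustrative sketch of how the example above
+might be exercised; the routine name `check_reduce`, the array contents, and
+the tolerance are arbitrary choices and not part of this document. Because
+`vfredusum` performs an unordered sum, the vector result may differ from the
+sequential reference by floating-point rounding, so the sums are compared with
+a tolerance.
+
+[,c]
+----
+#include <assert.h>
+#include <math.h>
+
+void check_reduce(void) {
+  double a[] = {1.0, 42.0, 3.0, 4.0, 42.0, 6.0};
+  double b[] = {2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
+  int n = sizeof(a) / sizeof(a[0]);
+
+  double sum_ref, sum_rvv;
+  int count_ref, count_rvv;
+  reduce_reference(a, b, &sum_ref, &count_ref, n);
+  reduce_rvv(a, b, &sum_rvv, &count_rvv, n);
+
+  // Both versions must agree on how many pairs took part in the dot product.
+  assert(count_ref == count_rvv);
+  // The sums may differ only by rounding due to reassociation.
+  assert(fabs(sum_ref - sum_rvv) < 1e-9);
+}
+----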