From 0694f4a1ea1528e4b915b471b14f72f9e6c28b26 Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibanez
Date: Wed, 15 May 2024 13:55:09 +0000
Subject: [PATCH 1/7] Add a section with examples

The examples are non-normative. I've taken a subset of the examples in the
examples directory of the repository.
---
 doc/header.adoc                 |   2 +
 doc/rvv-intrinsic-examples.adoc | 297 ++++++++++++++++++++++++++++++++
 2 files changed, 299 insertions(+)
 create mode 100644 doc/rvv-intrinsic-examples.adoc

diff --git a/doc/header.adoc b/doc/header.adoc
index 04dfcaf7e..f55edf3a4 100644
--- a/doc/header.adoc
+++ b/doc/header.adoc
@@ -47,6 +47,8 @@ include::preface.adoc[]
 
 include::rvv-intrinsic-spec.adoc[]
 
+include::rvv-intrinsic-examples.adoc[]
+
 ifeval::["{build-type}" != "quick"]
 [appendix]
 == Explicit (Non-overloaded) intrinsics
diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
new file mode 100644
index 000000000..f0a0fc578
--- /dev/null
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -0,0 +1,297 @@
+== Examples
+
+NOTE: This section is non-normative.
+
+NOTE: No claims about efficiency are made about the examples presented in this section.
+
+This section presents examples that use the RVV intrinsics specified in this
+document. The examples are in C and assume `#include <riscv_vector.h>` has
+appeared earlier in the source code.
+
+=== Memory copy
+
+.An implementation of the `memcpy` function of the C Standard library using RVV intrinsics.
+====
+[,c]
+----
+void *memcpy_rvv(void *restrict destination, const void *restrict source,
+                 size_t n) {
+  unsigned char *dst = destination;
+  const unsigned char *src = source;
+  // copy data byte by byte
+  for (size_t vl; n > 0; n -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e8m8(n);
+    // Load src[0..vl)
+    vuint8m8_t vec_src = __riscv_vle8_v_u8m8(src, vl);
+    // Store dst[0..vl)
+    __riscv_vse8_v_u8m8(dst, vec_src, vl);
+    // src is incremented vl (bytes)
+    // dst is incremented vl (bytes)
+    // n is decremented vl
+  }
+  return destination;
+}
+----
+====
+
+=== SAXPY
+
+Consider the following function that implements a SAXPY-like kernel.
+
+[,c]
+----
+void saxpy_reference(size_t n, const float a, const float *x, float *y) {
+  for (size_t i = 0; i < n; ++i) {
+    y[i] = a * x[i] + y[i];
+  }
+}
+----
+
+.An implementation of SAXPY using RVV intrinsics.
+====
+[,c]
+----
+void saxpy_rvv(size_t n, const float a, const float *x, float *y) {
+  for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+    vl = __riscv_vsetvl_e32m8(n);
+    // Load x[i..i+vl)
+    vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl);
+    // Load y[i..i+vl)
+    vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl);
+    // Computes vy[0..vl) + a*vx[0..vl)
+    // and stores it in y[i..i+vl)
+    __riscv_vse32_v_f32m8(y, __riscv_vfmacc_vf_f32m8(vy, a, vx, vl), vl);
+  }
+}
+----
+====
+
+=== Matrix multiplication
+
+Consider the following function that implements a naive matrix multiplication.
+
+[,c]
+----
+// matrix multiplication
+// C[0..n)[0..m) = A[0..n)[0..p] x B[0..p)[0..m)
+void matmul_reference(double *a, double *b, double *c, int n, int m, int p) {
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < m; ++j) {
+      c[i * n + j] = 0;
+      for (int k = 0; k < p; ++k) {
+        c[i * n + j] += a[i * p + k] * b[k * m + j];
+      }
+    }
+}
+----
+
+The following example is a version of the matrix multiplication. The
+accumulation on `c[i * n + j]` is implemented using partial accumulations
+followed by a single final accumulation.
+
+.An implementation of a naive matrix multiplication using RVV intrinsics.
+====
+[,c]
+----
+void matmul_rvv(double *a, double *b, double *c, int n, int m, int p) {
+  size_t vlmax = __riscv_vsetvlmax_e64m1();
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < m; ++j) {
+      double *ptr_a = &a[i * p];
+      double *ptr_b = &b[j];
+      int k = p;
+      // Set accumulator to zero.
+      vfloat64m1_t vec_s = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
+      vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
+      for (size_t vl; k > 0; k -= vl) {
+        vl = __riscv_vsetvl_e64m1(k);
+
+        // Load row a[i][k..k+vl)
+        vfloat64m1_t vec_a = __riscv_vle64_v_f64m1(ptr_a, vl);
+        // Load column b[k..k+vl)[j]
+        vfloat64m1_t vec_b =
+            __riscv_vlse64_v_f64m1(ptr_b, sizeof(double) * m, vl);
+
+        // Accumulate dot product of row and column. If vl < vlmax
+        // we need to preserve the existin values of vec_s, hence the
+        // tu policy.
+        vec_s = __riscv_vfmacc_vv_f64m1_tu(vec_s, vec_a, vec_b, vl);
+      }
+
+      // Final accumulation.
+      vfloat64m1_t vec_sum =
+          __riscv_vfredusum_vs_f64m1_f64m1(vec_s, vec_zero, vlmax);
+      double sum = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
+      c[i * n + j] = sum;
+    }
+}
+----
+====
+
+
+=== String copy
+
+.An implementation of the `strcpy` function of the C Standard Library using RVV intrinsics.
+====
+[,c]
+----
+char *strcpy_rvv(char *destination, const char *source) {
+  unsigned char *dst = (unsigned char *)destination;
+  unsigned char *src = (unsigned char *)source;
+  size_t vlmax = __riscv_vsetvlmax_e8m8();
+  long first_set_bit = -1;
+
+  // This loop stops when among the loaded bytes we find the null byte
+  // of the string .i.e. when first_set_bit >= 0
+  for (size_t vl; first_set_bit < 0; src += vl, dst += vl) {
+    // Load up to vlmax elements if possible.
+    // vl is set to the maximum number of elements that could be loaded
+    // without causing a memory fault.
+    vuint8m8_t vec_src = __riscv_vle8ff_v_u8m8(src, &vl, vlmax);
+
+    // Mask that states where null bytes are in the loaded bytes.
+    vbool1_t string_terminate = __riscv_vmseq_vx_u8m8_b1(vec_src, 0, vl);
+
+    // If the null byte is not in the loaded bytes the resulting mask will
+    // be all ones, otherwise only the elements up to and including the
+    // first null byte of the string will be enabled.
+    vbool1_t mask = __riscv_vmsif_m_b1(string_terminate, vl);
+
+    // Store the enabled elements as determined by the mask above.
+    __riscv_vse8_v_u8m8_m(mask, dst, vec_src, vl);
+
+    // Determine if we found the null byte in the loaded bytes.
+    // If not found, first_set_bit is set to all ones (i.e., -1), otherwise
+    // first_set_bit will be the number of the first element enabled in the
+    // mask.
+    first_set_bit = __riscv_vfirst_m_b1(string_terminate, vl);
+  }
+  return destination;
+}
+----
+====
+
+=== Control flow
+
+Consider the following function that computes the division of two arrays
+elementwise but sets the result to a given value when the element of the
+divisor array is zero.
+
+[,c]
+----
+void branch_ref(double *a, double *b, double *c, int n, double constant) {
+  for (int i = 0; i < n; ++i) {
+    c[i] = (b[i] != 0.0) ? a[i] / b[i] : constant;
+  }
+}
+----
+
+The following example applies if-conversion using masks to implement the
+semantics of the conditional operator.
+
+.An implementation of `branch_ref` using RVV intrinsics.
+====
+[,c]
+----
+void branch_rvv(double *a, double *b, double *c, int n, double constant) {
+  // set vlmax and initialize variables
+  size_t vlmax = __riscv_vsetvlmax_e64m1();
+  // "Broadcast" the value of constant to all (vlmax) the elements in
+  // vec_constant.
+  vfloat64m1_t vec_constant = __riscv_vfmv_v_f_f64m1(constant, vlmax);
+  for (size_t vl; n > 0; n -= vl, a += vl, b += vl, c += vl) {
+    vl = __riscv_vsetvl_e64m1(n);
+
+    // Load a[i..i+vl)
+    vfloat64m1_t vec_a = __riscv_vle64_v_f64m1(a, vl);
+    // Load b[i..i+vl)
+    vfloat64m1_t vec_b = __riscv_vle64_v_f64m1(b, vl);
+
+    // Compute a mask whose enabled elements will correspond to the
+    // elements of b that are not zero.
+    vbool64_t mask = __riscv_vmfne_vf_f64m1_b64(vec_b, 0.0, vl);
+
+    // Use mask undisturbed policy to compute the division for the
+    // elements enabled in the mask, otherwise set them to the given
+    // constant above (maskedoff).
+    vfloat64m1_t vec_c = __riscv_vfdiv_vv_f64m1_mu(
+        mask, /*maskedoff*/ vec_constant, vec_a, vec_b, vl);
+
+    // Store into c[i..i+vl)
+    __riscv_vse64_v_f64m1(c, vec_c, vl);
+  }
+}
+----
+====
+
+=== Reduction and counting
+
+Consider the following function that computes the dot product of two arrays
+excluding elements of the first array (along with the correspondign element
+of the second array) where the value is 42. The function also counts how many
+pairs of elements took part of the dot-product.
+
+[,c]
+----
+void reduce_reference(double *a, double *b, double *result_sum,
+                      int *result_count, int n) {
+  int count = 0;
+  double s = 0.0;
+  for (int i = 0; i < n; ++i) {
+    if (a[i] != 42.0) {
+      s += a[i] * b[i];
+      count++;
+    }
+  }
+
+  *result_sum = s;
+  *result_count = count;
+}
+----
+
+The following example implements the accumulation of the `s` variable doing
+several partial accumulations followed by a final accumulation.
+
+.An implementation of `reduce_reference` using RVV intrinsics.
+==== +[,c] +---- +void reduce_rvv(double *a, double *b, double *result_sum, int *result_count, + int n) { + int count = 0; + // set vlmax and initialize variables + size_t vlmax = __riscv_vsetvlmax_e64m1(); + vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0.0, vlmax); + vfloat64m1_t vec_s = __riscv_vfmv_v_f_f64m1(0.0, vlmax); + for (size_t vl; n > 0; n -= vl, a += vl, b += vl) { + vl = __riscv_vsetvl_e64m1(n); + + // Load a[i..i+vl) + vfloat64m1_t vec_a = __riscv_vle64_v_f64m1(a, vl); + // Load b[i..i+vl) + vfloat64m1_t vec_b = __riscv_vle64_v_f64m1(b, vl); + + // Compute a mask whose enabled elements will correspond to the + // elements of a that are not 42. + vbool64_t mask = __riscv_vmfne_vf_f64m1_b64(vec_a, 42, vl); + + // for all e in [0..vl) + // vec_s[e] ← vec_s[e] + vec_a[e] * vec_b[e], if mask[e] is enabled + // vec_s[e] , otherwise (mask undisturbed) + vec_s = __riscv_vfmacc_vv_f64m1_tumu(mask, vec_s, vec_a, vec_b, vl); + + // Adds to count the number of elements in mask that are enabled. + count = count + __riscv_vcpop_m_b64(mask, vl); + } + + vfloat64m1_t vec_sum; + // Final accumulation. + vec_sum = __riscv_vfredusum_vs_f64m1_f64m1(vec_s, vec_zero, vlmax); + double sum = __riscv_vfmv_f_s_f64m1_f64(vec_sum); + + // Return values. + *result_sum = sum; + *result_count = count; +} +---- +==== From 551855b2a2450a6f68b3d4bdee594263620efe7f Mon Sep 17 00:00:00 2001 From: Roger Ferrer Ibanez Date: Wed, 15 May 2024 16:07:32 +0000 Subject: [PATCH 2/7] Fix typo and reflow comment. 
---
 doc/rvv-intrinsic-examples.adoc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
index f0a0fc578..67810a13b 100644
--- a/doc/rvv-intrinsic-examples.adoc
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -112,9 +112,8 @@
         vfloat64m1_t vec_b =
             __riscv_vlse64_v_f64m1(ptr_b, sizeof(double) * m, vl);
 
-        // Accumulate dot product of row and column. If vl < vlmax
-        // we need to preserve the existin values of vec_s, hence the
-        // tu policy.
+        // Accumulate dot product of row and column. If vl < vlmax we need to
+        // preserve the existing values of vec_s, hence the tu policy.
         vec_s = __riscv_vfmacc_vv_f64m1_tu(vec_s, vec_a, vec_b, vl);
       }
 

From fdf08e087c610fc085e28254f07dc1bb9ae51745 Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibanez
Date: Fri, 17 May 2024 06:33:14 +0000
Subject: [PATCH 3/7] Fix typo in the interval notation for A

---
 doc/rvv-intrinsic-examples.adoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
index 67810a13b..6a072ac71 100644
--- a/doc/rvv-intrinsic-examples.adoc
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -73,7 +73,7 @@
 Consider the following function that implements a naive matrix multiplication.
 
 [,c]
 ----
 // matrix multiplication
-// C[0..n)[0..m) = A[0..n)[0..p] x B[0..p)[0..m)
+// C[0..n)[0..m) = A[0..n)[0..p) x B[0..p)[0..m)
 void matmul_reference(double *a, double *b, double *c, int n, int m, int p) {

From ea55dcc7a8bce96e08f32f086e08be460253e228 Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibanez
Date: Fri, 17 May 2024 06:34:08 +0000
Subject: [PATCH 4/7] Fix typo and add limit the maximum number of elements in
 comment

---
 doc/rvv-intrinsic-examples.adoc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
index 6a072ac71..307886a19 100644
--- a/doc/rvv-intrinsic-examples.adoc
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -141,11 +141,11 @@ char *strcpy_rvv(char *destination, const char *source) {
   long first_set_bit = -1;
 
   // This loop stops when among the loaded bytes we find the null byte
-  // of the string .i.e. when first_set_bit >= 0
+  // of the string i.e., when first_set_bit >= 0
   for (size_t vl; first_set_bit < 0; src += vl, dst += vl) {
     // Load up to vlmax elements if possible.
-    // vl is set to the maximum number of elements that could be loaded
-    // without causing a memory fault.
+    // vl is set to the maximum number of elements, no more than vlmax, that
+    // could be loaded without causing a memory fault.
     vuint8m8_t vec_src = __riscv_vle8ff_v_u8m8(src, &vl, vlmax);
 
     // Mask that states where null bytes are in the loaded bytes.

From 630cd689f0933a0ba5a51fd1b978cc666fb5ae4b Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibanez
Date: Fri, 17 May 2024 06:34:31 +0000
Subject: [PATCH 5/7] Typo

---
 doc/rvv-intrinsic-examples.adoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
index 307886a19..f20f5bbb8 100644
--- a/doc/rvv-intrinsic-examples.adoc
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -228,7 +228,7 @@ void branch_rvv(double *a, double *b, double *c, int n, double constant) {
 Consider the following function that computes the dot product of two arrays
 excluding elements of the first array (along with the correspondign element
 of the second array) where the value is 42. The function also counts how many
-pairs of elements took part of the dot-product.
+pairs of elements took part in the dot-product.
 
 [,c]
 ----

From 5cb77eb542f4689238ed2de8ef64e9601b82c9a2 Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibanez
Date: Fri, 17 May 2024 06:34:44 +0000
Subject: [PATCH 6/7] Address review comments

---
 doc/rvv-intrinsic-examples.adoc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
index f20f5bbb8..d69650130 100644
--- a/doc/rvv-intrinsic-examples.adoc
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -272,15 +272,15 @@ void reduce_rvv(double *a, double *b, double *result_sum, int *result_count,
 
     // Compute a mask whose enabled elements will correspond to the
     // elements of a that are not 42.
-    vbool64_t mask = __riscv_vmfne_vf_f64m1_b64(vec_a, 42, vl);
+    vbool64_t mask = __riscv_vmfne_vf_f64m1_b64(vec_a, 42.0, vl);
 
     // for all e in [0..vl)
     //   vec_s[e] ← vec_s[e] + vec_a[e] * vec_b[e], if mask[e] is enabled
     //              vec_s[e]                      , otherwise (mask undisturbed)
     vec_s = __riscv_vfmacc_vv_f64m1_tumu(mask, vec_s, vec_a, vec_b, vl);
 
     // Adds to count the number of elements in mask that are enabled.
-    count = count + __riscv_vcpop_m_b64(mask, vl);
+    count += __riscv_vcpop_m_b64(mask, vl);
   }
 
   vfloat64m1_t vec_sum;

From f083fb14d0188ec955e7a1ee5eb015e8812c6f44 Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibanez
Date: Mon, 20 May 2024 13:06:06 +0000
Subject: [PATCH 7/7] Fix wrong scalar strided access in matrix C of matmul

Where I accidentally used "n" I wanted to use "m". The vector access was
correct already.
---
 doc/rvv-intrinsic-examples.adoc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/rvv-intrinsic-examples.adoc b/doc/rvv-intrinsic-examples.adoc
index d69650130..6de62ca69 100644
--- a/doc/rvv-intrinsic-examples.adoc
+++ b/doc/rvv-intrinsic-examples.adoc
@@ -77,16 +77,16 @@
 void matmul_reference(double *a, double *b, double *c, int n, int m, int p) {
   for (int i = 0; i < n; ++i)
     for (int j = 0; j < m; ++j) {
-      c[i * n + j] = 0;
+      c[i * m + j] = 0;
       for (int k = 0; k < p; ++k) {
-        c[i * n + j] += a[i * p + k] * b[k * m + j];
+        c[i * m + j] += a[i * p + k] * b[k * m + j];
       }
     }
 }
 ----
 
 The following example is a version of the matrix multiplication. The
-accumulation on `c[i * n + j]` is implemented using partial accumulations
+accumulation on `c[i * m + j]` is implemented using partial accumulations
 followed by a single final accumulation.
 
 .An implementation of a naive matrix multiplication using RVV intrinsics.
@@ -121,7 +121,7 @@ void matmul_rvv(double *a, double *b, double *c, int n, int m, int p) {
       vfloat64m1_t vec_sum =
           __riscv_vfredusum_vs_f64m1_f64m1(vec_s, vec_zero, vlmax);
       double sum = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
-      c[i * n + j] = sum;
+      c[i * m + j] = sum;
     }
 }
 ----