func(omp): make repeated reduce function more flexible

csegarragonz · csegarragonz · commit d6b8545ec888 · 2024-03-18T16:14:31.000Z
diff --git a/func/omp/repeated_reduce.cpp b/func/omp/repeated_reduce.cpp
@@ -5,13 +5,17 @@
 #include <math.h>
 #include <omp.h>
 #include <unistd.h>
+#include <vector>
 
-bool doReduce()
+// This reduce method is called with a varying number of threads, up to a
+// maximum of 10. In addition, the inner parallel for pragma may be
+// elastically scaled from numThreads all the way up to 10.
+bool doReduce(int numThreads)
 {
-    int nThreads = 10;
     int chunkSize = 1000;
-    int loopSize = nThreads * chunkSize;
-    int counts[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    int loopSize = numThreads * chunkSize;
+    int maxNumThreads = 10;
+    std::vector<int> counts(maxNumThreads, 0);
 
     int reducedA = 0;
     int reducedB = 0;
@@ -21,7 +25,7 @@ bool doReduce()
     FAASM_REDUCE(reducedA, FAASM_TYPE_INT, FAASM_OP_SUM)
     FAASM_REDUCE(reducedB, FAASM_TYPE_INT, FAASM_OP_SUM)
 
-#pragma omp parallel for num_threads(nThreads) default(none)                   \
+#pragma omp parallel for num_threads(numThreads) default(none)                 \
   shared(counts, loopSize, success) reduction(+ : reducedA, reducedB)
     for (int i = 0; i < loopSize; i++) {
         int threadNum = omp_get_thread_num();
@@ -50,17 +54,53 @@ bool doReduce()
         return 1;
     }
 
-    // Check counts
-    for (int t = 0; t < nThreads; t++) {
-        if (counts[t] != chunkSize) {
-            printf(
-              "Loop count for thread %i: %i != %i\n", t, counts[t], chunkSize);
+    // First, work out how many threads actually executed the loop, by checking
+    // how many threads wrote to the counts array
+    int actualNumThreads = 0;
+    for (int i = 0; i < counts.size(); i++) {
+        if (counts.at(i) != 0) {
+            actualNumThreads++;
+        }
+    }
+
+    if ((actualNumThreads < numThreads) || (actualNumThreads > maxNumThreads)) {
+        printf("Actual number of threads outside valid range: %i \\not \\in "
+               "[%i, %i]\n",
+               actualNumThreads,
+               numThreads,
+               maxNumThreads);
+
+        // Exit early in this case, as the subsequent checks may seg-fault
+        return false;
+    }
+
+    // Check counts (only count the aggregate, and a uniform distribution, as
+    // we may elastically change the parallelism of the loop)
+    int actualChunkSize = (int)loopSize / actualNumThreads;
+    int total = 0;
+    for (int tNum = 0; tNum < actualNumThreads; tNum++) {
+        if (counts[tNum] != actualChunkSize) {
+            printf("Loop count for thread %i: %i != %i\n",
+                   tNum,
+                   counts[tNum],
+                   actualChunkSize);
             success = false;
         }
+
+        total += counts[tNum];
+    }
+
+    if (total != loopSize) {
+        printf("Total loop count failed: %i != %i\n", total, loopSize);
+        success = false;
     }
 
-    int expectedFinalReducedA = 550000;
-    int expectedFinalReducedB = 825000;
+    // The expected final value is: constant (10 for A, 15 for B) *
+    // sum(1..actualNumThreads) * actualChunkSize
+    int expectedFinalReducedA =
+      (int)10 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize;
+    int expectedFinalReducedB =
+      (int)15 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize;
 
     if (reducedA != expectedFinalReducedA) {
         printf("reducedA %i != %i\n", reducedA, expectedFinalReducedA);
@@ -82,12 +122,11 @@ int main(int argc, char* argv[])
         printf("Incorrect number of threads passed as input: %i\n", numThreads);
         return 1;
     }
-    omp_set_num_threads(numThreads);
 
     // Run reduce in a loop and check each iteration is correct
     int nLoops = 10;
     for (int i = 0; i < nLoops; i++) {
-        bool success = doReduce();
+        bool success = doReduce(numThreads);
         if (!success) {
             printf("Repeated reduce failed on loop %i\n", i);
             return 1;