5
5
#include < math.h>
6
6
#include < omp.h>
7
7
#include < unistd.h>
8
+ #include < vector>
8
9
9
- bool doReduce ()
10
+ // This reduce method is called with a varying number of threads, but with
11
+ // a maximum of 10. In addition, the inner parallel for pragma may be
12
+ // elastically scaled from nThreads, all the way up to 10.
13
+ bool doReduce (int numThreads)
10
14
{
11
- int nThreads = 10 ;
12
15
int chunkSize = 1000 ;
13
- int loopSize = nThreads * chunkSize;
14
- int counts[] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
16
+ int loopSize = numThreads * chunkSize;
17
+ int maxNumThreads = 10 ;
18
+ std::vector<int > counts (maxNumThreads, 0 );
15
19
16
20
int reducedA = 0 ;
17
21
int reducedB = 0 ;
@@ -21,7 +25,7 @@ bool doReduce()
21
25
FAASM_REDUCE (reducedA, FAASM_TYPE_INT, FAASM_OP_SUM)
22
26
FAASM_REDUCE (reducedB, FAASM_TYPE_INT, FAASM_OP_SUM)
23
27
24
- #pragma omp parallel for num_threads(nThreads ) default(none) \
28
+ #pragma omp parallel for num_threads(numThreads ) default(none) \
25
29
shared (counts, loopSize, success) reduction (+ : reducedA, reducedB)
26
30
for (int i = 0 ; i < loopSize; i++) {
27
31
int threadNum = omp_get_thread_num ();
@@ -50,17 +54,53 @@ bool doReduce()
50
54
return 1 ;
51
55
}
52
56
53
- // Check counts
54
- for (int t = 0 ; t < nThreads; t++) {
55
- if (counts[t] != chunkSize) {
56
- printf (
57
- " Loop count for thread %i: %i != %i\n " , t, counts[t], chunkSize);
57
+ // First, work out how many threads actually executed the loop, by checking
58
+ // how many threads wrote to the counts array
59
+ int actualNumThreads = 0 ;
60
+ for (int i = 0 ; i < counts.size (); i++) {
61
+ if (counts.at (i) != 0 ) {
62
+ actualNumThreads++;
63
+ }
64
+ }
65
+
66
+ if ((actualNumThreads < numThreads) || (actualNumThreads > maxNumThreads)) {
67
+ printf (" Actual number of threads outside valid range: %i \\ not \\ in "
68
+ " [%i, %i]\n " ,
69
+ actualNumThreads,
70
+ numThreads,
71
+ maxNumThreads);
72
+
73
+ // Exit fast in this case as posterior checks may seg-fault
74
+ return false ;
75
+ }
76
+
77
+ // Check counts (only count the aggregate, and a uniform distribution, as
78
+ // we may elastically change the parallelism of the loop)
79
+ int actualChunkSize = (int )loopSize / actualNumThreads;
80
+ int total = 0 ;
81
+ for (int tNum = 0 ; tNum < actualNumThreads; tNum++) {
82
+ if (counts[tNum] != actualChunkSize) {
83
+ printf (" Loop count for thread %i: %i != %i\n " ,
84
+ tNum,
85
+ counts[tNum],
86
+ actualChunkSize);
58
87
success = false ;
59
88
}
89
+
90
+ total += counts[tNum];
91
+ }
92
+
93
+ if (total != loopSize) {
94
+ printf (" Total loop count failed: %i != %i\n " , total, loopSize);
95
+ success = false ;
60
96
}
61
97
62
- int expectedFinalReducedA = 550000 ;
63
- int expectedFinalReducedB = 825000 ;
98
+ // The expected final value is: constant (10/15) * (sum [1, nThreads]) *
99
+ // chinkSize
100
+ int expectedFinalReducedA =
101
+ (int )10 * actualNumThreads * (actualNumThreads + 1 ) / 2 * actualChunkSize;
102
+ int expectedFinalReducedB =
103
+ (int )15 * actualNumThreads * (actualNumThreads + 1 ) / 2 * actualChunkSize;
64
104
65
105
if (reducedA != expectedFinalReducedA) {
66
106
printf (" reducedA %i != %i\n " , reducedA, expectedFinalReducedA);
@@ -82,12 +122,11 @@ int main(int argc, char* argv[])
82
122
printf (" Incorrect number of threads passed as input: %i\n " , numThreads);
83
123
return 1 ;
84
124
}
85
- omp_set_num_threads (numThreads);
86
125
87
126
// Run reduce in a loop and check each iteration is correct
88
127
int nLoops = 10 ;
89
128
for (int i = 0 ; i < nLoops; i++) {
90
- bool success = doReduce ();
129
+ bool success = doReduce (numThreads );
91
130
if (!success) {
92
131
printf (" Repeated reduce failed on loop %i\n " , i);
93
132
return 1 ;
0 commit comments