Skip to content

Commit d6b8545

Browse files
committed
func(omp): make repeated reduce function more flexible
1 parent ce44204 commit d6b8545

File tree

1 file changed

+53
-14
lines changed

1 file changed

+53
-14
lines changed

func/omp/repeated_reduce.cpp

+53-14
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,17 @@
55
#include <math.h>
66
#include <omp.h>
77
#include <unistd.h>
8+
#include <vector>
89

9-
bool doReduce()
10+
// This reduce method is called with a varying number of threads, but with
11+
// a maximum of 10. In addition, the inner parallel for pragma may be
12+
// elastically scaled from nThreads, all the way up to 10.
13+
bool doReduce(int numThreads)
1014
{
11-
int nThreads = 10;
1215
int chunkSize = 1000;
13-
int loopSize = nThreads * chunkSize;
14-
int counts[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
16+
int loopSize = numThreads * chunkSize;
17+
int maxNumThreads = 10;
18+
std::vector<int> counts(maxNumThreads, 0);
1519

1620
int reducedA = 0;
1721
int reducedB = 0;
@@ -21,7 +25,7 @@ bool doReduce()
2125
FAASM_REDUCE(reducedA, FAASM_TYPE_INT, FAASM_OP_SUM)
2226
FAASM_REDUCE(reducedB, FAASM_TYPE_INT, FAASM_OP_SUM)
2327

24-
#pragma omp parallel for num_threads(nThreads) default(none) \
28+
#pragma omp parallel for num_threads(numThreads) default(none) \
2529
shared(counts, loopSize, success) reduction(+ : reducedA, reducedB)
2630
for (int i = 0; i < loopSize; i++) {
2731
int threadNum = omp_get_thread_num();
@@ -50,17 +54,53 @@ bool doReduce()
5054
return 1;
5155
}
5256

53-
// Check counts
54-
for (int t = 0; t < nThreads; t++) {
55-
if (counts[t] != chunkSize) {
56-
printf(
57-
"Loop count for thread %i: %i != %i\n", t, counts[t], chunkSize);
57+
// First, work out how many threads actually executed the loop, by checking
58+
// how many threads wrote to the counts array
59+
int actualNumThreads = 0;
60+
for (int i = 0; i < counts.size(); i++) {
61+
if (counts.at(i) != 0) {
62+
actualNumThreads++;
63+
}
64+
}
65+
66+
if ((actualNumThreads < numThreads) || (actualNumThreads > maxNumThreads)) {
67+
printf("Actual number of threads outside valid range: %i \\not \\in "
68+
"[%i, %i]\n",
69+
actualNumThreads,
70+
numThreads,
71+
maxNumThreads);
72+
73+
// Exit fast in this case as posterior checks may seg-fault
74+
return false;
75+
}
76+
77+
// Check counts (only count the aggregate, and a uniform distribution, as
78+
// we may elastically change the parallelism of the loop)
79+
int actualChunkSize = (int)loopSize / actualNumThreads;
80+
int total = 0;
81+
for (int tNum = 0; tNum < actualNumThreads; tNum++) {
82+
if (counts[tNum] != actualChunkSize) {
83+
printf("Loop count for thread %i: %i != %i\n",
84+
tNum,
85+
counts[tNum],
86+
actualChunkSize);
5887
success = false;
5988
}
89+
90+
total += counts[tNum];
91+
}
92+
93+
if (total != loopSize) {
94+
printf("Total loop count failed: %i != %i\n", total, loopSize);
95+
success = false;
6096
}
6197

62-
int expectedFinalReducedA = 550000;
63-
int expectedFinalReducedB = 825000;
98+
// The expected final value is: constant (10/15) * (sum [1, nThreads]) *
99+
// chinkSize
100+
int expectedFinalReducedA =
101+
(int)10 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize;
102+
int expectedFinalReducedB =
103+
(int)15 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize;
64104

65105
if (reducedA != expectedFinalReducedA) {
66106
printf("reducedA %i != %i\n", reducedA, expectedFinalReducedA);
@@ -82,12 +122,11 @@ int main(int argc, char* argv[])
82122
printf("Incorrect number of threads passed as input: %i\n", numThreads);
83123
return 1;
84124
}
85-
omp_set_num_threads(numThreads);
86125

87126
// Run reduce in a loop and check each iteration is correct
88127
int nLoops = 10;
89128
for (int i = 0; i < nLoops; i++) {
90-
bool success = doReduce();
129+
bool success = doReduce(numThreads);
91130
if (!success) {
92131
printf("Repeated reduce failed on loop %i\n", i);
93132
return 1;

0 commit comments

Comments
 (0)