Skip to content

Commit 388d056

Browse files
committed
Bugfix: workaround for GH #3002
1 parent e8ccd78 commit 388d056

File tree

4 files changed

+84
-60
lines changed

4 files changed

+84
-60
lines changed

orchestration/aloc/poll-ready.sh

Lines changed: 75 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -29,62 +29,24 @@ while true ; do
2929
esac
3030
done
3131

32-
if [ ! -z "$ALL_RUNNING" ]; then
33-
## Use cluster-state-entity-count-query to initializer to check if
34-
## all of the nodes in the cluster are actually running &
35-
## queryable. The query will hang if one or more of the workers
36-
## has crashed. Unfortunately, that hang makes scripting
37-
## difficult: the `external_sender` proc can hang forever waiting
38-
## for a reply from Wallaroo that will never arrive.
39-
##
40-
## If a worker *has* crashed, then a `cluster-status-query` that
41-
## is sent to any running worker process will return successfully.
42-
## That's not what we want to know.
43-
##
44-
## The only way that I can think of around this problem is to send
45-
## a `cluster-status-query` and then parse the output, e.g.,
46-
## Processing messages: true, Worker count: 2, Workers: |initializer,worker2,|,
47-
## then map the worker name -> Wallaroo external TCP port, then
48-
## send a `cluster-status-query` to each of the workers. But that
49-
## embeds a lot more Wallaroo internal knowledge (and also the TCP
50-
## port number convention used by these shell scripts).
51-
##
52-
## NOTE: GH bug #3002 means that we can DoS ourselves by sending
53-
## this query too soon! {sigh}
32+
## If a worker has crashed, then a `cluster-status-query` that
33+
## is sent to any running worker process will return successfully.
34+
## That's not what we want to know.
35+
##
36+
## NOTE: GH bug #3002 means that we can DoS ourselves by sending
37+
## this query too soon! {sigh}
38+
##
39+
## If we send a cluster-state-entity-count-query to the initializer to check
40+
## whether all of the nodes in the cluster are actually running &
41+
## queryable, then the query will hang if one or more of the workers
42+
## has crashed. Unfortunately, that hang makes scripting
43+
## difficult: the `external_sender` proc can hang forever waiting
44+
## for a reply from Wallaroo that will never arrive.
45+
##
46+
## Our workaround is to use our external TCP port numbering scheme to
47+
## query each worker directly. We assume that the initializer's
48+
## cluster membership info is the Source of Truth(tm).
5449

55-
if [ ! -z "$VERBOSE" ]; then
56-
echo -n "Entity count: "
57-
fi
58-
OUTFILE=`tempfile -d /tmp`
59-
trap "rm -f $OUTFILE" 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
60-
for i in `seq 1 $COUNT`; do
61-
../../testing/tools/external_sender/external_sender \
62-
-e $WALLAROO_ARG_EXTERNAL \
63-
-t cluster-state-entity-count-query > $OUTFILE 2>&1 &
64-
PID=$!
65-
sleep 0.1
66-
grep -s initializer $OUTFILE > /dev/null 2>&1
67-
if [ $? -eq 0 ]; then
68-
if [ ! -z "$VERBOSE" ]; then
69-
echo Success
70-
fi
71-
break
72-
fi
73-
if [ ! -z "$VERBOSE" ]; then
74-
echo -n .
75-
fi
76-
done
77-
if [ $i -eq $COUNT ]; then
78-
if [ ! -z "$VERBOSE" ]; then
79-
echo Failed
80-
fi
81-
exit 1
82-
fi
83-
fi
84-
85-
if [ ! -z "$VERBOSE" ]; then
86-
echo -n "Processing messages: "
87-
fi
8850
for i in `seq 1 $COUNT`; do
8951
../../testing/tools/external_sender/external_sender \
9052
-e $WALLAROO_ARG_EXTERNAL -t cluster-status-query 2>&1 | \
@@ -98,6 +60,64 @@ for i in `seq 1 $COUNT`; do
9860
sleep 0.1
9961
done
10062

63+
## Bail out when the cluster-status polling loop above ran through all
## $COUNT attempts, i.e. $i still equals $COUNT once the loop is done.
## (Presumably the loop breaks early on success; that part of the hunk is
## not visible here -- confirm.)
if [ $i -eq $COUNT ]; then
    [ -z "$VERBOSE" ] || echo Failed
    exit 1
fi
69+
70+
## Map a worker name to its external TCP port using this directory's port
## numbering scheme: the initializer listens on the base port and workerN
## listens on base + 10*N.
##
## Globals:   WALLAROO_MY_EXTERNAL_BASE (read; defaults to 7103 when unset)
## Arguments: $1 - worker name as reported by cluster-status-query
## Outputs:   the port number on stdout
## Returns:   0 on success, 1 if the name does not fit the scheme
worker_external_port() {
    wep_base=${WALLAROO_MY_EXTERNAL_BASE:-7103}
    case $1 in
        initializer)
            echo "$wep_base"
            ;;
        worker*)
            wep_n=${1#worker}
            ## Reject an empty or non-numeric suffix instead of handing
            ## garbage to expr and polling a nonsense port.
            case $wep_n in
                ''|*[!0-9]*) return 1 ;;
            esac
            expr "$wep_base" + "$wep_n" \* 10
            ;;
        *)
            return 1
            ;;
    esac
}

## When ALL_RUNNING is set, ask the initializer for its list of workers and
## then poll every worker's own external port until it answers
## "Processing messages: true".
##
## The loop counter $i is deliberately shared with the check that follows
## this block: leaving $i equal to $COUNT signals "a worker never became
## ready".  As with the other checks in this script, a success on the very
## last attempt is indistinguishable from a failure.
##
## NOTE(review): sample-env-vars.sh stops exporting WALLAROO_ARG_EXTERNAL in
## this same commit -- confirm it is still set before this point.
## NOTE(review): workers are always queried on 127.0.0.1 -- confirm that is
## intended for multi-host setups.
if [ -n "$ALL_RUNNING" ]; then
    ## Example status line:
    ##   Processing messages: true, Worker count: 2, Workers: |initializer,worker2,|,
    ## The sed/tr pipeline reduces it to "initializer worker2".
    workers=$(../../testing/tools/external_sender/external_sender \
                  -e "$WALLAROO_ARG_EXTERNAL" -t cluster-status-query 2>&1 | \
              grep -s 'Processing messages: ' | \
              sed -e 's/.*Workers: .//' -e 's/,|.*//' | \
              tr ',' ' ')

    ## An empty list means we could not learn the membership at all; do not
    ## let that pass as "all workers are running".
    if [ -z "$workers" ]; then
        echo Error: no workers reported by cluster-status-query
        exit 1
    fi

    ## Intentionally unquoted: $workers is a space-separated list of names.
    for worker in $workers; do
        if [ -n "$VERBOSE" ]; then
            echo -n "Worker $worker: "
        fi
        if ! port=$(worker_external_port "$worker"); then
            echo Error: unknown worker $worker
            exit 1
        fi
        if [ -n "$VERBOSE" ]; then
            echo -n "port = $port"
        fi
        for i in $(seq 1 "$COUNT"); do
            if ../../testing/tools/external_sender/external_sender \
                   -e "127.0.0.1:$port" -t cluster-status-query 2>&1 | \
               grep -s 'Processing messages: true' > /dev/null 2>&1
            then
                if [ -n "$VERBOSE" ]; then
                    echo ""
                fi
                break
            fi
            if [ -n "$VERBOSE" ]; then
                echo -n .
            fi
            sleep 0.1
        done
        ## Stop at the first worker that never became ready, regardless of
        ## VERBOSE.  Otherwise a later healthy worker would reset $i and
        ## hide the failure from the check that follows this block.
        if [ "$i" -eq "$COUNT" ]; then
            break
        fi
    done
fi
120+
101121
if [ $i -eq $COUNT ]; then
102122
if [ ! -z "$VERBOSE" ]; then
103123
echo Failed

orchestration/aloc/sample-env-vars.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ export WALLAROO_METRICS_HOST=$WALLAROO_INIT_HOST
99
export WALLAROO_IN_BASE=7100
1010
export WALLAROO_CONTROL_BASE=7101
1111
export WALLAROO_DATA_BASE=7102
12-
export WALLAROO_EXTERNAL_BASE=7103
12+
export WALLAROO_MY_EXTERNAL_BASE=7103
1313
export WALLAROO_MY_CONTROL_BASE=7104
1414
export WALLAROO_MY_DATA_BASE=7105
1515

@@ -19,9 +19,8 @@ export WALLAROO_ARG_OUT="${WALLAROO_OUT_HOST}:7200"
1919
export WALLAROO_ARG_METRICS="${WALLAROO_METRICS_HOST}:5001"
2020
export WALLAROO_ARG_CONTROL="${WALLAROO_INIT_HOST}:${WALLAROO_CONTROL_BASE}"
2121
export WALLAROO_ARG_DATA="${WALLAROO_INIT_HOST}:${WALLAROO_DATA_BASE}"
22-
export WALLAROO_ARG_EXTERNAL="${WALLAROO_INIT_HOST}:${WALLAROO_EXTERNAL_BASE}"
2322
export WALLAROO_ARG_RESILIENCE="--run-with-resilience"
2423
export WALLAROO_ARG_PONY="--ponynoblock --ponythreads=1 --ponyminthreads=9999"
2524

26-
export WALLAROO_BASE_ARGS="--out $WALLAROO_ARG_OUT --metrics $WALLAROO_ARG_METRICS --control $WALLAROO_ARG_CONTROL --external $WALLAROO_ARG_EXTERNAL $WALLAROO_ARG_RESILIENCE"
25+
export WALLAROO_BASE_ARGS="--out $WALLAROO_ARG_OUT --metrics $WALLAROO_ARG_METRICS --control $WALLAROO_ARG_CONTROL $WALLAROO_ARG_RESILIENCE"
2726

orchestration/aloc/start-initializer.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,13 @@ fi
4141
my_in=`echo $WALLAROO_ARG_IN | \
4242
sed -e "s/__IN_HOST__/$WALLAROO_INIT_HOST/" \
4343
-e "s/__IN_PORT__/$WALLAROO_IN_BASE/"`
44+
my_external="${WALLAROO_INIT_HOST}:${WALLAROO_MY_EXTERNAL_BASE}"
4445

4546
cmd="$WALLAROO_BIN --in $my_in \
4647
$WALLAROO_BASE_ARGS --data $WALLAROO_ARG_DATA \
47-
--cluster-initializer --worker-count $NUM_WORKERS \
48+
--cluster-initializer \
49+
--external $my_external \
50+
--worker-count $NUM_WORKERS \
4851
$WALLAROO_ARG_PONY"
4952
if [ ! -z "$VERBOSE" ]; then
5053
echo "cmd: $cmd /tmp/wallaroo.1 2>&1 &"

orchestration/aloc/start-worker.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,14 @@ my_in_port=`expr $WALLAROO_IN_BASE + $my_shift`
5050
my_in=`echo $WALLAROO_ARG_IN | \
5151
sed -e "s/__IN_HOST__/$WALLAROO_INIT_HOST/" \
5252
-e "s/__IN_PORT__/$my_in_port/"`
53+
my_external="${my_ip}:`expr $WALLAROO_MY_EXTERNAL_BASE + $my_shift`"
5354
my_control="${my_ip}:`expr $WALLAROO_MY_CONTROL_BASE + $my_shift`"
5455
my_data="${my_ip}:`expr $WALLAROO_MY_DATA_BASE + $my_shift`"
5556

5657
cmd="$WALLAROO_BIN --in $my_in \
5758
$WALLAROO_BASE_ARGS \
58-
--name worker$WORKER --my-control $my_control --my-data $my_data \
59+
--name worker$WORKER --external $my_external \
60+
--my-control $my_control --my-data $my_data \
5961
$JOIN_ARG $WALLAROO_ARG_PONY"
6062
if [ ! -z "$VERBOSE" ]; then
6163
echo "cmd: $cmd /tmp/wallaroo.$WORKER 2>&1 &"

0 commit comments

Comments
 (0)