@@ -138,13 +138,15 @@ def _run_executor(self, execution_id: int,
138
138
'Failed to created container executor pod!\n Reason: %s\n Body: %s' %
139
139
(e .reason , e .body ))
140
140
141
+ # Wait up to 300 seconds for the pod to move from pending to another status.
141
142
logging .info ('Waiting for pod "%s:%s" to start.' , namespace , pod_name )
142
143
self ._wait_pod (
143
144
core_api ,
144
145
pod_name ,
145
146
namespace ,
146
147
exit_condition_lambda = _pod_is_not_pending ,
147
- condition_description = 'non-pending status' )
148
+ condition_description = 'non-pending status' ,
149
+ timeout_sec = 300 )
148
150
149
151
logging .info ('Start log streaming for pod "%s:%s".' , namespace , pod_name )
150
152
try :
@@ -162,12 +164,14 @@ def _run_executor(self, execution_id: int,
162
164
for log in logs :
163
165
logging .info (log .decode ().rstrip ('\n ' ))
164
166
167
+ # Wait indefinitely for the pod to complete.
165
168
resp = self ._wait_pod (
166
169
core_api ,
167
170
pod_name ,
168
171
namespace ,
169
172
exit_condition_lambda = _pod_is_done ,
170
- condition_description = 'done state' )
173
+ condition_description = 'done state' ,
174
+ timeout_sec = 0 )
171
175
172
176
if resp .status .phase == kube_utils .PodPhase .FAILED .value :
173
177
raise RuntimeError ('Pod "%s:%s" failed with status "%s".' %
@@ -254,7 +258,7 @@ def _wait_pod(self,
254
258
namespace : Text ,
255
259
exit_condition_lambda : Callable [[client .V1Pod ], bool ],
256
260
condition_description : Text ,
257
- timeout_sec : int = 300 ) -> client .V1Pod :
261
+ timeout_sec : int ) -> client .V1Pod :
258
262
"""Wait for a POD to meet an exit condition.
259
263
260
264
Args:
@@ -265,7 +269,8 @@ def _wait_pod(self,
265
269
for a POD to exit. The function returns True to exit.
266
270
condition_description: The description of the exit condition which will be
267
271
set in the error message if the wait times out.
268
- timeout_sec: The seconds for the function to wait. Defaults to 300s.
272
+ timeout_sec: The seconds for the function to wait. Waits indefinitely if
273
+ value is 0.
269
274
270
275
Returns:
271
276
The POD object which meets the exit condition.
@@ -279,8 +284,8 @@ def _wait_pod(self,
279
284
logging .info (resp .status .phase )
280
285
if exit_condition_lambda (resp ):
281
286
return resp
282
- elapse_time = datetime .datetime .utcnow () - start_time
283
- if elapse_time .seconds >= timeout_sec :
287
+ elapsed_time = datetime .datetime .utcnow () - start_time
288
+ if timeout_sec != 0 and elapsed_time .seconds >= timeout_sec :
284
289
raise RuntimeError (
285
290
'Pod "%s:%s" does not reach "%s" within %s seconds.' %
286
291
(namespace , pod_name , condition_description , timeout_sec ))
0 commit comments