@@ -35,13 +35,17 @@ use ballista_core::serde::scheduler::ExecutorMetadata;
35
35
use datafusion_proto:: logical_plan:: AsLogicalPlan ;
36
36
use datafusion_proto:: physical_plan:: AsExecutionPlan ;
37
37
use log:: { debug, error, info, trace, warn} ;
38
+ use std:: collections:: HashMap ;
38
39
use std:: net:: SocketAddr ;
39
40
40
41
use std:: ops:: Deref ;
41
42
42
- use crate :: cluster:: { bind_task_bias, bind_task_round_robin} ;
43
+ use crate :: cluster:: {
44
+ bind_task_bias, bind_task_round_robin, unbind_prepare_failed_tasks,
45
+ } ;
43
46
use crate :: config:: TaskDistributionPolicy ;
44
47
use crate :: scheduler_server:: event:: QueryStageSchedulerEvent ;
48
+ use crate :: state:: execution_graph:: TaskDescription ;
45
49
use std:: time:: { SystemTime , UNIX_EPOCH } ;
46
50
use tonic:: { Request , Response , Status } ;
47
51
@@ -112,10 +116,10 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> SchedulerGrpc
112
116
let active_jobs = self . state . task_manager . get_running_job_cache ( ) ;
113
117
let schedulable_tasks = match self . state . config . task_distribution {
114
118
TaskDistributionPolicy :: Bias => {
115
- bind_task_bias ( available_slots, active_jobs, |_| false ) . await
119
+ bind_task_bias ( available_slots, active_jobs. clone ( ) , |_| false ) . await
116
120
}
117
121
TaskDistributionPolicy :: RoundRobin => {
118
- bind_task_round_robin ( available_slots, active_jobs, |_| false ) . await
122
+ bind_task_round_robin ( available_slots, active_jobs. clone ( ) , |_| false ) . await
119
123
}
120
124
TaskDistributionPolicy :: ConsistentHash { ..} => {
121
125
return Err ( Status :: unimplemented (
@@ -124,14 +128,36 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> SchedulerGrpc
124
128
} ;
125
129
126
130
let mut tasks = vec ! [ ] ;
131
+ let mut prepare_failed_jobs = HashMap :: < String , Vec < TaskDescription > > :: new ( ) ;
127
132
for ( _, task) in schedulable_tasks {
128
- match self . state . task_manager . prepare_task_definition ( task) {
133
+ let job_id = task. partition . job_id . clone ( ) ;
134
+ if prepare_failed_jobs. contains_key ( & job_id) {
135
+ prepare_failed_jobs. entry ( job_id) . or_default ( ) . push ( task) ;
136
+ continue ;
137
+ }
138
+ match self
139
+ . state
140
+ . task_manager
141
+ . prepare_task_definition ( task. clone ( ) )
142
+ {
129
143
Ok ( task_definition) => tasks. push ( task_definition) ,
130
144
Err ( e) => {
131
145
error ! ( "Error preparing task definition: {:?}" , e) ;
146
+ prepare_failed_jobs. entry ( job_id) . or_default ( ) . push ( task) ;
132
147
}
133
148
}
134
149
}
150
+
151
+ unbind_prepare_failed_tasks ( active_jobs, & prepare_failed_jobs) . await ;
152
+ for job_id in prepare_failed_jobs. into_keys ( ) {
153
+ info ! ( "Cancel prepare task definition failed job: {}" , job_id) ;
154
+ self . cancel_job ( job_id) . await . map_err ( |e| {
155
+ let msg = format ! ( "Cancel job error due to {e:?}" ) ;
156
+ error ! ( "{}" , msg) ;
157
+ Status :: internal ( msg)
158
+ } ) ?;
159
+ }
160
+
135
161
Ok ( Response :: new ( PollWorkResult { tasks } ) )
136
162
} else {
137
163
warn ! ( "Received invalid executor poll_work request" ) ;
@@ -527,21 +553,11 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> SchedulerGrpc
527
553
) -> Result < Response < CancelJobResult > , Status > {
528
554
let job_id = request. into_inner ( ) . job_id ;
529
555
info ! ( "Received cancellation request for job {}" , job_id) ;
530
-
531
- self . query_stage_event_loop
532
- . get_sender ( )
533
- . map_err ( |e| {
534
- let msg = format ! ( "Get query stage event loop error due to {e:?}" ) ;
535
- error ! ( "{}" , msg) ;
536
- Status :: internal ( msg)
537
- } ) ?
538
- . post_event ( QueryStageSchedulerEvent :: JobCancel ( job_id) )
539
- . await
540
- . map_err ( |e| {
541
- let msg = format ! ( "Post to query stage event loop error due to {e:?}" ) ;
542
- error ! ( "{}" , msg) ;
543
- Status :: internal ( msg)
544
- } ) ?;
556
+ self . cancel_job ( job_id) . await . map_err ( |e| {
557
+ let msg = format ! ( "Cancel job error due to {e:?}" ) ;
558
+ error ! ( "{}" , msg) ;
559
+ Status :: internal ( msg)
560
+ } ) ?;
545
561
Ok ( Response :: new ( CancelJobResult { cancelled : true } ) )
546
562
}
547
563
0 commit comments