@@ -10,10 +10,12 @@ use crate::worker::start::RunningTaskContext;
1010use crate :: { JobId , JobTaskId , Map , make_tako_id, unwrap_tako_id} ;
1111use std:: path:: Path ;
1212use tako:: gateway:: NewTasksMessage ;
13- use tako:: { ItemId , WorkerId } ;
13+ use tako:: { InstanceId , ItemId , WorkerId } ;
1414
1515struct RestorerTaskInfo {
1616 state : JobTaskState ,
17+ instance_id : Option < InstanceId > ,
18+ crash_counter : u32 ,
1719}
1820
1921impl RestorerTaskInfo {
@@ -77,15 +79,17 @@ impl RestorerJob {
7779
7880 for ( task_id, job_task) in job. tasks . iter_mut ( ) {
7981 if let Some ( task) = self . tasks . get_mut ( task_id) {
82+ if task. crash_counter > 0 || task. instance_id . is_some ( ) {
83+ new_tasks. adjust_instance_id_and_crash_counters . insert (
84+ make_tako_id ( job_id, * task_id) ,
85+ (
86+ task. instance_id . map ( |x| x. as_num ( ) + 1 ) . unwrap_or ( 0 ) . into ( ) ,
87+ task. crash_counter ,
88+ ) ,
89+ ) ;
90+ }
8091 match & task. state {
81- JobTaskState :: Waiting => continue ,
82- JobTaskState :: Running { started_data } => {
83- let instance_id = started_data. context . instance_id . as_num ( ) + 1 ;
84- new_tasks
85- . adjust_instance_id
86- . insert ( make_tako_id ( job_id, * task_id) , instance_id. into ( ) ) ;
87- continue ;
88- }
92+ JobTaskState :: Waiting | JobTaskState :: Running { .. } => continue ,
8993 JobTaskState :: Finished { .. } => job. counters . n_finished_tasks += 1 ,
9094 JobTaskState :: Failed { .. } => job. counters . n_failed_tasks += 1 ,
9195 JobTaskState :: Canceled { .. } => job. counters . n_canceled_tasks += 1 ,
@@ -112,6 +116,19 @@ impl RestorerJob {
112116 pub fn add_submit ( & mut self , submit : SubmittedJobDescription ) {
113117 self . submit_descs . push ( submit)
114118 }
119+
120+ pub fn increase_crash_counters ( & mut self , worker_id : WorkerId ) {
121+ for task in self . tasks . values_mut ( ) {
122+ match & task. state {
123+ JobTaskState :: Running { started_data }
124+ if started_data. worker_ids . contains ( & worker_id) =>
125+ {
126+ task. crash_counter += 1 ;
127+ }
128+ _ => { }
129+ }
130+ }
131+ }
115132}
116133
117134#[ derive( Default ) ]
@@ -183,7 +200,13 @@ impl StateRestorer {
183200 log:: debug!( "Replaying: WorkerConnected {worker_id}" ) ;
184201 self . max_worker_id = self . max_worker_id . max ( worker_id. as_num ( ) ) ;
185202 }
186- EventPayload :: WorkerLost ( _, _) => { }
203+ EventPayload :: WorkerLost ( worker_id, reason) => {
204+ if reason. is_failure ( ) {
205+ for job in self . jobs . values_mut ( ) {
206+ job. increase_crash_counters ( worker_id) ;
207+ }
208+ }
209+ }
187210 EventPayload :: WorkerOverviewReceived ( _) => { }
188211 EventPayload :: Submit {
189212 job_id,
@@ -232,6 +255,8 @@ impl StateRestorer {
232255 worker_ids : workers,
233256 } ,
234257 } ,
258+ instance_id : Some ( instance_id) ,
259+ crash_counter : 0 ,
235260 } ,
236261 ) ;
237262 }
@@ -300,6 +325,8 @@ impl StateRestorer {
300325 started_data : None ,
301326 cancelled_date : event. time ,
302327 } ,
328+ instance_id : None ,
329+ crash_counter : 0 ,
303330 } ,
304331 ) ;
305332 }
0 commit comments