From 0edd25ad249122370a7f443bc5578d9f41fcb1d2 Mon Sep 17 00:00:00 2001 From: "Loevenich, Mathis" Date: Wed, 17 Nov 2021 12:08:52 +0100 Subject: [PATCH 1/6] add: wrapper classes for all htcondor events (0-12) except for 8 (needs no handling) --- .../log_analyzer/event_handler/job_events.py | 202 +++++++++++++++--- 1 file changed, 170 insertions(+), 32 deletions(-) diff --git a/htcanalyze/log_analyzer/event_handler/job_events.py b/htcanalyze/log_analyzer/event_handler/job_events.py index 9d5d32f..0c8083a 100755 --- a/htcanalyze/log_analyzer/event_handler/job_events.py +++ b/htcanalyze/log_analyzer/event_handler/job_events.py @@ -1,4 +1,5 @@ """Module with wrapper classes for HTCondor Job Events.""" +from abc import ABC from datetime import datetime as date_time from htcanalyze import ReprObject @@ -38,11 +39,14 @@ def __repr__(self): return str(self) -class JobEvent(ReprObject): +class JobEvent(ReprObject, ABC): """ - Job event represents a HTCondor JobEvent. + Abstract class to wrap each HTCondor JobEvent. - Each event hat an event number and a time stamp + Each event has an event number and a time stamp + The event number is defined by HTCondor but kept dynamic here to + avoid errors if the setup changes. + https://htcondor.readthedocs.io/en/latest/codes-other-values :param event_number: int HTCondor event number @@ -59,10 +63,39 @@ def __init__( self.time_stamp = DateTimeWrapper(time_stamp) if time_stamp else None +class ErrorEvent(JobEvent, ABC): + """ + Abstract class to classify error events. + + :param event_number: + :param time_stamp: + :param error_state: + :param reason: + """ + + def __init__( + self, + event_number, + time_stamp, + error_state: ErrorState, + reason: str + ): + super().__init__(event_number, time_stamp) + assert isinstance(error_state, ErrorState) + self.error_state = error_state + self.reason = reason + + class JobSubmissionEvent(JobEvent): """ Job submission event. 
+ Event Number: 000 + Event Name: Job submitted + Event Description: This event occurs when a user submits a job. + It is the first event you will see for a job, + and it should only occur once. + :param event_number: :param time_stamp: :param submitter_address: @@ -89,6 +122,11 @@ class JobExecutionEvent(JobEvent): """ Job execution event. + Event Number: 001 + Event Name: Job executing + Event Description: This shows up when a job is running. + It might occur more than once. + :param event_number: :param time_stamp: :param host_address: @@ -110,35 +148,68 @@ def __init__( ) -class ImageSizeEvent(JobEvent): +class ExecutableErrorEvent(ErrorEvent): """ - Image size event. - Used for ram histograms. + Error in executable event. + + Event Number: 002 + Event Name: Error in executable + Event Description: The job could not be run because the executable was bad. :param event_number: :param time_stamp: - :param size_update: - :param memory_usage: - :param resident_set_size: + :param reason: """ def __init__( self, event_number, time_stamp, - size_update=None, - memory_usage=None, - resident_set_size=None + reason ): - super().__init__(event_number, time_stamp) - self.size_update = size_update - self.memory_usage = memory_usage - self.resident_set_size = resident_set_size + super().__init__( + event_number, + time_stamp, + ExecutableErrorState(), + reason + ) -class JobTerminationEvent(JobEvent): +class JobCheckpointedEvent(JobEvent): + """ + Job checkpointed event. + + Event Number: 003 + Event Name: Job was checkpointed + Event Description: The job’s complete state was written to a checkpoint + file. This might happen without the job being removed from a machine, + because the checkpointing can happen periodically. + + """ + # Todo: figure out data load + + +class JobEvictedEvent(JobEvent): + """ + Job evicted event. 
+ + Event Number: 004 + Event Name: Job evicted from machine + Event Description: A job was removed from a machine before it finished, + usually for a policy reason. Perhaps an interactive user has + claimed the computer, or perhaps another job is higher priority. + + """ + # Todo: figure out data load + + +class JobTerminationEvent(JobEvent, ABC): """ Job termination event. + Event Number: 005 + Event Name: Job terminated + Event Description: The job has completed. + :param event_number: :param time_stamp: :param resources: @@ -161,13 +232,51 @@ def __init__( self.return_value = return_value -class ErrorEvent(JobEvent): +class ImageSizeEvent(JobEvent): + """ + Image size event. + Used for ram histograms. + + Event Number: 006 + Event Name: Image size of job updated + Event Description: An informational event, to update the amount + of memory that the job is using while running. + It does not reflect the state of the job. + + :param event_number: + :param time_stamp: + :param size_update: + :param memory_usage: + :param resident_set_size: + """ + + def __init__( + self, + event_number, + time_stamp, + size_update=None, + memory_usage=None, + resident_set_size=None + ): + super().__init__(event_number, time_stamp) + self.size_update = size_update + self.memory_usage = memory_usage + self.resident_set_size = resident_set_size + + +class ShadowExceptionEvent(ErrorEvent): """ - Error event. + Shadow exception event. + + Event Number: 007 + Event Name: Shadow exception + Event Description: The condor_shadow, a program on the submit computer + that watches over the job and performs some services for the job, + failed for some catastrophic reason. The job will leave the machine + and go back into the queue. 
:param event_number: :param time_stamp: - :param error_state: :param reason: """ @@ -175,19 +284,24 @@ def __init__( self, event_number, time_stamp, - error_state: ErrorState, - reason: str + reason ): - super().__init__(event_number, time_stamp) - assert isinstance(error_state, ErrorState) - self.error_state = error_state - self.reason = reason + super().__init__( + event_number, + time_stamp, + ShadowExceptionState(), + reason + ) class JobAbortedEvent(ErrorEvent, JobTerminationEvent): """ Job aborted event. + Event Number: 009 + Event Name: Job aborted + Event Description: The user canceled the job. + :param event_number: :param time_stamp: :param reason: @@ -216,9 +330,16 @@ class JobAbortedBeforeExecutionEvent(JobAbortedEvent): """Job was aborted before execution event.""" -class JobHeldEvent(ErrorEvent): +class JobSuspendedEvent(ErrorEvent): """ - Job held event. + Job suspended event. + + Event Number: 010 + Event Name: Job was suspended + Event Description: The job is still on the computer, + but it is no longer executing. + This is usually for a policy reason, + such as an interactive user using the computer. :param event_number: :param time_stamp: @@ -234,14 +355,31 @@ def __init__( super().__init__( event_number, time_stamp, - JobHeldState(), + JobSuspendedState(), reason ) -class ShadowExceptionEvent(ErrorEvent): +class JobUnsuspendedEvent(JobEvent): """ - Shadow exception event. + Job unsuspended event. + + Event Number: 011 + Event Name: Job was unsuspended + Event Description: The job has resumed execution, + after being suspended earlier. + """ + + +class JobHeldEvent(ErrorEvent): + """ + Job held event. + + Event Number: 012 + Event Name: Job was held + Event Description: The job has transitioned to the hold state. + This might happen if the user applies + the condor_hold command to the job. 
:param event_number: :param time_stamp: @@ -257,6 +395,6 @@ def __init__( super().__init__( event_number, time_stamp, - ShadowExceptionState(), + JobHeldState(), reason ) From 1003ebbe84aff0c63eab41c3d66546414e0fc1e1 Mon Sep 17 00:00:00 2001 From: "Loevenich, Mathis" Date: Wed, 17 Nov 2021 17:04:15 +0100 Subject: [PATCH 2/6] convert: ram histogram plot from KB to MB --- htcanalyze/log_analyzer/condor_log/ram_history.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/htcanalyze/log_analyzer/condor_log/ram_history.py b/htcanalyze/log_analyzer/condor_log/ram_history.py index 8c2cdfd..809a8fb 100755 --- a/htcanalyze/log_analyzer/condor_log/ram_history.py +++ b/htcanalyze/log_analyzer/condor_log/ram_history.py @@ -35,7 +35,10 @@ def plot_ram(self, show_legend=False) -> str: :param show_legend: Shows a legend :return: str """ - ram = [ram.size_update for ram in self.image_size_events] + ram = [ + ram.size_update/1000 # convert to MB + for ram in self.image_size_events + ] dates = [ram.time_stamp for ram in self.image_size_events] if len(ram) == 0: return "" # No memory usage detected @@ -44,7 +47,7 @@ def plot_ram(self, show_legend=False) -> str: return str( f"Single memory update found:\n" f"Memory usage on the {dates[0]} " - f"was updatet to {ram[0]} MB\n" + f"was updated to {ram[0]} MB\n" ) # else @@ -54,7 +57,7 @@ def plot_ram(self, show_legend=False) -> str: fig.set_x_limits(min_=min(dates)) min_ram = int(min(ram)) # raises error if not casted fig.set_y_limits(min_=min_ram) - fig.y_label = "Usage (KB)" + fig.y_label = "Usage [MB]" fig.x_label = "Time" # this will use the self written function _ From 0626c7e1561d183c2b682e9be45a7e43b6985e76 Mon Sep 17 00:00:00 2001 From: "Loevenich, Mathis" Date: Wed, 17 Nov 2021 17:06:47 +0100 Subject: [PATCH 3/6] restructure: HTCondor JobEvent Wrapper --- .../event_handler/event_handler.py | 51 +++++++++++++++---- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git 
a/htcanalyze/log_analyzer/event_handler/event_handler.py b/htcanalyze/log_analyzer/event_handler/event_handler.py index 417c450..4c3e4e3 100755 --- a/htcanalyze/log_analyzer/event_handler/event_handler.py +++ b/htcanalyze/log_analyzer/event_handler/event_handler.py @@ -51,17 +51,48 @@ class ReadLogException(Exception): """Can't read log file exception.""" -class HTCJobEventWrapper(HTCJobEvent): - """Wrapper for HTCJobEvent.""" - - def __new__(cls, job_event: HTCJobEvent): - new = job_event - new.event_number = job_event.get('EventTypeNumber') - new.time_stamp = date_time.strptime( - job_event.get('EventTime'), - STRP_FORMAT +class HTCJobEventWrapper: + """ + Wrapper for HTCondor JobEvent. + + Extracts event number and time_stamp of an event. + The wrapped event can be printed to the terminal for dev purpose. + + :param job_event: HTCJobEvent + """ + + def __init__(self, job_event: HTCJobEvent): + + self.wrapped_class = job_event + self.event_number = job_event.get('EventTypeNumber') + self.time_stamp = date_time.strptime( + job_event.get('EventTime'), + STRP_FORMAT + ) + + def __getattr__(self, attr): + return getattr(self.wrapped_class, attr) + + def get(self, *args, **kwargs): + return self.wrapped_class.get(*args, **kwargs) + + def items(self): + return self.wrapped_class.items() + + def keys(self): + return self.wrapped_class.keys() + + def values(self): + return self.wrapped_class.values() + + def to_dict(self): + return {key: val for key, val in self.items()} + + def __repr__(self): + return json.dumps( + self.to_dict(), + indent=2 ) - return new class EventHandler: From 035943b022b7b81477bf2d6df4cfe9c5637bf92d Mon Sep 17 00:00:00 2001 From: "Loevenich, Mathis" Date: Wed, 17 Nov 2021 18:05:14 +0100 Subject: [PATCH 4/6] add: handling for JOB_DISCONNECTED, JOB_RECONNECT_FAILED and JOB_EVICTED --- .../event_handler/event_handler.py | 236 +++++++++++------- .../log_analyzer/event_handler/job_events.py | 133 +++++++++- .../log_analyzer/event_handler/states.py | 53 
+++- 3 files changed, 316 insertions(+), 106 deletions(-) diff --git a/htcanalyze/log_analyzer/event_handler/event_handler.py b/htcanalyze/log_analyzer/event_handler/event_handler.py index 4c3e4e3..c3e0119 100755 --- a/htcanalyze/log_analyzer/event_handler/event_handler.py +++ b/htcanalyze/log_analyzer/event_handler/event_handler.py @@ -3,6 +3,7 @@ import os import re import logging +import json from typing import List, Union from datetime import datetime as date_time @@ -30,13 +31,19 @@ ErrorEvent, JobSubmissionEvent, JobExecutionEvent, + JobEvictedEvent, JobTerminationEvent, + NormalTerminationEvent, + AbnormalTerminationEvent, JobAbortedEvent, JobAbortedBeforeExecutionEvent, JobAbortedBeforeSubmissionEvent, JobHeldEvent, ImageSizeEvent, - ShadowExceptionEvent + ShadowExceptionEvent, + JobDisconnectedEvent, + JobReconnectedEvent, + JobReconnectFailedEvent ) from ..condor_log.logresource import ( LogResources, @@ -101,6 +108,11 @@ class EventHandler: def __init__(self): self._state: Union[JobState, None] = None + @property + def state(self) -> JobState: + """Returns current job state.""" + return self._state + def get_submission_event( self, event: HTCJobEventWrapper @@ -116,7 +128,7 @@ def get_submission_event( match_from_host = re.match( r"<(.+):[0-9]+\?(.*)>", - event.get('SubmitHost') + event.get("SubmitHost") ) if match_from_host: submitted_host = match_from_host[1] @@ -136,11 +148,6 @@ def get_submission_event( reason ) - @property - def state(self) -> JobState: - """Returns current job state.""" - return self._state - def get_execution_event( self, event: HTCJobEventWrapper, @@ -214,21 +221,65 @@ def get_job_terminated_event( # differentiate between normal and abnormal termination if normal_termination: - state = NormalTerminationState() - + self._state = NormalTerminationState() return_value = event.get('ReturnValue') + return NormalTerminationEvent( + event.event_number, + event.time_stamp, + resources, + return_value + ) else: - state = 
AbnormalTerminationState() return_value = event.get('TerminatedBySignal') + self._state = AbnormalTerminationState() + return AbnormalTerminationEvent( + event.event_number, + event.time_stamp, + resources, + return_value + ) # Todo: include description when possible - self._state = state - return JobTerminationEvent( + @staticmethod + def get_job_evicted_event( + event: HTCJobEventWrapper + ) -> JobEvictedEvent: + """Reads and returns a JobEvictedEvent.""" + assert event.type == jet.JOB_EVICTED + # Todo: figure out the data load of + # TerminatedNormally and TerminatedAndRequed + return JobEvictedEvent( + event.event_number, + event.time_stamp, + checkpointed=event.get("Checkpointed") + ) + + @staticmethod + def get_image_size_event(event: HTCJobEventWrapper) -> ImageSizeEvent: + """Reads and returns a ImageSizeEvent.""" + assert event.type == jet.IMAGE_SIZE + size_update = event.get('Size') + memory_usage = event.get('MemoryUsage') + resident_set_size = event.get('ResidentSetSize') + return ImageSizeEvent( event.event_number, event.time_stamp, - resources, - state, - return_value + size_update, + memory_usage, + resident_set_size + ) + + @staticmethod + def get_shadow_exception_event( + event: HTCJobEventWrapper + ) -> ShadowExceptionEvent: + """Reads and returns a ShadowExceptionEvent.""" + assert event.type == jet.SHADOW_EXCEPTION + reason = event.get('Message') + return ShadowExceptionEvent( + event.event_number, + event.time_stamp, + reason ) def get_job_aborted_event( @@ -260,44 +311,106 @@ def get_job_aborted_event( return aborted_event @staticmethod - def get_image_size_event(event: HTCJobEventWrapper) -> ImageSizeEvent: - """Reads and returns a ImageSizeEvent.""" - assert event.type == jet.IMAGE_SIZE - size_update = event.get('Size') - memory_usage = event.get('MemoryUsage') - resident_set_size = event.get('ResidentSetSize') - return ImageSizeEvent( + def get_job_held_event(event: HTCJobEventWrapper) -> JobHeldEvent: + """Reads and returns a 
JobHeldEvent.""" + assert event.type == jet.JOB_HELD + message = event.get('HoldReason') + return JobHeldEvent( event.event_number, event.time_stamp, - size_update, - memory_usage, - resident_set_size + message ) @staticmethod - def get_job_held_event(event: HTCJobEventWrapper) -> JobHeldEvent: - """Reads and returns a JobHeldEvent.""" - assert event.type == jet.JOB_HELD - reason = event.get('HoldReason') - return JobHeldEvent( + def get_job_disconnected_event( + event: HTCJobEventWrapper + ) -> JobDisconnectedEvent: + assert event.type == jet.JOB_DISCONNECTED + reason = f"{event.get('DisconnectReason')}" + return JobDisconnectedEvent( event.event_number, event.time_stamp, reason ) @staticmethod - def get_shadow_exception_event( + def get_job_reconnected_event( event: HTCJobEventWrapper - ) -> ShadowExceptionEvent: - """Reads and returns a ShadowExceptionEvent.""" - assert event.type == jet.SHADOW_EXCEPTION - reason = event.get('Message') - return ShadowExceptionEvent( + ) -> JobReconnectedEvent: + assert event.type == jet.JOB_RECONNECTED + reason = f"{event.get('Reason')}" + return JobReconnectedEvent( event.event_number, event.time_stamp, reason ) + @staticmethod + def get_job_reconnect_failed_event( + event: HTCJobEventWrapper + ) -> JobReconnectFailedEvent: + assert event.type == jet.JOB_RECONNECT_FAILED + reason = f"{event.get('Reason')}" + return JobReconnectFailedEvent( + event.event_number, + event.time_stamp, + reason + ) + + def get_job_event( + self, + event: HTCJobEvent, + rdns_lookup: bool = False + ) -> JobEvent: + """ + Takes a HTCondor job event and returns an own wrapped JobEvent class. + + :param event: HTCJobEvent + A job event from the HTCondor python bindings. 
+ :param rdns_lookup: bool + Whether to reversely resolve host addresses by domain name + :return: JobEvent + Wrapped JobEvent class with own properties + """ + wrapped_job_event = HTCJobEventWrapper(event) + if wrapped_job_event.type == jet.SUBMIT: + return self.get_submission_event(wrapped_job_event) + + if wrapped_job_event.type == jet.EXECUTE: + return self.get_execution_event( + wrapped_job_event, + rdns_lookup=rdns_lookup + ) + + if wrapped_job_event.type == jet.JOB_EVICTED: + return self.get_job_evicted_event(wrapped_job_event) + + if wrapped_job_event.type == jet.JOB_TERMINATED: + return self.get_job_terminated_event(wrapped_job_event) + + if wrapped_job_event.type == jet.IMAGE_SIZE: + return self.get_image_size_event(wrapped_job_event) + + if wrapped_job_event.type == jet.SHADOW_EXCEPTION: + return self.get_shadow_exception_event(wrapped_job_event) + + if wrapped_job_event.type == jet.JOB_ABORTED: + return self.get_job_aborted_event(wrapped_job_event) + + if wrapped_job_event.type == jet.JOB_HELD: + return self.get_job_held_event(wrapped_job_event) + + if wrapped_job_event.type == jet.JOB_DISCONNECTED: + return self.get_job_disconnected_event(wrapped_job_event) + + if wrapped_job_event.type == jet.JOB_RECONNECT_FAILED: + return self.get_job_reconnect_failed_event(wrapped_job_event) + + # else: + raise AttributeError( + f"Event type: {wrapped_job_event.type} not handled yet" + ) + def get_htc_events( self, file: str, @@ -330,54 +443,3 @@ def get_htc_events( self._state = ErrorWhileReadingState() raise ReadLogException(reason) from err - - def get_job_event( - self, - event: HTCJobEvent, - rdns_lookup: bool = False - ) -> JobEvent: - """ - Takes a HTCondor job event and returns an own wrapped JobEvent class. - - :param event: HTCJobEvent - A job event from the HTCondor python bindings. 
- :param rdns_lookup: bool - Whether to reversely resolve host addresses by domain name - :return: JobEvent - Wrapped JobEvent class with own properties - """ - wrapped_job_event = HTCJobEventWrapper(event) - if wrapped_job_event.type == jet.SUBMIT: - job_event = self.get_submission_event(wrapped_job_event) - - elif wrapped_job_event.type == jet.EXECUTE: - job_event = self.get_execution_event( - wrapped_job_event, - rdns_lookup=rdns_lookup - ) - - elif wrapped_job_event.type == jet.IMAGE_SIZE: - job_event = self.get_image_size_event(wrapped_job_event) - - # update resource dict and termination date - elif wrapped_job_event.type == jet.JOB_TERMINATED: - job_event = self.get_job_terminated_event(wrapped_job_event) - - # update error dict and termination date - elif wrapped_job_event.type == jet.JOB_ABORTED: - job_event = self.get_job_aborted_event(wrapped_job_event) - - # update error dict - elif wrapped_job_event.type == jet.JOB_HELD: - job_event = self.get_job_held_event(wrapped_job_event) - - # update error dict - elif wrapped_job_event.type == jet.SHADOW_EXCEPTION: - job_event = self.get_shadow_exception_event(wrapped_job_event) - - else: - raise AttributeError( - f"Event type: {wrapped_job_event.type} not handled yet" - ) - - return job_event diff --git a/htcanalyze/log_analyzer/event_handler/job_events.py b/htcanalyze/log_analyzer/event_handler/job_events.py index 0c8083a..ff0210c 100755 --- a/htcanalyze/log_analyzer/event_handler/job_events.py +++ b/htcanalyze/log_analyzer/event_handler/job_events.py @@ -6,10 +6,17 @@ from .node_cache import NodeCache from .states import ( TerminationState, + NormalTerminationState, + AbnormalTerminationState, ErrorState, AbortedState, ShadowExceptionState, - JobHeldState + JobSuspendedState, + JobHeldState, + JobEvictedState, + ExecutableErrorState, + JobReconnectFailedState, + JobDisconnectedState ) @@ -78,7 +85,7 @@ def __init__( event_number, time_stamp, error_state: ErrorState, - reason: str + reason: str = None ): 
super().__init__(event_number, time_stamp) assert isinstance(error_state, ErrorState) @@ -188,7 +195,7 @@ class JobCheckpointedEvent(JobEvent): # Todo: figure out data load -class JobEvictedEvent(JobEvent): +class JobEvictedEvent(ErrorEvent): """ Job evicted event. @@ -199,7 +206,29 @@ class JobEvictedEvent(JobEvent): claimed the computer, or perhaps another job is higher priority. """ - # Todo: figure out data load + def __init__( + self, + event_number, + time_stamp, + checkpointed=False, + ): + if checkpointed: + message = ( + "Job evicted with checkpoint, " + "continue process on last checkpoint" + ) + else: + message = ( + "Job evicted, progress is lost, " + "job goes back into the queue" + ) + + super().__init__( + event_number, + time_stamp, + JobEvictedState(), + message + ) class JobTerminationEvent(JobEvent, ABC): @@ -222,14 +251,35 @@ def __init__( event_number=None, time_stamp=None, resources=None, - termination_state: TerminationState = None, - return_value: int = None - + return_value: int = None, + termination_state: TerminationState = None ): super().__init__(event_number, time_stamp) self.resources = resources - self.termination_state = termination_state self.return_value = return_value + self.termination_state = termination_state + + +class NormalTerminationEvent(JobTerminationEvent): + """Normal Termination Event.""" + + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + termination_state=NormalTerminationState() + ) + + +class AbnormalTerminationEvent(JobTerminationEvent): + """Abnormal Termination Event.""" + + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + termination_state=AbnormalTerminationState() + ) class ImageSizeEvent(JobEvent): @@ -398,3 +448,70 @@ def __init__( JobHeldState(), reason ) + + +class JobDisconnectedEvent(ErrorEvent): + """ + Job disconnected event. 
+ + Event Number: 022 + Event Name: Remote system call socket lost + Event Description: The condor_shadow and condor_starter + (which communicate while the job runs) have lost contact. + + """ + + def __init__( + self, + event_number, + time_stamp, + reason + ): + super().__init__( + event_number, + time_stamp, + JobDisconnectedState(), + reason + ) + + +class JobReconnectedEvent(JobEvent): + """ + Job reconnected event. + + Event Number: 023 + Event Name: Remote system call socket reestablished + Event Description: The condor_shadow and condor_starter + (which communicate while the job runs) have been able + to resume contact before the job lease expired. + + """ + + +class JobReconnectFailedEvent(ErrorEvent): + """ + Job reconnect failed event. + + Event Number: 024 + Event Name: Remote system call reconnect failure + Event Description: The condor_shadow and condor_starter + (which communicate while the job runs) were unable to resume + contact before the job lease expired. + + :param event_number: + :param time_stamp: + :param reason: + """ + + def __init__( + self, + event_number, + time_stamp, + reason + ): + super().__init__( + event_number, + time_stamp, + JobReconnectFailedState(), + reason + ) diff --git a/htcanalyze/log_analyzer/event_handler/states.py b/htcanalyze/log_analyzer/event_handler/states.py index 9bde06f..76ece7b 100755 --- a/htcanalyze/log_analyzer/event_handler/states.py +++ b/htcanalyze/log_analyzer/event_handler/states.py @@ -86,39 +86,36 @@ def __init__(self): class ErrorState(State, ABC): """Represents an error state.""" - def __init__(self): + def __init__(self, name=None): + self.name = name self.color = "red" class ErrorWhileReadingState(JobState, ErrorState): """Represents an error while reading state.""" def __init__(self): - super().__init__() - self.name = "ERROR_WHILE_READING" + super().__init__("ERROR_WHILE_READING") class InvalidHostAddressState(ErrorState): """Represents an invalid host address state.""" def __init__(self): - 
super().__init__() - self.name = "INVALID_HOST_ADDRESS" + super().__init__("INVALID_HOST_ADDRESS") class InvalidUserAddressState(ErrorState): """Represents an invalid user address state.""" def __init__(self): - super().__init__() - self.name = "INVALID_USER_ADDRESS" + super().__init__("INVALID_USER_ADDRESS") class AbortedState(TerminationState, ErrorState): """Represents an abortion state.""" def __init__(self): - super().__init__() - self.name = "ABORTED" + super().__init__("ABORTED") class JobHeldState(ErrorState): @@ -133,5 +130,39 @@ class ShadowExceptionState(ErrorState): """Represents a shadow exception state.""" def __init__(self): - super().__init__() - self.name = "SHADOW_EXCEPTION" + super().__init__("SHADOW_EXCEPTION") + + +class JobSuspendedState(ErrorState): + """Represents a suspended job state.""" + + def __init__(self): + super().__init__("JOB_SUSPENDED") + + +class JobEvictedState(ErrorState): + """Represents an evicted job.""" + + def __init__(self): + super().__init__("JOB_EVICTED") + + +class ExecutableErrorState(ErrorState): + """Represents an executable error state.""" + + def __init__(self): + super().__init__("EXECUTABLE_ERROR") + + +class JobDisconnectedState(ErrorState): + """Represents a disconnected job state.""" + + def __init__(self): + super().__init__("JOB_DISCONNECTED") + + +class JobReconnectFailedState(ErrorState): + """Represents a failed job reconnect state.""" + + def __init__(self): + super().__init__("JOB_RECONNECT_FAILED") From 0d73e339455df19bd21c9f35540a6fe1427e9b34 Mon Sep 17 00:00:00 2001 From: "Loevenich, Mathis" Date: Wed, 17 Nov 2021 18:28:54 +0100 Subject: [PATCH 5/6] add: base class for JOB_RELEASED --- htcanalyze/log_analyzer/event_handler/job_events.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/htcanalyze/log_analyzer/event_handler/job_events.py b/htcanalyze/log_analyzer/event_handler/job_events.py index ff0210c..f16f043 100755 --- a/htcanalyze/log_analyzer/event_handler/job_events.py +++ 
b/htcanalyze/log_analyzer/event_handler/job_events.py @@ -450,6 +450,17 @@ def __init__( ) +class JobReleasedEvent(JobEvent): + """ + Job was released after being held event. + + Event Number: 013 + Event Name: Job was released + Event Description: The job was in the hold state and is to be re-run. + + """ + + class JobDisconnectedEvent(ErrorEvent): """ Job disconnected event. From e034c16e91c7275156d1e52957f4e14870630e2a Mon Sep 17 00:00:00 2001 From: "Loevenich, Mathis" Date: Wed, 17 Nov 2021 18:38:07 +0100 Subject: [PATCH 6/6] rebase with pylint --- .../event_handler/event_handler.py | 37 ++++++++++--------- .../log_analyzer/event_handler/job_events.py | 2 +- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/htcanalyze/log_analyzer/event_handler/event_handler.py b/htcanalyze/log_analyzer/event_handler/event_handler.py index c3e0119..fd6a0df 100755 --- a/htcanalyze/log_analyzer/event_handler/event_handler.py +++ b/htcanalyze/log_analyzer/event_handler/event_handler.py @@ -69,7 +69,6 @@ class HTCJobEventWrapper: """ def __init__(self, job_event: HTCJobEvent): - self.wrapped_class = job_event self.event_number = job_event.get('EventTypeNumber') self.time_stamp = date_time.strptime( @@ -81,19 +80,24 @@ def __getattr__(self, attr): return getattr(self.wrapped_class, attr) def get(self, *args, **kwargs): + """Wraps wrapped_class get function.""" return self.wrapped_class.get(*args, **kwargs) def items(self): + """Wraps wrapped_class items method.""" return self.wrapped_class.items() def keys(self): + """Wraps wrapped_class keys method.""" return self.wrapped_class.keys() def values(self): + """Wraps wrapped_class values method.""" return self.wrapped_class.values() def to_dict(self): - return {key: val for key, val in self.items()} + """Turns wrapped_class items into a dictionary.""" + return dict(self.items()) def __repr__(self): return json.dumps( @@ -217,10 +221,8 @@ def get_job_terminated_event( ) ) - normal_termination = 
event.get('TerminatedNormally') - # differentiate between normal and abnormal termination - if normal_termination: + if event.get('TerminatedNormally'): self._state = NormalTerminationState() return_value = event.get('ReturnValue') return NormalTerminationEvent( @@ -229,16 +231,16 @@ def get_job_terminated_event( resources, return_value ) - else: - return_value = event.get('TerminatedBySignal') - self._state = AbnormalTerminationState() - return AbnormalTerminationEvent( - event.event_number, - event.time_stamp, - resources, - return_value - ) - # Todo: include description when possible + # else: + return_value = event.get('TerminatedBySignal') + self._state = AbnormalTerminationState() + return AbnormalTerminationEvent( + event.event_number, + event.time_stamp, + resources, + return_value + ) + # Todo: include description when possible @staticmethod def get_job_evicted_event( @@ -325,6 +327,7 @@ def get_job_held_event(event: HTCJobEventWrapper) -> JobHeldEvent: def get_job_disconnected_event( event: HTCJobEventWrapper ) -> JobDisconnectedEvent: + """Reads and returns a JobDisconnectedEvent.""" assert event.type == jet.JOB_DISCONNECTED reason = f"{event.get('DisconnectReason')}" return JobDisconnectedEvent( @@ -337,18 +340,18 @@ def get_job_disconnected_event( def get_job_reconnected_event( event: HTCJobEventWrapper ) -> JobReconnectedEvent: + """Reads and returns a JobReconnectedEvent.""" assert event.type == jet.JOB_RECONNECTED - reason = f"{event.get('Reason')}" return JobReconnectedEvent( event.event_number, event.time_stamp, - reason ) @staticmethod def get_job_reconnect_failed_event( event: HTCJobEventWrapper ) -> JobReconnectFailedEvent: + """Reads and returns a JobReconnectFailedEvent.""" assert event.type == jet.JOB_RECONNECT_FAILED reason = f"{event.get('Reason')}" return JobReconnectFailedEvent( diff --git a/htcanalyze/log_analyzer/event_handler/job_events.py b/htcanalyze/log_analyzer/event_handler/job_events.py index f16f043..4bfa238 100755 --- 
a/htcanalyze/log_analyzer/event_handler/job_events.py +++ b/htcanalyze/log_analyzer/event_handler/job_events.py @@ -187,7 +187,7 @@ class JobCheckpointedEvent(JobEvent): Event Number: 003 Event Name: Job was checkpointed - Event Description: The job’s complete state was written to a checkpoint + Event Description: The job’s complete state was written to a checkpoint file. This might happen without the job being removed from a machine, because the checkpointing can happen periodically.