Skip to content

Commit 2909f60

Browse files
[POA-3109] Handle panic while stopping apidump process while daemonset exit (#90)
This pull request includes changes to prevent the agent from going into a panic state while closing the apidump process during the daemonset process shutdown and other changes. **Problem:** * Before changing the state of the apidump process to the final state and then calling the `StopApiDumpProcess`, we are checking if the process is already in the final state to avoid `StopApiDumpProcess` being called multiple times. * But this final state check is not sync, due to which dirty read issue can happen and `StopApiDumpProcess` can be called multiple times which will send signal to already stopped channel hence causing program to go to panic state. **Changes:** Improvements to state handling: * Added a check in `changePodTrafficMonitorState` to prevent state changes if the pod is already in a final state. * Ensure the stop channel is closed only after the API dump process is stopped/exited. * Removed redundant checks for end states in various methods, as these are now handled by the new state check in `changePodTrafficMonitorState`. Other naming changes: * Renamed `StopApiDumpProcess` to `SignalApiDumpProcessToStop` for better clarity on its purpose. * Renamed `isEndState` to `isTrafficMonitoringInFinalState` for better readability and consistency.
1 parent 09e4f72 commit 2909f60

File tree

5 files changed

+14
-23
lines changed

5 files changed

+14
-23
lines changed

cmd/internal/kube/daemonset/apidump_process.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ func (d *Daemonset) StartApiDumpProcess(podUID types.UID) error {
3737
// Decrement the wait group counter
3838
d.ApidumpProcessesWG.Done()
3939

40+
// Close the stop channel once the apidump process is exited
41+
close(podArgs.StopChan)
42+
4043
nextState := TrafficMonitoringEnded
4144

4245
if err := recover(); err != nil {
@@ -100,18 +103,17 @@ func (d *Daemonset) StartApiDumpProcess(podUID types.UID) error {
100103
return nil
101104
}
102105

103-
// StopApiDumpProcess signals the API dump process to stop for a given pod
106+
// SignalApiDumpProcessToStop signals the API dump process to stop for a given pod
104107
// identified by its UID. It retrieves the process's stop channel object from a map
105108
// and sends a stop signal to trigger apidump shutdown.
106-
func (d *Daemonset) StopApiDumpProcess(podUID types.UID, stopErr error) error {
109+
func (d *Daemonset) SignalApiDumpProcessToStop(podUID types.UID, stopErr error) error {
107110
podArgs, err := d.getPodArgsFromMap(podUID)
108111
if err != nil {
109112
return err
110113
}
111114

112115
printer.Infof("Stopping API dump process for pod %s\n", podArgs.PodName)
113116
podArgs.StopChan <- stopErr
114-
close(podArgs.StopChan)
115117

116118
return nil
117119
}

cmd/internal/kube/daemonset/daemonset.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -343,11 +343,6 @@ func (d *Daemonset) StopAllApiDumpProcesses() {
343343
podUID := k.(types.UID)
344344
podArgs := v.(*PodArgs)
345345

346-
if podArgs.isEndState() {
347-
printer.Debugf("API dump process for pod %s already stopped, state: %s\n", podArgs.PodName, podArgs.PodTrafficMonitorState)
348-
return true
349-
}
350-
351346
// Since this state can happen at any time so no check for allowed current states
352347
err := podArgs.changePodTrafficMonitorState(DaemonSetShutdown)
353348
if err != nil {
@@ -356,7 +351,7 @@ func (d *Daemonset) StopAllApiDumpProcesses() {
356351
return true
357352
}
358353

359-
err = d.StopApiDumpProcess(podUID, errors.Errorf("Daemonset agent is shutting down, stopping pod: %s", podArgs.PodName))
354+
err = d.SignalApiDumpProcessToStop(podUID, errors.Errorf("Daemonset agent is shutting down, stopping pod: %s", podArgs.PodName))
360355
if err != nil {
361356
printer.Errorf("Failed to stop api dump process, pod name: %s, error: %v\n", podArgs.PodName, err)
362357
}

cmd/internal/kube/daemonset/kube_events_worker.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,6 @@ func (d *Daemonset) handlePodDeleteEvent(pod coreV1.Pod) {
8181
return
8282
}
8383

84-
if podArgs.isEndState() {
85-
printer.Debugf("Pod %s already stopped monitoring, state: %s\n", podArgs.PodName, podArgs.PodTrafficMonitorState)
86-
return
87-
}
88-
8984
var podStatus PodTrafficMonitorState
9085
switch pod.Status.Phase {
9186
case coreV1.PodSucceeded:
@@ -104,7 +99,7 @@ func (d *Daemonset) handlePodDeleteEvent(pod coreV1.Pod) {
10499
return
105100
}
106101

107-
err = d.StopApiDumpProcess(pod.UID, errors.Errorf("got pod delete event, pod: %s", podArgs.PodName))
102+
err = d.SignalApiDumpProcessToStop(pod.UID, errors.Errorf("got pod delete event, pod: %s", podArgs.PodName))
108103
if err != nil {
109104
printer.Errorf("Failed to stop api dump process, pod name: %s, error: %v\n", podArgs.PodName, err)
110105
}

cmd/internal/kube/daemonset/pod_args.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ func (p *PodArgs) changePodTrafficMonitorState(
8383
p.StateChangeMutex.Lock()
8484
defer p.StateChangeMutex.Unlock()
8585

86+
if isTrafficMonitoringInFinalState(p) {
87+
return errors.New(fmt.Sprintf("API dump process for pod %s already in final state, state: %s\n", p.PodName, p.PodTrafficMonitorState))
88+
}
89+
8690
// Check if the current state is allowed for the transition
8791
// If the allowedCurrentStates is empty, then any state is allowed
8892
if len(allowedCurrentStates) != 0 && !slices.Contains(allowedCurrentStates, p.PodTrafficMonitorState) {
@@ -97,8 +101,8 @@ func (p *PodArgs) changePodTrafficMonitorState(
97101
return nil
98102
}
99103

100-
// isEndState checks if the pod traffic monitor state is in the final state.
101-
func (p *PodArgs) isEndState() bool {
104+
// isTrafficMonitoringInFinalState checks if the pod traffic monitor state is in the final state.
105+
func isTrafficMonitoringInFinalState(p *PodArgs) bool {
102106
switch p.PodTrafficMonitorState {
103107
case TrafficMonitoringEnded, TrafficMonitoringFailed, RemovePodFromMap:
104108
return true

cmd/internal/kube/daemonset/pods_healthcheck_worker.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,6 @@ func (d *Daemonset) handleTerminatedPod(podUID types.UID, podStatusErr error, po
6060
return
6161
}
6262

63-
if podArgs.isEndState() {
64-
printer.Debugf("Pod %s already stopped monitoring, state: %s\n", podArgs.PodName, podArgs.PodTrafficMonitorState)
65-
return
66-
}
67-
6863
// If pod doesn't exists anymore, we don't need to check the pod status
6964
// We can directly change the state to PodTerminated
7065
if podDoesNotExists {
@@ -78,7 +73,7 @@ func (d *Daemonset) handleTerminatedPod(podUID types.UID, podStatusErr error, po
7873
return
7974
}
8075

81-
err = d.StopApiDumpProcess(podUID, podStatusErr)
76+
err = d.SignalApiDumpProcessToStop(podUID, podStatusErr)
8277
if err != nil {
8378
printer.Errorf("Failed to stop api dump process, pod name: %s, error: %v\n", podArgs.PodName, err)
8479
}

0 commit comments

Comments
 (0)