Skip to content

Commit f9f54ee

Browse files
[POA-2928] Cover corner case of pod not coming up at all (#87)
I observed some corner cases in the beta environment where some services move directly to the final state without first going to the running state. This can happen when there are config issues such as `CreateContainerConfigError`. For example, here we got a Pod Delete event where the pod's status is still Pending.
```
[ERROR] Pod status is in unknown state, pod name: cronjob-scheduled-work-29008098-8mgxh, status: Pending
[ERROR] Failed to change pod state, pod name: cronjob-scheduled-work-29008098-8mgxh, from: PodPending to: PodTerminated, error: Invalid current state for pod cronjob-scheduled-work-29008098-8mgxh: PodPending
```
For such corner cases, I have added an if condition: if a pod doesn't exist anymore, its state is moved to `PodTerminated` irrespective of the previous state. This then stops the apidump process (if it is still active) and removes the pod from the map, which clears out the unnecessary garbage data in the maps.
1 parent 432c1d0 commit f9f54ee

File tree

2 files changed

+11
-5
lines changed

2 files changed

+11
-5
lines changed

cmd/internal/kube/daemonset/pods_healthcheck_worker.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,13 @@ func (d *Daemonset) checkPodsHealth() {
3737
podStatus, ok := podStatuses[podUID]
3838
if !ok {
3939
printer.Infof("Pod status not found for podUID %s, Pod doesn't exists anymore\n", podUID)
40-
d.handleTerminatedPod(podUID, errors.Errorf("pod %s doesn't exists anymore", podUID))
40+
d.handleTerminatedPod(podUID, errors.Errorf("pod %s doesn't exists anymore", podUID), true)
4141
}
4242

4343
switch podStatus {
4444
case coreV1.PodSucceeded, coreV1.PodFailed:
4545
printer.Infof("Pod with UID %s has stopped running, status: %s\n", podUID, podStatus)
46-
d.handleTerminatedPod(podUID, errors.Errorf("pod %s has stopped running, status: %s", podUID, podStatus))
46+
d.handleTerminatedPod(podUID, errors.Errorf("pod %s has stopped running, status: %s", podUID, podStatus), false)
4747
case coreV1.PodRunning:
4848
printer.Debugf("Pod with UID %s, status:%s\n", podUID, podStatus)
4949
d.handleUnmonitoredPod(podUID)
@@ -53,7 +53,7 @@ func (d *Daemonset) checkPodsHealth() {
5353

5454
// handleTerminatedPod handles the terminated pod by changing the pod's traffic monitor state to PodTerminated
5555
// and stopping the API dump process for that pod.
56-
func (d *Daemonset) handleTerminatedPod(podUID types.UID, podStatusErr error) {
56+
func (d *Daemonset) handleTerminatedPod(podUID types.UID, podStatusErr error, podDoesNotExists bool) {
5757
podArgs, err := d.getPodArgsFromMap(podUID)
5858
if err != nil {
5959
printer.Infof("Failed to get podArgs for podUID %s: %v\n", podUID, err)
@@ -65,7 +65,13 @@ func (d *Daemonset) handleTerminatedPod(podUID types.UID, podStatusErr error) {
6565
return
6666
}
6767

68-
err = podArgs.changePodTrafficMonitorState(PodTerminated, TrafficMonitoringRunning)
68+
// If pod doesn't exists anymore, we don't need to check the pod status
69+
// We can directly change the state to PodTerminated
70+
if podDoesNotExists {
71+
err = podArgs.changePodTrafficMonitorState(PodTerminated)
72+
} else {
73+
err = podArgs.changePodTrafficMonitorState(PodTerminated, TrafficMonitoringRunning)
74+
}
6975
if err != nil {
7076
printer.Infof("Failed to change pod state, pod name: %s, from: %s to: %s, error: %v\n",
7177
podArgs.PodName, podArgs.PodTrafficMonitorState, PodTerminated, err)

cmd/internal/kube/daemonset/telemetry.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func (d *Daemonset) sendTelemetry() {
2727
// dumpPodsApiDumpProcessState logs the current state of active pods.
2828
// It prints a formatted table with the pod name, project ID, and current state for each pod.
2929
func (d *Daemonset) dumpPodsApiDumpProcessState() {
30-
logf := printer.Debugf
30+
logf := printer.Infof
3131

3232
const hrBr = "================================================================================" +
3333
"===========================================================================================\n"

0 commit comments

Comments
 (0)