Skip to content

Commit 515617d

Browse files
authored
Add a supervisor process to the agent (#112)
With this patch, the agent process acts as its own supervisor. This is controlled with the __X_AKITA_NO_FORK environment variable. If it is set to "1", the process won't fork itself and we keep the old behaviour. For all other values, the first thing the process does is fork and execute itself. The root process then monitors the child process and restarts it if necessary.
1 parent f80a1e3 commit 515617d

File tree

2 files changed

+178
-0
lines changed

2 files changed

+178
-0
lines changed

cmd/root.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,12 @@ func printFlagsWarning(cmd *cobra.Command) {
166166
}
167167

168168
func Execute() {
169+
err := runSupervisor()
170+
if err != nil {
171+
printer.Errorln(err)
172+
os.Exit(126)
173+
}
174+
169175
defer telemetry.Shutdown()
170176

171177
if cmd, err := rootCmd.ExecuteC(); err != nil {

cmd/supervisor.go

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
package cmd
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"os/signal"
7+
"strconv"
8+
"syscall"
9+
"time"
10+
11+
"github.com/postmanlabs/postman-insights-agent/printer"
12+
)
13+
14+
// numRuns counts how many times runChild has been invoked in this process.
// It starts at zero and is incremented at the top of every runChild call.
var numRuns uint64
15+
16+
func runChild(pwd string) (int, error) {
17+
numRuns += 1
18+
19+
args := os.Args
20+
env := os.Environ()
21+
22+
env = append(env, "__X_AKITA_CHILD=true")
23+
env = append(env, fmt.Sprintf("__X_AKITA_NUM_RUNS=%d", numRuns))
24+
25+
pid, err := syscall.ForkExec(args[0], args, &syscall.ProcAttr{
26+
Dir: pwd,
27+
Env: env,
28+
Sys: &syscall.SysProcAttr{
29+
Setsid: true,
30+
},
31+
Files: []uintptr{0, 1, 2},
32+
})
33+
if err != nil {
34+
return 0, err
35+
}
36+
37+
return pid, nil
38+
}
39+
40+
// collectStatus waits for the process identified by pid to exit and returns
// its final state. Errors from looking up the process or from waiting on it
// are returned as-is.
func collectStatus(pid int) (*os.ProcessState, error) {
	child, lookupErr := os.FindProcess(pid)
	if lookupErr != nil {
		return nil, lookupErr
	}

	state, waitErr := child.Wait()
	return state, waitErr
}
48+
49+
// runningInsideDocker reports whether the __X_AKITA_CLI_DOCKER environment
// variable holds a value that strconv.ParseBool accepts as true. An unset,
// empty or unparsable value yields false.
func runningInsideDocker() bool {
	raw := os.Getenv("__X_AKITA_CLI_DOCKER")

	enabled, err := strconv.ParseBool(raw)
	if err != nil {
		return false
	}
	return enabled
}
53+
54+
// isChildProcess reports whether the __X_AKITA_CHILD environment variable
// holds a value that strconv.ParseBool accepts as true. An unset, empty or
// unparsable value yields false.
func isChildProcess() bool {
	raw := os.Getenv("__X_AKITA_CHILD")

	isChild, err := strconv.ParseBool(raw)
	if err != nil {
		return false
	}
	return isChild
}
58+
59+
func runSupervisor() error {
60+
if !runningInsideDocker() {
61+
return nil
62+
}
63+
64+
if isChildProcess() {
65+
return nil
66+
}
67+
68+
maxRuns, err := strconv.ParseUint(os.Getenv("__X_AKITA_MAX_RUNS"), 10, 64)
69+
if err != nil {
70+
maxRuns = 0
71+
printer.Debugf("unable to parse __X_AKITA_MAX_RUNS, using default value of 0 (no restriction)\n")
72+
}
73+
74+
delay, err := strconv.ParseInt(os.Getenv("__X_AKITA_DELAY"), 10, 64)
75+
if err != nil {
76+
delay = 1
77+
printer.Debugf("unable to parse __X_AKITA_DELAY, using default value of 1\n")
78+
}
79+
80+
if delay <= 0 {
81+
delay = 1
82+
printer.Debugf("__X_AKITA_DELAY must be greater than 0, using default value of 1\n")
83+
}
84+
85+
pwd, err := os.Getwd()
86+
if err != nil {
87+
return err
88+
}
89+
90+
sigs := make(chan os.Signal)
91+
spawnSignal := make(chan bool)
92+
93+
signal.Notify(sigs, syscall.SIGTERM, syscall.SIGINT, syscall.SIGCHLD)
94+
95+
printer.Debugf("starting the child process, run %d of %d\n", 1, maxRuns)
96+
97+
pid, err := runChild(pwd)
98+
if err != nil {
99+
return err
100+
}
101+
102+
for {
103+
select {
104+
case sig := <- sigs:
105+
sigNum, ok := sig.(syscall.Signal)
106+
if !ok {
107+
return fmt.Errorf("unable to process the signal %v\n", sig)
108+
}
109+
110+
switch sigNum {
111+
case syscall.SIGINT, syscall.SIGTERM:
112+
if pid != 0 {
113+
printer.Debugf("sending %v to child\n", sigNum)
114+
115+
err := syscall.Kill(pid, sigNum)
116+
if err != nil {
117+
return err
118+
}
119+
120+
_, err = collectStatus(pid)
121+
if err != nil {
122+
return err
123+
}
124+
}
125+
126+
syscall.Exit(128 + int(sigNum))
127+
128+
case syscall.SIGCHLD:
129+
if pid == 0 {
130+
continue
131+
}
132+
133+
status, err := collectStatus(pid)
134+
if err != nil {
135+
return err
136+
}
137+
138+
printer.Debugf("child exited with %d\n", status.ExitCode())
139+
140+
if status.ExitCode() >= 0 && status.ExitCode() < 126 {
141+
syscall.Exit(status.ExitCode())
142+
}
143+
144+
pid = 0
145+
146+
if numRuns == maxRuns {
147+
return fmt.Errorf("maximum number of runs reached (%d), bailing out", maxRuns)
148+
}
149+
150+
printer.Debugf("retrying after %d seconds\n", delay)
151+
152+
go func() {
153+
<- time.After(time.Duration(delay) * time.Second)
154+
spawnSignal <- true
155+
} ()
156+
}
157+
158+
case <- spawnSignal:
159+
printer.Debugf("starting the child process, run %d of %d\n", numRuns + 1, maxRuns)
160+
161+
pid, err = runChild(pwd)
162+
if err != nil {
163+
return err
164+
}
165+
}
166+
}
167+
168+
// Unreachable
169+
panic("internal error")
170+
171+
return nil
172+
}

0 commit comments

Comments
 (0)