Skip to content

add rollback to config experiments on Windows #37494

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitlab/e2e/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,8 @@ new-e2e-installer-windows:
- EXTRA_PARAMS: --run "TestAgentConfig$/TestConfigUpgradeSuccessful$"
- EXTRA_PARAMS: --run "TestAgentConfig$/TestConfigUpgradeFailure$"
- EXTRA_PARAMS: --run "TestAgentConfig$/TestConfigUpgradeNewAgents$"
- EXTRA_PARAMS: --run "TestAgentConfig$/TestRevertsConfigExperimentWhenServiceDies$"
- EXTRA_PARAMS: --run "TestAgentConfig$/TestRevertsConfigExperimentWhenTimeout$"
# install-exe
- EXTRA_PARAMS: --run "TestInstallExe$/TestInstallAgentPackage$"
# install-script
Expand Down
25 changes: 25 additions & 0 deletions cmd/installer/subcommands/daemon/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type cliParams struct {
pkg string
version string
catalog string
configs string
}

func apiCommands(global *command.GlobalParams) []*cobra.Command {
Expand All @@ -45,6 +46,20 @@ func apiCommands(global *command.GlobalParams) []*cobra.Command {
})
},
}

setConfigCatalogCmd := &cobra.Command{
Hidden: true,
Use: "set-config-catalog configs",
Short: "Internal command to set the config catalog to use",
Args: cobra.ExactArgs(1),
RunE: func(_ *cobra.Command, args []string) error {
return experimentFxWrapper(setConfigCatalog, &cliParams{
GlobalParams: *global,
configs: args[0],
})
},
}

installCmd := &cobra.Command{
Use: "install package version",
Aliases: []string{"install"},
Expand Down Expand Up @@ -155,6 +170,7 @@ func apiCommands(global *command.GlobalParams) []*cobra.Command {
}
return []*cobra.Command{
setCatalogCmd,
setConfigCatalogCmd,
startExperimentCmd,
stopExperimentCmd,
promoteExperimentCmd,
Expand Down Expand Up @@ -190,6 +206,15 @@ func catalog(params *cliParams, client localapiclient.Component) error {
return nil
}

// setConfigCatalog sends params.configs to the daemon through the local API
// client. On failure the error is printed to stdout and returned unchanged.
func setConfigCatalog(params *cliParams, client localapiclient.Component) error {
	if err := client.SetConfigCatalog(params.configs); err != nil {
		fmt.Println("Error setting config catalog:", err)
		return err
	}
	return nil
}

func start(params *cliParams, client localapiclient.Component) error {
err := client.StartExperiment(params.pkg, params.version)
if err != nil {
Expand Down
16 changes: 15 additions & 1 deletion pkg/fleet/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ type Daemon interface {
Stop(ctx context.Context) error

SetCatalog(c catalog)
SetConfigCatalog(configs map[string]installerConfig)
Install(ctx context.Context, url string, args []string) error
Remove(ctx context.Context, pkg string) error
StartExperiment(ctx context.Context, url string) error
Expand All @@ -88,6 +89,7 @@ type daemonImpl struct {
catalog catalog
catalogOverride catalog
configs map[string]installerConfig
configsOverride map[string]installerConfig
requests chan remoteAPIRequest
requestsWG sync.WaitGroup
taskDB *taskDB
Expand Down Expand Up @@ -148,6 +150,7 @@ func newDaemon(rc *remoteConfig, installer func(env *env.Env) installer.Installe
catalog: catalog{},
catalogOverride: catalog{},
configs: make(map[string]installerConfig),
configsOverride: make(map[string]installerConfig),
stopChan: make(chan struct{}),
taskDB: taskDB,
}
Expand Down Expand Up @@ -256,6 +259,13 @@ func (d *daemonImpl) SetCatalog(c catalog) {
d.catalogOverride = c
}

// SetConfigCatalog replaces the daemon's config catalog override with configs.
// The assignment is performed while holding the daemon mutex.
func (d *daemonImpl) SetConfigCatalog(configs map[string]installerConfig) {
	d.m.Lock()
	d.configsOverride = configs
	d.m.Unlock()
}

// Start starts remote config and the garbage collector.
func (d *daemonImpl) Start(_ context.Context) error {
d.m.Lock()
Expand Down Expand Up @@ -425,7 +435,11 @@ func (d *daemonImpl) startConfigExperiment(ctx context.Context, pkg string, vers
defer d.refreshState(ctx)

log.Infof("Daemon: Starting config experiment version %s for package %s", version, pkg)
config, ok := d.configs[version]
configs := d.configs
if len(d.configsOverride) > 0 {
configs = d.configsOverride
}
config, ok := configs[version]
if !ok {
return fmt.Errorf("could not find config version %s", version)
}
Expand Down
43 changes: 43 additions & 0 deletions pkg/fleet/daemon/local_api.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func (l *localAPIImpl) handler() http.Handler {
r := mux.NewRouter().Headers("Content-Type", "application/json").Subrouter()
r.HandleFunc("/status", l.status).Methods(http.MethodGet)
r.HandleFunc("/catalog", l.setCatalog).Methods(http.MethodPost)
r.HandleFunc("/config_catalog", l.setConfigCatalog).Methods(http.MethodPost)
r.HandleFunc("/{package}/experiment/start", l.startExperiment).Methods(http.MethodPost)
r.HandleFunc("/{package}/experiment/stop", l.stopExperiment).Methods(http.MethodPost)
r.HandleFunc("/{package}/experiment/promote", l.promoteExperiment).Methods(http.MethodPost)
Expand Down Expand Up @@ -115,6 +116,23 @@ func (l *localAPIImpl) setCatalog(w http.ResponseWriter, r *http.Request) {
l.daemon.SetCatalog(catalog)
}

// setConfigCatalog is the handler registered for POST /config_catalog.
//
// It decodes the request body as a JSON object mapping a key to an
// installerConfig and hands the result to the daemon as its config catalog.
// An APIResponse is always written back; if the body cannot be decoded the
// status is 400 and the response carries the decode error message.
func (l *localAPIImpl) setConfigCatalog(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")

	var (
		configs  map[string]installerConfig
		response APIResponse
	)
	// Emit the response envelope on every exit path.
	defer func() {
		_ = json.NewEncoder(w).Encode(response)
	}()

	if err := json.NewDecoder(r.Body).Decode(&configs); err != nil {
		w.WriteHeader(http.StatusBadRequest)
		response.Error = &APIError{Message: err.Error()}
		return
	}

	log.Infof("Received local request to set config catalog")
	l.daemon.SetConfigCatalog(configs)
}

// example: curl -X POST --unix-socket /opt/datadog-packages/run/installer.sock -H 'Content-Type: application/json' http://installer/datadog-agent/experiment/start -d '{"version":"1.21.5"}'
func (l *localAPIImpl) startExperiment(w http.ResponseWriter, r *http.Request) {
pkg := mux.Vars(r)["package"]
Expand Down Expand Up @@ -304,6 +322,7 @@ type LocalAPIClient interface {
Status() (StatusResponse, error)

SetCatalog(catalog string) error
SetConfigCatalog(configs string) error
Install(pkg, version string) error
Remove(pkg string) error
StartExperiment(pkg, version string) error
Expand Down Expand Up @@ -368,6 +387,30 @@ func (c *localAPIClientImpl) SetCatalog(catalog string) error {
return nil
}

// SetConfigCatalog posts configs to the daemon's /config_catalog endpoint.
//
// configs is sent verbatim as the request body with a JSON content type. An
// error is returned if the request cannot be built or sent, if the response
// body cannot be decoded as an APIResponse, or if the decoded response
// carries an error.
func (c *localAPIClientImpl) SetConfigCatalog(configs string) error {
	endpoint := fmt.Sprintf("http://%s/config_catalog", c.addr)
	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewBufferString(configs))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/json")

	resp, err := c.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	var response APIResponse
	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
		return err
	}
	if apiErr := response.Error; apiErr != nil {
		return fmt.Errorf("error setting config catalog: %s", apiErr.Message)
	}
	return nil
}

// StartExperiment starts an experiment for a package.
func (c *localAPIClientImpl) StartExperiment(pkg, version string) error {
params := experimentTaskParams{
Expand Down
7 changes: 4 additions & 3 deletions pkg/fleet/daemon/local_api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.

// for now the installer is not supported on windows
//go:build !windows

package daemon

import (
Expand Down Expand Up @@ -99,6 +96,10 @@ func (m *testDaemon) SetCatalog(catalog catalog) {
m.Called(catalog)
}

// SetConfigCatalog is the testDaemon version of SetConfigCatalog: it passes
// configs to m.Called and returns nothing.
func (m *testDaemon) SetConfigCatalog(configs map[string]installerConfig) {
m.Called(configs)
}

type testLocalAPI struct {
i *testDaemon
s *localAPIImpl
Expand Down
18 changes: 16 additions & 2 deletions pkg/fleet/installer/installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,11 @@ func (i *installerImpl) InstallConfigExperiment(ctx context.Context, pkg string,
)
}

// HACK: close so package can be updated as watchdog runs
if pkg == packageDatadogAgent && runtime.GOOS == "windows" {
i.db.Close()
}

return i.hooks.PostStartConfigExperiment(ctx, pkg)
}

Expand All @@ -532,11 +537,20 @@ func (i *installerImpl) RemoveConfigExperiment(ctx context.Context, pkg string)
i.m.Lock()
defer i.m.Unlock()

err := i.hooks.PreStopConfigExperiment(ctx, pkg)
repository := i.configs.Get(pkg)
state, err := repository.GetState()
if err != nil {
return fmt.Errorf("could not get repository state: %w", err)
}
if !state.HasExperiment() {
// Return early
return nil
}

err = i.hooks.PreStopConfigExperiment(ctx, pkg)
if err != nil {
return fmt.Errorf("could not stop experiment: %w", err)
}
repository := i.configs.Get(pkg)
err = repository.DeleteExperiment(ctx)
if err != nil {
return installerErrors.Wrap(
Expand Down
79 changes: 76 additions & 3 deletions pkg/fleet/installer/packages/datadog_agent_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -589,8 +589,25 @@ func setFleetPoliciesDir(path string) error {

// postStartConfigExperimentDatadogAgent runs post start scripts for a config experiment.
//
// Sets the fleet_policies_dir registry key to the experiment config path and restarts the agent service.
func postStartConfigExperimentDatadogAgent(_ HookContext) error {
// Function requirements:
// - be its own process, not run within the daemon
//
// Rollback notes:
// The config experiment uses a watchdog to monitor the Agent service.
// If the service fails to start or stops running, the watchdog will restore
// the stable config using the remove-config-experiment command.
// This ensures the system remains in a consistent state even if the experiment
// config causes issues.
// - If the new config is working properly then it will receive "promote"
// from the backend and will set an event to stop the watchdog.
// - If the new config fails to start the Agent, then after a timeout the
// watchdog will restore the stable config.
func postStartConfigExperimentDatadogAgent(ctx HookContext) error {
// Signal the event that marks the end of the experiment.
// This terminates any other running instance of the watchdog,
// which allows multiple experiments to be run in sequence.
_ = setWatchdogStopEvent()

// Set the registry key to point to the experiment config
experimentPath := filepath.Join(paths.ConfigsPath, "datadog-agent", "experiment")
err := setFleetPoliciesDir(experimentPath)
Expand All @@ -601,15 +618,63 @@ func postStartConfigExperimentDatadogAgent(_ HookContext) error {
// Start the agent service to pick up the new config
err = winutil.RestartService("datadogagent")
if err != nil {
// Agent failed to start, restore stable config
restoreErr := restoreStableConfigFromExperiment(ctx)
if restoreErr != nil {
log.Error(restoreErr)
err = fmt.Errorf("%w, %w", err, restoreErr)
}
return fmt.Errorf("failed to start agent service: %w", err)
}

// Start watchdog to monitor the agent service
timeout := getWatchdogTimeout()
err = startWatchdog(ctx, time.Now().Add(timeout))
if err != nil {
log.Errorf("Config watchdog failed: %s", err)
// If watchdog fails, restore stable config
restoreErr := restoreStableConfigFromExperiment(ctx)
if restoreErr != nil {
log.Error(restoreErr)
err = fmt.Errorf("%w, %w", err, restoreErr)
}
return err
}

return nil
}

// restoreStableConfigFromExperiment restores the stable config by running
// remove-config-experiment for ctx.Package through an installer exec.
//
// Running remove-config-experiment:
//   - restores the stable config
//   - updates the repository state / removes the experiment link
//
// The updated repository state will cause the stable daemon to skip the stop-experiment
// operation received from the backend, which avoids restarting the services again.
//
// Any failure is wrapped with context and returned.
func restoreStableConfigFromExperiment(ctx HookContext) error {
	instExec, err := newInstallerExec(getenv())
	if err != nil {
		return fmt.Errorf("failed to create installer exec: %w", err)
	}
	if err := instExec.RemoveConfigExperiment(ctx, ctx.Package); err != nil {
		return fmt.Errorf("failed to restore stable config: %w", err)
	}
	return nil
}

// preStopConfigExperimentDatadogAgent runs pre stop scripts for a config experiment.
//
// Sets the fleet_policies_dir registry key to the stable config path and restarts the agent service.
func preStopConfigExperimentDatadogAgent(_ HookContext) error {
// Set the watchdog stop event so that any running watchdog exits.
// A failure here is ignored because we proceed with the stop anyway;
// the call only serves to stop a watchdog that may still be running.
_ = setWatchdogStopEvent()

// Set the registry key to point to the previous stable config
stablePath := filepath.Join(paths.ConfigsPath, "datadog-agent", "stable")
err := setFleetPoliciesDir(stablePath)
Expand All @@ -629,9 +694,17 @@ func preStopConfigExperimentDatadogAgent(_ HookContext) error {
//
// Sets the fleet_policies_dir registry key to the stable config path and restarts the agent service.
func postPromoteConfigExperimentDatadogAgent(_ HookContext) error {
err := setWatchdogStopEvent()
if err != nil {
// if we can't set the event it means the watchdog has failed
// In this case, we were already promoting the experiment
// so we can continue without error
log.Errorf("failed to set premote event: %s", err)
}

// Set the registry key to point to the stable config (which now contains the promoted experiment)
stablePath := filepath.Join(paths.ConfigsPath, "datadog-agent", "stable")
err := setFleetPoliciesDir(stablePath)
err = setFleetPoliciesDir(stablePath)
if err != nil {
return err
}
Expand Down
6 changes: 3 additions & 3 deletions test/new-e2e/tests/installer/windows/base_suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ func (s *BaseSuite) AssertSuccessfulConfigStartExperiment(configID string) {
s.Require().NoError(err)

s.Require().Host(s.Env().RemoteHost).HasDatadogInstaller().Status().
HasConfigState("datadog-agent").
HasConfigState(consts.AgentPackage).
WithExperimentConfigEqual(configID).
HasARunningDatadogAgentService()
}
Expand All @@ -407,7 +407,7 @@ func (s *BaseSuite) AssertSuccessfulConfigPromoteExperiment(configID string) {
s.Require().NoError(err)

s.Require().Host(s.Env().RemoteHost).HasDatadogInstaller().Status().
HasConfigState("datadog-agent").
HasConfigState(consts.AgentPackage).
WithStableConfigEqual(configID).
WithExperimentConfigEqual("").
HasARunningDatadogAgentService()
Expand All @@ -421,7 +421,7 @@ func (s *BaseSuite) AssertSuccessfulConfigStopExperiment() {
s.Require().NoError(err)

s.Require().Host(s.Env().RemoteHost).HasDatadogInstaller().Status().
HasConfigState("datadog-agent").
HasConfigState(consts.AgentPackage).
WithExperimentConfigEqual("").
HasARunningDatadogAgentService()
}
Loading
Loading