
Commit 8522331

Extend MaintenanceWindows parameter usage (#2810)
Consider maintenance window when migrating master pods and replacing pods (rolling update)
1 parent 46d5ebe

11 files changed: +168 -30 lines changed

docs/administrator.md

+3

@@ -208,6 +208,9 @@ Note that, changes in `SPILO_CONFIGURATION` env variable under `bootstrap.dcs`
 path are ignored for the diff. They will be applied through Patroni's rest api
 interface, following a restart of all instances.

+Rolling update is postponed until the next maintenance window if any is defined
+under the `maintenanceWindows` cluster manifest parameter.
+
 The operator also support lazy updates of the Spilo image. In this case the
 StatefulSet is only updated, but no rolling update follows. This feature saves
 you a switchover - and hence downtime - when you know pods are re-started later

docs/reference/cluster_manifest.md

+3 -3

@@ -116,9 +116,9 @@ These parameters are grouped directly under the `spec` key in the manifest.

 * **maintenanceWindows**
   a list which defines specific time frames when certain maintenance operations
-  are allowed. So far, it is only implemented for automatic major version
-  upgrades. Accepted formats are "01:00-06:00" for daily maintenance windows or
-  "Sat:00:00-04:00" for specific days, with all times in UTC.
+  such as automatic major upgrades or rolling updates are allowed. Accepted formats
+  are "01:00-06:00" for daily maintenance windows or "Sat:00:00-04:00" for specific
+  days, with all times in UTC.

 * **users**
   a map of usernames to user flags for the users that should be created in the
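The two accepted formats map onto the `Everyday`/`Weekday`/`StartTime`/`EndTime` fields exercised in the tests further down. Here is a minimal parsing sketch, illustrative only; the operator's real parser and `MaintenanceWindow` type live in its API package, and the `window` struct below is a stand-in:

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

// window is an illustrative stand-in for the operator's MaintenanceWindow type.
type window struct {
	everyday  bool
	weekday   time.Weekday
	startTime time.Time // only hour and minute are meaningful
	endTime   time.Time
}

// parseWindow accepts "01:00-06:00" (daily) or "Sat:00:00-04:00" (weekly), all times UTC.
func parseWindow(s string) (window, error) {
	w := window{everyday: true}
	days := map[string]time.Weekday{
		"Sun": time.Sunday, "Mon": time.Monday, "Tue": time.Tuesday, "Wed": time.Wednesday,
		"Thu": time.Thursday, "Fri": time.Friday, "Sat": time.Saturday,
	}
	// a leading "Sat:" (etc.) prefix switches the window from daily to weekly
	if parts := strings.SplitN(s, ":", 2); len(parts) == 2 {
		if d, ok := days[parts[0]]; ok {
			w.everyday = false
			w.weekday = d
			s = parts[1]
		}
	}
	times := strings.Split(s, "-")
	if len(times) != 2 {
		return w, fmt.Errorf("expected HH:MM-HH:MM, got %q", s)
	}
	var err error
	if w.startTime, err = time.Parse("15:04", times[0]); err != nil {
		return w, err
	}
	w.endTime, err = time.Parse("15:04", times[1])
	return w, err
}

func main() {
	for _, s := range []string{"01:00-06:00", "Sat:00:00-04:00"} {
		w, err := parseWindow(s)
		fmt.Printf("%+v %v\n", w, err)
	}
}
```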

e2e/tests/test_e2e.py

+3 -4

@@ -1251,7 +1251,7 @@ def get_annotations():
             "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

-        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
+        # no pod replacement outside of the maintenance window
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
         self.eventuallyEqual(check_version, 14, "Version should not be upgraded")

@@ -1276,7 +1276,7 @@ def get_annotations():
             "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

-        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
         self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")

@@ -1303,7 +1303,7 @@ def get_annotations():
             "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_17)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

-        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
         self.eventuallyEqual(check_version, 16, "Version should not be upgraded because annotation for last upgrade's failure is set")

@@ -1313,7 +1313,6 @@ def get_annotations():
             "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

-        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)

pkg/cluster/cluster.go

+42 -2

@@ -1731,18 +1731,58 @@ func (c *Cluster) GetStatus() *ClusterStatus {
 	return status
 }

+func (c *Cluster) GetSwitchoverSchedule() string {
+	var possibleSwitchover, schedule time.Time
+
+	now := time.Now().UTC()
+	for _, window := range c.Spec.MaintenanceWindows {
+		// in the best case it is possible today
+		possibleSwitchover = time.Date(now.Year(), now.Month(), now.Day(), window.StartTime.Hour(), window.StartTime.Minute(), 0, 0, time.UTC)
+		if window.Everyday {
+			if now.After(possibleSwitchover) {
+				// we are already past the time for today, try tomorrow
+				possibleSwitchover = possibleSwitchover.AddDate(0, 0, 1)
+			}
+		} else {
+			if now.Weekday() != window.Weekday {
+				// get closest possible time for this window
+				possibleSwitchover = possibleSwitchover.AddDate(0, 0, int((7+window.Weekday-now.Weekday())%7))
+			} else if now.After(possibleSwitchover) {
+				// we are already past the time for today, try next week
+				possibleSwitchover = possibleSwitchover.AddDate(0, 0, 7)
+			}
+		}
+
+		if (schedule == time.Time{}) || possibleSwitchover.Before(schedule) {
+			schedule = possibleSwitchover
+		}
+	}
+	return schedule.Format("2006-01-02T15:04+00")
+}
+
 // Switchover does a switchover (via Patroni) to a candidate pod
 func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
-
 	var err error
 	c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
+
+	if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
+		c.logger.Infof("postponing switchover, not in maintenance window")
+		schedule := c.GetSwitchoverSchedule()
+
+		if err := c.patroni.Switchover(curMaster, candidate.Name, schedule); err != nil {
+			return fmt.Errorf("could not schedule switchover: %v", err)
+		}
+		c.logger.Infof("switchover is scheduled at %s", schedule)
+		return nil
+	}
+
 	c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
 	stopCh := make(chan struct{})
 	ch := c.registerPodSubscriber(candidate)
 	defer c.unregisterPodSubscriber(candidate)
 	defer close(stopCh)

-	if err = c.patroni.Switchover(curMaster, candidate.Name); err == nil {
+	if err = c.patroni.Switchover(curMaster, candidate.Name, ""); err == nil {
 		c.logger.Debugf("successfully switched over from %q to %q", curMaster.Name, candidate)
 		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Successfully switched over from %q to %q", curMaster.Name, candidate)
 		_, err = c.waitForPodLabel(ch, stopCh, nil)
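The least obvious part of `GetSwitchoverSchedule` is the weekday arithmetic `(7 + window.Weekday - now.Weekday()) % 7`, which yields the number of days until the window's weekday. A standalone sketch with a fixed reference date (the `nextOccurrence` helper name is ours, not the operator's):

```go
package main

import (
	"fmt"
	"time"
)

// nextOccurrence mirrors the scheduling arithmetic in GetSwitchoverSchedule:
// find the next UTC time that falls on the given weekday at HH:MM.
func nextOccurrence(now time.Time, weekday time.Weekday, hour, minute int) time.Time {
	t := time.Date(now.Year(), now.Month(), now.Day(), hour, minute, 0, 0, time.UTC)
	if now.Weekday() != weekday {
		// jump 1..6 days forward to the requested weekday
		t = t.AddDate(0, 0, int((7+weekday-now.Weekday())%7))
	} else if now.After(t) {
		// right weekday, but the start time already passed; wait a week
		t = t.AddDate(0, 0, 7)
	}
	return t
}

func main() {
	now := time.Date(2024, 10, 2, 12, 0, 0, 0, time.UTC)  // a Wednesday, noon
	fmt.Println(nextOccurrence(now, time.Saturday, 1, 0))  // 2024-10-05 01:00
	fmt.Println(nextOccurrence(now, time.Wednesday, 9, 0)) // already passed today: 2024-10-09 09:00
}
```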

pkg/cluster/cluster_test.go

+88

@@ -2057,3 +2057,91 @@ func TestCompareVolumeMounts(t *testing.T) {
 		})
 	}
 }
+
+func TestGetSwitchoverSchedule(t *testing.T) {
+	now := time.Now()
+
+	futureTimeStart := now.Add(1 * time.Hour)
+	futureWindowTimeStart := futureTimeStart.Format("15:04")
+	futureWindowTimeEnd := now.Add(2 * time.Hour).Format("15:04")
+	pastTimeStart := now.Add(-2 * time.Hour)
+	pastWindowTimeStart := pastTimeStart.Format("15:04")
+	pastWindowTimeEnd := now.Add(-1 * time.Hour).Format("15:04")
+
+	tests := []struct {
+		name     string
+		windows  []acidv1.MaintenanceWindow
+		expected string
+	}{
+		{
+			name: "everyday maintenance windows is later today",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Everyday:  true,
+					StartTime: mustParseTime(futureWindowTimeStart),
+					EndTime:   mustParseTime(futureWindowTimeEnd),
+				},
+			},
+			expected: futureTimeStart.Format("2006-01-02T15:04+00"),
+		},
+		{
+			name: "everyday maintenance window is tomorrow",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Everyday:  true,
+					StartTime: mustParseTime(pastWindowTimeStart),
+					EndTime:   mustParseTime(pastWindowTimeEnd),
+				},
+			},
+			expected: pastTimeStart.AddDate(0, 0, 1).Format("2006-01-02T15:04+00"),
+		},
+		{
+			name: "weekday maintenance windows is later today",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Weekday:   now.Weekday(),
+					StartTime: mustParseTime(futureWindowTimeStart),
+					EndTime:   mustParseTime(futureWindowTimeEnd),
+				},
+			},
+			expected: futureTimeStart.Format("2006-01-02T15:04+00"),
+		},
+		{
+			name: "weekday maintenance windows is passed for today",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Weekday:   now.Weekday(),
+					StartTime: mustParseTime(pastWindowTimeStart),
+					EndTime:   mustParseTime(pastWindowTimeEnd),
+				},
+			},
+			expected: pastTimeStart.AddDate(0, 0, 7).Format("2006-01-02T15:04+00"),
+		},
+		{
+			name: "choose the earliest window",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Weekday:   now.AddDate(0, 0, 2).Weekday(),
+					StartTime: mustParseTime(futureWindowTimeStart),
+					EndTime:   mustParseTime(futureWindowTimeEnd),
+				},
+				{
+					Everyday:  true,
+					StartTime: mustParseTime(pastWindowTimeStart),
+					EndTime:   mustParseTime(pastWindowTimeEnd),
+				},
+			},
+			expected: pastTimeStart.AddDate(0, 0, 1).Format("2006-01-02T15:04+00"),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cluster.Spec.MaintenanceWindows = tt.windows
+			schedule := cluster.GetSwitchoverSchedule()
+			if schedule != tt.expected {
+				t.Errorf("Expected GetSwitchoverSchedule to return %s, returned: %s", tt.expected, schedule)
+			}
+		})
+	}
+}

pkg/cluster/majorversionupgrade.go

+1 -1

@@ -129,7 +129,7 @@ func (c *Cluster) majorVersionUpgrade() error {
 		return nil
 	}

-	if !isInMainternanceWindow(c.Spec.MaintenanceWindows) {
+	if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
 		c.logger.Infof("skipping major version upgrade, not in maintenance window")
 		return nil
 	}

pkg/cluster/resources.go

+2 -2

@@ -162,8 +162,8 @@ func (c *Cluster) preScaleDown(newStatefulSet *appsv1.StatefulSet) error {
 		return fmt.Errorf("pod %q does not belong to cluster", podName)
 	}

-	if err := c.patroni.Switchover(&masterPod[0], masterCandidatePod.Name); err != nil {
-		return fmt.Errorf("could not failover: %v", err)
+	if err := c.patroni.Switchover(&masterPod[0], masterCandidatePod.Name, ""); err != nil {
+		return fmt.Errorf("could not switchover: %v", err)
 	}

 	return nil

pkg/cluster/sync.go

+9 -1

@@ -497,6 +497,7 @@ func (c *Cluster) syncStatefulSet() error {
 	)
 	podsToRecreate := make([]v1.Pod, 0)
 	isSafeToRecreatePods := true
+	postponeReasons := make([]string, 0)
 	switchoverCandidates := make([]spec.NamespacedName, 0)

 	pods, err := c.listPods()

@@ -646,12 +647,19 @@ func (c *Cluster) syncStatefulSet() error {
 	c.logger.Debug("syncing Patroni config")
 	if configPatched, restartPrimaryFirst, restartWait, err = c.syncPatroniConfig(pods, c.Spec.Patroni, requiredPgParameters); err != nil {
 		c.logger.Warningf("Patroni config updated? %v - errors during config sync: %v", configPatched, err)
+		postponeReasons = append(postponeReasons, "errors during Patroni config sync")
 		isSafeToRecreatePods = false
 	}

 	// restart Postgres where it is still pending
 	if err = c.restartInstances(pods, restartWait, restartPrimaryFirst); err != nil {
 		c.logger.Errorf("errors while restarting Postgres in pods via Patroni API: %v", err)
+		postponeReasons = append(postponeReasons, "errors while restarting Postgres via Patroni API")
+		isSafeToRecreatePods = false
+	}
+
+	if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
+		postponeReasons = append(postponeReasons, "not in maintenance window")
 		isSafeToRecreatePods = false
 	}

@@ -666,7 +674,7 @@ func (c *Cluster) syncStatefulSet() error {
 		}
 		c.eventRecorder.Event(c.GetReference(), v1.EventTypeNormal, "Update", "Rolling update done - pods have been recreated")
 	} else {
-		c.logger.Warningf("postpone pod recreation until next sync because of errors during config sync")
+		c.logger.Warningf("postpone pod recreation until next sync - reason: %s", strings.Join(postponeReasons, `', '`))
 	}
 }
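Condensed, the new gating logic amounts to the pattern below: each failed precondition both flips the safety flag and records a human-readable reason, so the final log line explains the postponement. The variable values here are stand-ins for the real sync results:

```go
package main

import (
	"errors"
	"fmt"
	"strings"
)

func main() {
	// Stand-ins for the outcomes computed during syncStatefulSet.
	configSyncErr := errors.New("config patch rejected") // hypothetical failure
	inMaintenanceWindow := false

	postponeReasons := make([]string, 0)
	isSafeToRecreatePods := true

	if configSyncErr != nil {
		postponeReasons = append(postponeReasons, "errors during Patroni config sync")
		isSafeToRecreatePods = false
	}
	if !inMaintenanceWindow {
		postponeReasons = append(postponeReasons, "not in maintenance window")
		isSafeToRecreatePods = false
	}

	if isSafeToRecreatePods {
		fmt.Println("recreating pods")
	} else {
		fmt.Printf("postpone pod recreation until next sync - reason: %s\n",
			strings.Join(postponeReasons, ", "))
	}
}
```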

pkg/cluster/util.go

+1 -1

@@ -663,7 +663,7 @@ func parseResourceRequirements(resourcesRequirement v1.ResourceRequirements) (ac
 	return resources, nil
 }

-func isInMainternanceWindow(specMaintenanceWindows []acidv1.MaintenanceWindow) bool {
+func isInMaintenanceWindow(specMaintenanceWindows []acidv1.MaintenanceWindow) bool {
 	if len(specMaintenanceWindows) == 0 {
 		return true
 	}
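Only the empty-list early return of the renamed helper appears in this hunk. For orientation, a sketch of the full check, assuming it tests whether the current UTC time falls inside any defined window; the real implementation is in pkg/cluster/util.go and the `window` type below is a stand-in:

```go
package main

import (
	"fmt"
	"time"
)

// window is an illustrative stand-in for acidv1.MaintenanceWindow.
type window struct {
	everyday           bool
	weekday            time.Weekday
	startTime, endTime time.Time // only hour and minute matter
}

// inMaintenanceWindow sketches the check: no windows means always allowed,
// otherwise the current UTC time must fall inside at least one window.
func inMaintenanceWindow(windows []window) bool {
	if len(windows) == 0 {
		return true
	}
	now := time.Now().UTC()
	cur := now.Hour()*60 + now.Minute()
	for _, w := range windows {
		if !w.everyday && w.weekday != now.Weekday() {
			continue
		}
		start := w.startTime.Hour()*60 + w.startTime.Minute()
		end := w.endTime.Hour()*60 + w.endTime.Minute()
		if cur >= start && cur <= end {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(inMaintenanceWindow(nil)) // true: no windows, no restriction
}
```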

pkg/cluster/util_test.go

+3 -3

@@ -650,7 +650,7 @@ func Test_trimCronjobName(t *testing.T) {
 	}
 }

-func TestIsInMaintenanceWindow(t *testing.T) {
+func TestisInMaintenanceWindow(t *testing.T) {
 	now := time.Now()
 	futureTimeStart := now.Add(1 * time.Hour)
 	futureTimeStartFormatted := futureTimeStart.Format("15:04")

@@ -705,8 +705,8 @@ func TestIsInMaintenanceWindow(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			cluster.Spec.MaintenanceWindows = tt.windows
-			if isInMainternanceWindow(cluster.Spec.MaintenanceWindows) != tt.expected {
-				t.Errorf("Expected isInMainternanceWindow to return %t", tt.expected)
+			if isInMaintenanceWindow(cluster.Spec.MaintenanceWindows) != tt.expected {
+				t.Errorf("Expected isInMaintenanceWindow to return %t", tt.expected)
 			}
 		})
 	}

pkg/util/patroni/patroni.go

+13 -13

@@ -20,19 +20,19 @@ import (
 )

 const (
-	failoverPath = "/failover"
-	configPath   = "/config"
-	clusterPath  = "/cluster"
-	statusPath   = "/patroni"
-	restartPath  = "/restart"
-	ApiPort      = 8008
-	timeout      = 30 * time.Second
+	switchoverPath = "/switchover"
+	configPath     = "/config"
+	clusterPath    = "/cluster"
+	statusPath     = "/patroni"
+	restartPath    = "/restart"
+	ApiPort        = 8008
+	timeout        = 30 * time.Second
 )

 // Interface describe patroni methods
 type Interface interface {
 	GetClusterMembers(master *v1.Pod) ([]ClusterMember, error)
-	Switchover(master *v1.Pod, candidate string) error
+	Switchover(master *v1.Pod, candidate string, scheduled_at string) error
 	SetPostgresParameters(server *v1.Pod, options map[string]string) error
 	SetStandbyClusterParameters(server *v1.Pod, options map[string]interface{}) error
 	GetMemberData(server *v1.Pod) (MemberData, error)

@@ -103,7 +103,7 @@ func (p *Patroni) httpPostOrPatch(method string, url string, body *bytes.Buffer)
 		}
 	}()

-	if resp.StatusCode != http.StatusOK {
+	if resp.StatusCode < http.StatusOK || resp.StatusCode >= 300 {
 		bodyBytes, err := io.ReadAll(resp.Body)
 		if err != nil {
 			return fmt.Errorf("could not read response: %v", err)

@@ -128,25 +128,25 @@ func (p *Patroni) httpGet(url string) (string, error) {
 		return "", fmt.Errorf("could not read response: %v", err)
 	}

-	if response.StatusCode != http.StatusOK {
+	if response.StatusCode < http.StatusOK || response.StatusCode >= 300 {
 		return string(bodyBytes), fmt.Errorf("patroni returned '%d'", response.StatusCode)
 	}

 	return string(bodyBytes), nil
 }

 // Switchover by calling Patroni REST API
-func (p *Patroni) Switchover(master *v1.Pod, candidate string) error {
+func (p *Patroni) Switchover(master *v1.Pod, candidate string, scheduled_at string) error {
 	buf := &bytes.Buffer{}
-	err := json.NewEncoder(buf).Encode(map[string]string{"leader": master.Name, "member": candidate})
+	err := json.NewEncoder(buf).Encode(map[string]string{"leader": master.Name, "member": candidate, "scheduled_at": scheduled_at})
 	if err != nil {
 		return fmt.Errorf("could not encode json: %v", err)
 	}
 	apiURLString, err := apiURL(master)
 	if err != nil {
 		return err
 	}
-	return p.httpPostOrPatch(http.MethodPost, apiURLString+failoverPath, buf)
+	return p.httpPostOrPatch(http.MethodPost, apiURLString+switchoverPath, buf)
 }

 //TODO: add an option call /patroni to check if it is necessary to restart the server
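The net effect on the wire: the operator now POSTs to Patroni's /switchover endpoint, and a non-empty `scheduled_at` turns an immediate switchover into a scheduled one. A minimal sketch of the request body (the pod names are made up; the timestamp format matches the output of `GetSwitchoverSchedule`):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

func main() {
	// An empty scheduled_at (as passed from preScaleDown) requests an
	// immediate switchover; a timestamp defers it to the maintenance window.
	buf := &bytes.Buffer{}
	_ = json.NewEncoder(buf).Encode(map[string]string{
		"leader":       "acid-minimal-cluster-0", // hypothetical master pod
		"member":       "acid-minimal-cluster-1", // hypothetical candidate
		"scheduled_at": "2024-10-05T01:00+00",
	})
	// POSTed to http://<master-pod-ip>:8008/switchover
	fmt.Print(buf.String())
}
```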
