Skip to content

Commit 82f70fb

Browse files
authored
Merge pull request #246 from arangodb/bugfix/resilience-improvements
Resilience improvements
2 parents f0a0029 + cb84514 commit 82f70fb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+2452
-382
lines changed

examples/metrics/dashboard.json

Lines changed: 1056 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,22 @@
11
# This example shows how to integrate with the Prometheus Operator
22
# to bring metrics from kube-arangodb to Prometheus.
33

4-
apiVersion: v1
5-
kind: Service
6-
metadata:
7-
name: arango-deployment-operator
8-
labels:
9-
app: arango-deployment-operator
10-
spec:
11-
selector:
12-
app: arango-deployment-operator
13-
ports:
14-
- name: metrics
15-
port: 8528
16-
17-
---
18-
194
apiVersion: monitoring.coreos.com/v1
205
kind: ServiceMonitor
216
metadata:
227
name: arango-deployment-operator
8+
namespace: monitoring
239
labels:
24-
team: frontend
10+
prometheus: kube-prometheus
2511
spec:
2612
selector:
2713
matchLabels:
2814
app: arango-deployment-operator
15+
namespaceSelector:
16+
matchNames:
17+
- default
2918
endpoints:
30-
- port: metrics
19+
- port: server
3120
scheme: https
3221
tlsConfig:
3322
insecureSkipVerify: true
34-

lifecycle.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,4 +147,6 @@ func cmdLifecycleCopyRun(cmd *cobra.Command, args []string) {
147147
if err := os.Chmod(targetPath, 0755); err != nil {
148148
cliLog.Fatal().Err(err).Msg("Failed to chmod")
149149
}
150+
151+
cliLog.Info().Msgf("Executable copied to %s", targetPath)
150152
}

pkg/apis/deployment/v1alpha/conditions.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ const (
4040
// ConditionTypeCleanedOut indicates that the member (dbserver) has been cleaned out.
4141
// Always check in combination with ConditionTypeTerminated.
4242
ConditionTypeCleanedOut ConditionType = "CleanedOut"
43+
// ConditionTypeAgentRecoveryNeeded indicates that the member (agent) will no
44+
// longer recover from its current volume and there has to be rebuild
45+
// using the recovery procedure.
46+
ConditionTypeAgentRecoveryNeeded ConditionType = "AgentRecoveryNeeded"
4347
// ConditionTypePodSchedulingFailure indicates that one or more pods belonging to the deployment cannot be schedule.
4448
ConditionTypePodSchedulingFailure ConditionType = "PodSchedulingFailure"
4549
// ConditionTypeSecretsChanged indicates that the value of one of more secrets used by

pkg/apis/deployment/v1alpha/member_status_list.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package v1alpha
2424

2525
import (
2626
"math/rand"
27+
"sort"
2728

2829
"github.com/pkg/errors"
2930
)
@@ -83,7 +84,9 @@ func (l *MemberStatusList) add(m MemberStatus) error {
8384
return maskAny(errors.Wrapf(AlreadyExistsError, "Member '%s' already exists", m.ID))
8485
}
8586
}
86-
*l = append(src, m)
87+
newList := append(src, m)
88+
sort.Slice(newList, func(i, j int) bool { return newList[i].ID < newList[j].ID })
89+
*l = newList
8790
return nil
8891
}
8992

pkg/deployment/access_package.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,15 @@ func (d *Deployment) ensureAccessPackage(apSecretName string) error {
108108

109109
// Fetch client authentication CA
110110
clientAuthSecretName := spec.Sync.Authentication.GetClientCASecretName()
111-
clientAuthCert, clientAuthKey, _, err := k8sutil.GetCASecret(d.deps.KubeCli.CoreV1(), clientAuthSecretName, ns, nil)
111+
clientAuthCert, clientAuthKey, _, err := k8sutil.GetCASecret(secrets, clientAuthSecretName, nil)
112112
if err != nil {
113113
log.Debug().Err(err).Msg("Failed to get client-auth CA secret")
114114
return maskAny(err)
115115
}
116116

117117
// Fetch TLS CA public key
118118
tlsCASecretName := spec.Sync.TLS.GetCASecretName()
119-
tlsCACert, err := k8sutil.GetCACertficateSecret(d.deps.KubeCli.CoreV1(), tlsCASecretName, ns)
119+
tlsCACert, err := k8sutil.GetCACertficateSecret(secrets, tlsCASecretName)
120120
if err != nil {
121121
log.Debug().Err(err).Msg("Failed to get TLS CA secret")
122122
return maskAny(err)

pkg/deployment/context_impl.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,9 @@ func (d *Deployment) GetSyncServerClient(ctx context.Context, group api.ServerGr
171171
log := d.deps.Log
172172
kubecli := d.deps.KubeCli
173173
ns := d.apiObject.GetNamespace()
174+
secrets := kubecli.CoreV1().Secrets(ns)
174175
secretName := d.apiObject.Spec.Sync.Monitoring.GetTokenSecretName()
175-
monitoringToken, err := k8sutil.GetTokenSecret(kubecli.CoreV1(), secretName, ns)
176+
monitoringToken, err := k8sutil.GetTokenSecret(secrets, secretName)
176177
if err != nil {
177178
log.Debug().Err(err).Str("secret-name", secretName).Msg("Failed to get sync monitoring secret")
178179
return nil, maskAny(err)
@@ -331,7 +332,8 @@ func (d *Deployment) GetPvc(pvcName string) (*v1.PersistentVolumeClaim, error) {
331332
func (d *Deployment) GetTLSKeyfile(group api.ServerGroup, member api.MemberStatus) (string, error) {
332333
secretName := k8sutil.CreateTLSKeyfileSecretName(d.apiObject.GetName(), group.AsRole(), member.ID)
333334
ns := d.apiObject.GetNamespace()
334-
result, err := k8sutil.GetTLSKeyfileSecret(d.deps.KubeCli.CoreV1(), secretName, ns)
335+
secrets := d.deps.KubeCli.CoreV1().Secrets(ns)
336+
result, err := k8sutil.GetTLSKeyfileSecret(secrets, secretName)
335337
if err != nil {
336338
return "", maskAny(err)
337339
}
@@ -353,8 +355,9 @@ func (d *Deployment) DeleteTLSKeyfile(group api.ServerGroup, member api.MemberSt
353355
// Returns: publicKey, privateKey, ownerByDeployment, error
354356
func (d *Deployment) GetTLSCA(secretName string) (string, string, bool, error) {
355357
ns := d.apiObject.GetNamespace()
358+
secrets := d.deps.KubeCli.CoreV1().Secrets(ns)
356359
owner := d.apiObject.AsOwner()
357-
cert, priv, isOwned, err := k8sutil.GetCASecret(d.deps.KubeCli.CoreV1(), secretName, ns, &owner)
360+
cert, priv, isOwned, err := k8sutil.GetCASecret(secrets, secretName, &owner)
358361
if err != nil {
359362
return "", "", false, maskAny(err)
360363
}

pkg/deployment/deployment.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242
"github.com/arangodb/kube-arangodb/pkg/deployment/resilience"
4343
"github.com/arangodb/kube-arangodb/pkg/deployment/resources"
4444
"github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned"
45+
"github.com/arangodb/kube-arangodb/pkg/util"
4546
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
4647
"github.com/arangodb/kube-arangodb/pkg/util/retry"
4748
"github.com/arangodb/kube-arangodb/pkg/util/trigger"
@@ -78,8 +79,8 @@ type deploymentEvent struct {
7879

7980
const (
8081
deploymentEventQueueSize = 256
81-
minInspectionInterval = time.Second // Ensure we inspect the generated resources no less than with this interval
82-
maxInspectionInterval = time.Minute // Ensure we inspect the generated resources no less than with this interval
82+
minInspectionInterval = util.Interval(time.Second) // Ensure we inspect the generated resources no less than with this interval
83+
maxInspectionInterval = util.Interval(time.Minute) // Ensure we inspect the generated resources no less than with this interval
8384
)
8485

8586
// Deployment is the in process state of an ArangoDeployment.
@@ -140,6 +141,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De
140141
ci := newClusterScalingIntegration(d)
141142
d.clusterScalingIntegration = ci
142143
go ci.ListenForClusterEvents(d.stopCh)
144+
go d.resources.RunDeploymentHealthLoop(d.stopCh)
143145
}
144146
if config.AllowChaos {
145147
d.chaosMonkey = chaos.NewMonkey(deps.Log, d)
@@ -247,21 +249,21 @@ func (d *Deployment) run() {
247249
}
248250

249251
case <-d.inspectTrigger.Done():
252+
log.Debug().Msg("Inspect deployment...")
250253
inspectionInterval = d.inspectDeployment(inspectionInterval)
254+
log.Debug().Str("interval", inspectionInterval.String()).Msg("...inspected deployment")
251255

252256
case <-d.updateDeploymentTrigger.Done():
257+
inspectionInterval = minInspectionInterval
253258
if err := d.handleArangoDeploymentUpdatedEvent(); err != nil {
254259
d.CreateEvent(k8sutil.NewErrorEvent("Failed to handle deployment update", err, d.GetAPIObject()))
255260
}
256261

257-
case <-time.After(inspectionInterval):
262+
case <-inspectionInterval.After():
258263
// Trigger inspection
259264
d.inspectTrigger.Trigger()
260265
// Backoff with next interval
261-
inspectionInterval = time.Duration(float64(inspectionInterval) * 1.5)
262-
if inspectionInterval > maxInspectionInterval {
263-
inspectionInterval = maxInspectionInterval
264-
}
266+
inspectionInterval = inspectionInterval.Backoff(1.5, maxInspectionInterval)
265267
}
266268
}
267269
}

pkg/deployment/deployment_inspector.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,26 +27,35 @@ import (
2727
"time"
2828

2929
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
30+
"github.com/arangodb/kube-arangodb/pkg/metrics"
31+
"github.com/arangodb/kube-arangodb/pkg/util"
3032
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3133
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3234
)
3335

36+
var (
37+
inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_deployment_duration", "Amount of time taken by a single inspection of a deployment (in sec)", metrics.DeploymentName)
38+
)
39+
3440
// inspectDeployment inspects the entire deployment, creates
3541
// a plan to update if needed and inspects underlying resources.
3642
// This function should be called when:
3743
// - the deployment has changed
3844
// - any of the underlying resources has changed
3945
// - once in a while
4046
// Returns the delay until this function should be called again.
41-
func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration {
47+
func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval {
4248
log := d.deps.Log
49+
start := time.Now()
4350

4451
nextInterval := lastInterval
4552
hasError := false
4653
ctx := context.Background()
54+
deploymentName := d.apiObject.GetName()
55+
defer metrics.SetDuration(inspectDeploymentDurationGauges.WithLabelValues(deploymentName), start)
4756

4857
// Check deployment still exists
49-
updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(d.apiObject.GetName(), metav1.GetOptions{})
58+
updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(deploymentName, metav1.GetOptions{})
5059
if k8sutil.IsNotFound(err) {
5160
// Deployment is gone
5261
log.Info().Msg("Deployment is gone")
@@ -87,13 +96,17 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
8796
}
8897

8998
// Inspection of generated resources needed
90-
if err := d.resources.InspectPods(ctx); err != nil {
99+
if x, err := d.resources.InspectPods(ctx); err != nil {
91100
hasError = true
92101
d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject))
102+
} else {
103+
nextInterval = nextInterval.ReduceTo(x)
93104
}
94-
if err := d.resources.InspectPVCs(ctx); err != nil {
105+
if x, err := d.resources.InspectPVCs(ctx); err != nil {
95106
hasError = true
96107
d.CreateEvent(k8sutil.NewErrorEvent("PVC inspection failed", err, d.apiObject))
108+
} else {
109+
nextInterval = nextInterval.ReduceTo(x)
97110
}
98111

99112
// Check members for resilience
@@ -149,9 +162,11 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
149162
}
150163

151164
// At the end of the inspect, we cleanup terminated pods.
152-
if err := d.resources.CleanupTerminatedPods(); err != nil {
165+
if x, err := d.resources.CleanupTerminatedPods(); err != nil {
153166
hasError = true
154167
d.CreateEvent(k8sutil.NewErrorEvent("Pod cleanup failed", err, d.apiObject))
168+
} else {
169+
nextInterval = nextInterval.ReduceTo(x)
155170
}
156171
}
157172

@@ -164,10 +179,7 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
164179
} else {
165180
d.recentInspectionErrors = 0
166181
}
167-
if nextInterval > maxInspectionInterval {
168-
nextInterval = maxInspectionInterval
169-
}
170-
return nextInterval
182+
return nextInterval.ReduceTo(maxInspectionInterval)
171183
}
172184

173185
// triggerInspection ensures that an inspection is run soon.

pkg/deployment/metrics.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package deployment
24+
25+
const (
26+
// Component name for metrics of this package
27+
metricsComponent = "deployment"
28+
)

pkg/deployment/reconcile/plan_builder.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
101101
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
102102
for _, m := range members {
103103
if m.Phase == api.MemberPhaseFailed && len(plan) == 0 {
104+
log.Debug().
105+
Str("id", m.ID).
106+
Str("role", group.AsRole()).
107+
Msg("Creating member replacement plan because member has failed")
104108
newID := ""
105109
if group == api.ServerGroupAgents {
106110
newID = m.ID // Agents cannot (yet) be replaced with new IDs
@@ -117,6 +121,10 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
117121
// Check for cleaned out dbserver in created state
118122
for _, m := range status.Members.DBServers {
119123
if len(plan) == 0 && m.Phase == api.MemberPhaseCreated && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) {
124+
log.Debug().
125+
Str("id", m.ID).
126+
Str("role", api.ServerGroupDBServers.AsRole()).
127+
Msg("Creating dbserver replacement plan because server is cleanout in created phase")
120128
plan = append(plan,
121129
api.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, m.ID),
122130
api.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, ""),
@@ -398,13 +406,18 @@ func createRotateMemberPlan(log zerolog.Logger, member api.MemberStatus,
398406
// member.
399407
func createUpgradeMemberPlan(log zerolog.Logger, member api.MemberStatus,
400408
group api.ServerGroup, reason string, imageName string, status api.DeploymentStatus) api.Plan {
409+
upgradeAction := api.ActionTypeUpgradeMember
410+
if group.IsStateless() {
411+
upgradeAction = api.ActionTypeRotateMember
412+
}
401413
log.Debug().
402414
Str("id", member.ID).
403415
Str("role", group.AsRole()).
404416
Str("reason", reason).
417+
Str("action", string(upgradeAction)).
405418
Msg("Creating upgrade plan")
406419
plan := api.Plan{
407-
api.NewAction(api.ActionTypeUpgradeMember, group, member.ID, reason),
420+
api.NewAction(upgradeAction, group, member.ID, reason),
408421
api.NewAction(api.ActionTypeWaitForMemberUp, group, member.ID),
409422
}
410423
if status.CurrentImage == nil || status.CurrentImage.Image != imageName {

pkg/deployment/resources/certificates_client_auth.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ const (
4242

4343
// createClientAuthCACertificate creates a client authentication CA certificate and stores it in a secret with name
4444
// specified in the given spec.
45-
func createClientAuthCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, spec api.SyncAuthenticationSpec, deploymentName, namespace string, ownerRef *metav1.OwnerReference) error {
45+
func createClientAuthCACertificate(log zerolog.Logger, secrets k8sutil.SecretInterface, spec api.SyncAuthenticationSpec, deploymentName string, ownerRef *metav1.OwnerReference) error {
4646
log = log.With().Str("secret", spec.GetClientCASecretName()).Logger()
4747
options := certificates.CreateCertificateOptions{
4848
CommonName: fmt.Sprintf("%s Client Authentication Root Certificate", deploymentName),
@@ -57,7 +57,7 @@ func createClientAuthCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, s
5757
log.Debug().Err(err).Msg("Failed to create CA certificate")
5858
return maskAny(err)
5959
}
60-
if err := k8sutil.CreateCASecret(cli, spec.GetClientCASecretName(), namespace, cert, priv, ownerRef); err != nil {
60+
if err := k8sutil.CreateCASecret(secrets, spec.GetClientCASecretName(), cert, priv, ownerRef); err != nil {
6161
if k8sutil.IsAlreadyExists(err) {
6262
log.Debug().Msg("CA Secret already exists")
6363
} else {
@@ -71,10 +71,10 @@ func createClientAuthCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, s
7171

7272
// createClientAuthCertificateKeyfile creates a client authentication certificate for a specific user and stores
7373
// it in a secret with the given name.
74-
func createClientAuthCertificateKeyfile(log zerolog.Logger, cli v1.CoreV1Interface, commonName string, ttl time.Duration, spec api.SyncAuthenticationSpec, secretName, namespace string, ownerRef *metav1.OwnerReference) error {
74+
func createClientAuthCertificateKeyfile(log zerolog.Logger, secrets v1.SecretInterface, commonName string, ttl time.Duration, spec api.SyncAuthenticationSpec, secretName string, ownerRef *metav1.OwnerReference) error {
7575
log = log.With().Str("secret", secretName).Logger()
7676
// Load CA certificate
77-
caCert, caKey, _, err := k8sutil.GetCASecret(cli, spec.GetClientCASecretName(), namespace, nil)
77+
caCert, caKey, _, err := k8sutil.GetCASecret(secrets, spec.GetClientCASecretName(), nil)
7878
if err != nil {
7979
log.Debug().Err(err).Msg("Failed to load CA certificate")
8080
return maskAny(err)
@@ -100,7 +100,7 @@ func createClientAuthCertificateKeyfile(log zerolog.Logger, cli v1.CoreV1Interfa
100100
}
101101
keyfile := strings.TrimSpace(cert) + "\n" +
102102
strings.TrimSpace(priv)
103-
if err := k8sutil.CreateTLSKeyfileSecret(cli, secretName, namespace, keyfile, ownerRef); err != nil {
103+
if err := k8sutil.CreateTLSKeyfileSecret(secrets, secretName, keyfile, ownerRef); err != nil {
104104
if k8sutil.IsAlreadyExists(err) {
105105
log.Debug().Msg("Server Secret already exists")
106106
} else {

0 commit comments

Comments
 (0)