Commit dec8976

Merge pull request #362 from mshitrit/support-hcp-upgrade
Support hcp upgrade
2 parents: 4bfd53b + 3f18ff0

File tree

controllers/cluster/upgrade_checker.go
controllers/nodehealthcheck_controller.go
controllers/nodehealthcheck_controller_test.go
controllers/suite_test.go

4 files changed: +122, -38 lines


controllers/cluster/upgrade_checker.go

Lines changed: 28 additions & 7 deletions
@@ -2,11 +2,11 @@ package cluster
 
 import (
 	"context"
-	"errors"
 
 	"github.com/go-logr/logr"
 	gerrors "github.com/pkg/errors"
 
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -15,16 +15,19 @@ import (
 	clusterversion "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
 )
 
-var unsupportedUpgradeCheckerErr = errors.New(
-	"the cluster doesn't have any upgrade state representation." +
-		" Currently only OpenShift/OKD is supported")
+const (
+	// currentMachineConfigAnnotationKey is used to fetch current targetConfigVersionHash
+	currentMachineConfigAnnotationKey = "machineconfiguration.openshift.io/currentConfig"
+	// desiredMachineConfigAnnotationKey is used to indicate the version a node should be updating to
+	desiredMachineConfigAnnotationKey = "machineconfiguration.openshift.io/desiredConfig"
+)
 
 // UpgradeChecker checks if the cluster is currently under upgrade.
 // error should be thrown if it can't reliably determine if it's under upgrade or not.
 type UpgradeChecker interface {
 	// Check if the cluster is currently under upgrade.
 	// error should be thrown if it can't reliably determine if it's under upgrade or not.
-	Check() (bool, error)
+	Check(nodesToBeRemediated []corev1.Node) (bool, error)
 }
 
 type openshiftClusterUpgradeStatusChecker struct {
@@ -36,7 +39,11 @@ type openshiftClusterUpgradeStatusChecker struct {
 // force implementation of interface
 var _ UpgradeChecker = &openshiftClusterUpgradeStatusChecker{}
 
-func (o *openshiftClusterUpgradeStatusChecker) Check() (bool, error) {
+func (o *openshiftClusterUpgradeStatusChecker) Check(nodesToBeRemediated []corev1.Node) (bool, error) {
+	if o.isNodeUpgrading(nodesToBeRemediated) {
+		return true, nil
+	}
+
 	cvs, err := o.clusterVersionsClient.List(context.Background(), metav1.ListOptions{})
 	if err != nil {
 		return false, gerrors.Wrap(err, "failed to check for Openshift cluster upgrade status")
@@ -52,13 +59,27 @@ func (o *openshiftClusterUpgradeStatusChecker) Check() (bool, error) {
 	return false, nil
 }
 
+func (o *openshiftClusterUpgradeStatusChecker) isNodeUpgrading(nodesToBeRemediated []corev1.Node) bool {
+	// We can identify an HCP (Hosted Control Plane) cluster upgrade by checking annotations of the CP node.
+	for _, node := range nodesToBeRemediated {
+		if len(node.Annotations) == 0 || len(node.Annotations[desiredMachineConfigAnnotationKey]) == 0 {
+			continue
+		}
+
+		if node.Annotations[currentMachineConfigAnnotationKey] != node.Annotations[desiredMachineConfigAnnotationKey] {
+			return true
+		}
+	}
+	return false
+}
+
 type noopClusterUpgradeStatusChecker struct {
 }
 
 // force implementation of interface
 var _ UpgradeChecker = &noopClusterUpgradeStatusChecker{}
 
-func (n *noopClusterUpgradeStatusChecker) Check() (bool, error) {
+func (n *noopClusterUpgradeStatusChecker) Check([]corev1.Node) (bool, error) {
 	return false, nil
 }
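The heart of the change is visible above: the OpenShift checker now reports an upgrade in progress whenever a node queued for remediation carries a desiredConfig MachineConfig annotation that differs from its currentConfig one, which is how an HCP node looks mid-upgrade. A minimal, self-contained sketch of just that comparison (the rendered-v* hash values are invented placeholders, not real rendered-config names):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
	currentConfigKey = "machineconfiguration.openshift.io/currentConfig"
	desiredConfigKey = "machineconfiguration.openshift.io/desiredConfig"
)

// isNodeUpgrading mirrors the check added in this commit: a node counts as
// mid-upgrade when it has a desiredConfig annotation that differs from its
// currentConfig annotation; nodes without the annotations are skipped.
func isNodeUpgrading(nodes []corev1.Node) bool {
	for _, node := range nodes {
		if len(node.Annotations) == 0 || len(node.Annotations[desiredConfigKey]) == 0 {
			continue
		}
		if node.Annotations[currentConfigKey] != node.Annotations[desiredConfigKey] {
			return true
		}
	}
	return false
}

func main() {
	node := corev1.Node{ObjectMeta: metav1.ObjectMeta{
		Name: "worker-0",
		Annotations: map[string]string{
			currentConfigKey: "rendered-v1", // placeholder hash, not a real rendered config
			desiredConfigKey: "rendered-v2",
		},
	}}
	fmt.Println(isNodeUpgrading([]corev1.Node{node})) // prints "true": current != desired
}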

controllers/nodehealthcheck_controller.go

Lines changed: 3 additions & 3 deletions
@@ -249,7 +249,7 @@ func (r *NodeHealthCheckReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 	updateRequeueAfter(&result, requeueAfter)
 
 	// TODO consider setting Disabled condition?
-	if r.isClusterUpgrading() {
+	if r.isClusterUpgrading(matchingNodes) {
 		msg := "Postponing potential remediations because of ongoing cluster upgrade"
 		log.Info(msg)
 		commonevents.NormalEvent(r.Recorder, nhc, utils.EventReasonRemediationSkipped, msg)
@@ -388,8 +388,8 @@ func (r *NodeHealthCheckReconciler) Reconcile(ctx context.Context, req ctrl.Requ
 	return result, nil
 }
 
-func (r *NodeHealthCheckReconciler) isClusterUpgrading() bool {
-	clusterUpgrading, err := r.ClusterUpgradeStatusChecker.Check()
+func (r *NodeHealthCheckReconciler) isClusterUpgrading(nodesToBeRemediated []v1.Node) bool {
+	clusterUpgrading, err := r.ClusterUpgradeStatusChecker.Check(nodesToBeRemediated)
 	if err != nil {
 		// if we can't reliably tell if the cluster is upgrading then just continue with remediation.
 		// TODO finer error handling may help to decide otherwise here.
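The reconciler now threads the matched nodes into the checker; when an upgrade is detected it skips remediation, emits an event, and requeues. A standalone sketch of that decision flow, using a toy checker and a plain (bool, time.Duration) pair in place of ctrl.Result:

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
)

// UpgradeChecker matches the interface as updated by this commit.
type UpgradeChecker interface {
	Check(nodesToBeRemediated []corev1.Node) (bool, error)
}

// alwaysUpgrading is a toy checker for this sketch only.
type alwaysUpgrading struct{}

func (alwaysUpgrading) Check([]corev1.Node) (bool, error) { return true, nil }

// shouldPostpone mirrors the controller's policy: if the check errs, proceed
// with remediation anyway; if an upgrade is ongoing, postpone and requeue.
func shouldPostpone(checker UpgradeChecker, nodes []corev1.Node, requeueAfter time.Duration) (bool, time.Duration) {
	upgrading, err := checker.Check(nodes)
	if err != nil {
		// can't reliably tell whether the cluster is upgrading; remediate anyway
		return false, 0
	}
	if upgrading {
		return true, requeueAfter
	}
	return false, 0
}

func main() {
	postpone, after := shouldPostpone(alwaysUpgrading{}, nil, 5*time.Second)
	fmt.Println(postpone, after) // prints "true 5s"
}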

controllers/nodehealthcheck_controller_test.go

Lines changed: 79 additions & 22 deletions
@@ -1689,38 +1689,95 @@ var _ = Describe("Node Health Check CR", func() {
 	When("Nodes are candidates for remediation and cluster is upgrading", func() {
 		BeforeEach(func() {
 			clusterUpgradeRequeueAfter = 5 * time.Second
-			upgradeChecker.Upgrading = true
 			setupObjects(1, 2, true)
 		})
+		When("non HCP Upgrade", func() {
+			BeforeEach(func() {
+				upgradeChecker.Upgrading = true
+			})
 
-		AfterEach(func() {
-			upgradeChecker.Upgrading = false
-		})
+			AfterEach(func() {
+				upgradeChecker.Upgrading = false
+			})
 
-		It("doesn't not remediate but requeues reconciliation and updates status", func() {
-			cr := findRemediationCRForNHC(unhealthyNodeName, underTest)
-			Expect(cr).To(BeNil())
+			It("doesn't not remediate but requeues reconciliation and updates status", func() {
+				cr := findRemediationCRForNHC(unhealthyNodeName, underTest)
+				Expect(cr).To(BeNil())
 
-			Expect(*underTest.Status.HealthyNodes).To(Equal(0))
-			Expect(*underTest.Status.ObservedNodes).To(Equal(0))
-			Expect(underTest.Status.UnhealthyNodes).To(BeEmpty())
-			Expect(underTest.Status.Phase).To(Equal(v1alpha1.PhaseEnabled))
-			Expect(underTest.Status.Reason).ToNot(BeEmpty())
+				Expect(*underTest.Status.HealthyNodes).To(Equal(0))
+				Expect(*underTest.Status.ObservedNodes).To(Equal(0))
+				Expect(underTest.Status.UnhealthyNodes).To(BeEmpty())
+				Expect(underTest.Status.Phase).To(Equal(v1alpha1.PhaseEnabled))
+				Expect(underTest.Status.Reason).ToNot(BeEmpty())
 
-			By("stopping upgrade and waiting for requeue")
-			upgradeChecker.Upgrading = false
-			time.Sleep(10 * time.Second)
-			cr = findRemediationCRForNHC(unhealthyNodeName, underTest)
-			Expect(cr).ToNot(BeNil())
+				By("stopping upgrade and waiting for requeue")
+				upgradeChecker.Upgrading = false
+				time.Sleep(10 * time.Second)
+				cr = findRemediationCRForNHC(unhealthyNodeName, underTest)
+				Expect(cr).ToNot(BeNil())
+
+				Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
+				Expect(*underTest.Status.HealthyNodes).To(Equal(2))
+				Expect(*underTest.Status.ObservedNodes).To(Equal(3))
+				Expect(underTest.Status.UnhealthyNodes).To(HaveLen(1))
+			})
 
-			Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
-			Expect(*underTest.Status.HealthyNodes).To(Equal(2))
-			Expect(*underTest.Status.ObservedNodes).To(Equal(3))
-			Expect(underTest.Status.UnhealthyNodes).To(HaveLen(1))
 		})
+		When("HCP Upgrade", func() {
+			var currentMachineConfigAnnotationKey = "machineconfiguration.openshift.io/currentConfig"
+			var desiredMachineConfigAnnotationKey = "machineconfiguration.openshift.io/desiredConfig"
+			BeforeEach(func() {
+				//Use a real upgrade checker instead of mock
+				prevUpgradeChecker := nhcReconciler.ClusterUpgradeStatusChecker
+				nhcReconciler.ClusterUpgradeStatusChecker = ocpUpgradeChecker
+				DeferCleanup(func() {
+					nhcReconciler.ClusterUpgradeStatusChecker = prevUpgradeChecker
+				})
 
-	})
+				//Simulate HCP Upgrade on the unhealthy node
+				upgradingNode := objects[0]
+				upgradingNodeAnnotations := map[string]string{}
+				if upgradingNode.GetAnnotations() != nil {
+					upgradingNodeAnnotations = upgradingNode.GetAnnotations()
+				}
+				upgradingNodeAnnotations[currentMachineConfigAnnotationKey] = "fakeVersion1"
+				upgradingNodeAnnotations[desiredMachineConfigAnnotationKey] = "fakeVersion2"
+				upgradingNode.SetAnnotations(upgradingNodeAnnotations)
+
+			})
+
+			It("doesn't not remediate but requeues reconciliation and updates status", func() {
+
+				cr := findRemediationCRForNHC(unhealthyNodeName, underTest)
+				Expect(cr).To(BeNil())
+
+				Expect(*underTest.Status.HealthyNodes).To(Equal(0))
+				Expect(*underTest.Status.ObservedNodes).To(Equal(0))
+				Expect(underTest.Status.UnhealthyNodes).To(BeEmpty())
+				Expect(underTest.Status.Phase).To(Equal(v1alpha1.PhaseEnabled))
+				Expect(underTest.Status.Reason).ToNot(BeEmpty())
 
+				By("stopping upgrade and waiting for requeue")
+				unhealthyNode := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: unhealthyNodeName}}
+				Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(unhealthyNode), unhealthyNode)).To(Succeed())
+				if unhealthyNode.GetAnnotations()[currentMachineConfigAnnotationKey] != unhealthyNode.GetAnnotations()[desiredMachineConfigAnnotationKey] {
+					// Simulating upgrade complete.
+					unhealthyNode.Annotations[currentMachineConfigAnnotationKey] = unhealthyNode.GetAnnotations()[desiredMachineConfigAnnotationKey]
+					Expect(k8sClient.Update(context.TODO(), unhealthyNode)).To(Succeed())
+				}
+
+				time.Sleep(10 * time.Second)
+				cr = findRemediationCRForNHC(unhealthyNodeName, underTest)
+				Expect(cr).ToNot(BeNil())
+
+				Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
+				Expect(*underTest.Status.HealthyNodes).To(Equal(2))
+				Expect(*underTest.Status.ObservedNodes).To(Equal(3))
+				Expect(underTest.Status.UnhealthyNodes).To(HaveLen(1))
+			})
+
+		})
+	})
 	Context("Machine owners", func() {
 		When("Metal3RemediationTemplate is in correct namespace", func() {
17261783

controllers/suite_test.go

Lines changed: 12 additions & 6 deletions
@@ -120,9 +120,13 @@ var (
 	ctx    context.Context
 	cancel context.CancelFunc
 
-	upgradeChecker *fakeClusterUpgradeChecker
-	fakeTime       *time.Time
-	fakeRecorder   *record.FakeRecorder
+	upgradeChecker    *fakeClusterUpgradeChecker
+	ocpUpgradeChecker cluster.UpgradeChecker
+
+	fakeTime     *time.Time
+	fakeRecorder *record.FakeRecorder
+
+	nhcReconciler *NodeHealthCheckReconciler
 )
 
 func TestAPIs(t *testing.T) {
@@ -238,15 +242,17 @@ var _ = BeforeSuite(func() {
 
 	mhcEvents := make(chan event.GenericEvent)
 	fakeRecorder = record.NewFakeRecorder(1000)
-	err = (&NodeHealthCheckReconciler{
+	ocpUpgradeChecker, _ = cluster.NewClusterUpgradeStatusChecker(k8sManager, cluster.Capabilities{IsOnOpenshift: true})
+	nhcReconciler = &NodeHealthCheckReconciler{
 		Client:                      k8sManager.GetClient(),
 		Log:                         k8sManager.GetLogger().WithName("test reconciler"),
 		Recorder:                    fakeRecorder,
 		ClusterUpgradeStatusChecker: upgradeChecker,
 		MHCChecker:                  mhcChecker,
 		MHCEvents:                   mhcEvents,
 		Capabilities:                caps,
-	}).SetupWithManager(k8sManager)
+	}
+	err = nhcReconciler.SetupWithManager(k8sManager)
 	Expect(err).NotTo(HaveOccurred())
 
 	err = (&MachineHealthCheckReconciler{
@@ -286,7 +292,7 @@ type fakeClusterUpgradeChecker struct {
 // force implementation of interface
 var _ cluster.UpgradeChecker = &fakeClusterUpgradeChecker{}
 
-func (c *fakeClusterUpgradeChecker) Check() (bool, error) {
+func (c *fakeClusterUpgradeChecker) Check([]v1.Node) (bool, error) {
 	return c.Upgrading, c.Err
 }