From c7404b95cd3604c21976e5e94d85000982fc9b13 Mon Sep 17 00:00:00 2001
From: Kevin W Monroe
Date: Sun, 24 Mar 2024 18:37:40 -0500
Subject: [PATCH] deployment errors if all config isn't provided at deploy time (#5)

* lint/license adjustments

* spelling corrections, example usage

* consistent status messages, formatting, use block vs waiting appropriately

* no need for duplicate cni-joined and cni-changed relations; ensure state gates are bools; no update-status unless configured; need image secret before tigera, and license after

* fix non-leader premature kubectl calls

- non-leader units should wait for the cni relation before we consider them 'configured' to ensure they'll have a valid kubeconfig available.
- waiting_for_cni in update_status is no longer needed since units won't be 'configured' until that check passes.
- k-c-p may be in the middle of restarting the apiserver when we attempt to query the tigera pod; handle that as if no pod is available. we'll try again on the next hook that calls update_status.

* address failing integration tests

- test against stable bundle
- ensure model debug and halt on hook failure
- try/catch creation of install / crd manifests independently
- let our status handler fail since we're in a retry loop (fail well)

* retry kubectl calls (up to 3m) with tenacity

---------

Co-authored-by: Mateo Florido <32885896+mateoflorido@users.noreply.github.com>
---
 LICENSE                                      |   2 +-
 config.yaml                                  |  17 +-
 pyproject.toml                               |   8 +-
 requirements.txt                             |   1 +
 src/charm.py                                 | 176 ++++++++++----------
 src/peer.py                                  |  10 +-
 tests/integration/setup.sh                   |   4 +-
 tests/integration/test_calico_enterprise.py  |   2 +-
 tests/unit/test_charm.py                     |   2 +-
 tests/unit/test_peer.py                      |   8 +-
 tox.ini                                      |  22 +--
 11 files changed, 127 insertions(+), 125 deletions(-)

diff --git a/LICENSE b/LICENSE
index bb22228..c4a371b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -187,7 +187,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2023 pguimaraes
+   Copyright 2024 Canonical Ltd.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/config.yaml b/config.yaml
index 266659d..ec4b450 100644
--- a/config.yaml
+++ b/config.yaml
@@ -18,7 +18,7 @@ options:
     default: ""
     type: string
     description: |
-      LicenseKey should be a file, appiled as a bas64 string
+      LicenseKey should be a file, applied as a base64 string
 
       file format:
       kind: LicenseKey
@@ -27,7 +27,8 @@
        certificate:
        token:
 
-      $ juju config calico-enterprise license="$(cat license | base64 -w0)"
+      example use:
+      juju config calico-enterprise license="$(cat license | base64 -w0)"
   addons:
     default: False
     type: boolean
@@ -66,7 +67,10 @@ options:
     default: ""
    type: string
    description: |
-      String in the format of "user:password" run through base64 encoding
+      Credentials in the format <user>:<password>, applied as a base64 string
+
+      example use:
+      juju config calico-enterprise image_registry_secret="$(echo user:password | base64 -w0)"
   tigera_version:
    default: 'distro'
    type: string
@@ -119,10 +123,9 @@
    description: |
      If set, BGP will be configured in the early stage (pre-k8s) and passed on to k8s.
      If the configuration is unset or has a string len=0, then bgp_parameters is considered
-      empty. This option can be configured as follows:
-      $ juju config tigera bgp_parameters="$(cat bgp_params.yaml)"
+      empty.
 
-      For example:
+      file format:
 
      - hostname:
        asn:
        stableAddress:
        rack:
        interfaces:
        - IP:
          peerIP:
          peerASN: <2>
      - ...
+      example use:
+      juju config calico-enterprise bgp_parameters="$(cat bgp_params.yaml)"
diff --git a/pyproject.toml b/pyproject.toml
index 3f51442..9f1ba88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,9 @@ target-version = ["py38"]
 # Linting tools configuration
 [tool.ruff]
 line-length = 99
-select = ["E", "W", "F", "C", "N", "D", "I001"]
+extend-exclude = ["__pycache__", "*.egg_info"]
+
+[tool.ruff.lint]
 extend-ignore = [
     "D203",
     "D204",
@@ -32,8 +34,8 @@ extend-ignore = [
     "D413",
 ]
 ignore = ["E501", "D107"]
-extend-exclude = ["__pycache__", "*.egg_info"]
 per-file-ignores = {"tests/*" = ["D100","D101","D102","D103","D104"]}
+select = ["E", "W", "F", "C", "N", "D", "I001"]
 
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
 max-complexity = 10
diff --git a/requirements.txt b/requirements.txt
index 4269f1a..3b26c33 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ ops >= 1.5.0
 git+https://github.com/charmed-kubernetes/conctl#egg=conctl
 Jinja2 < 3.1
 pydantic <=1.9.0 # Python 3.6 support requires <= 1.9.0
+tenacity
diff --git a/src/charm.py b/src/charm.py
index 6f7b96f..d19654a 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -1,16 +1,7 @@
 #!/usr/bin/env python3
-# Copyright 2023 pguimaraes
+# Copyright 2024 Canonical Ltd.
 # See LICENSE file for licensing details.
-#
-# Learn more at: https://juju.is/docs/sdk
-
-"""Charm the service.
-
-Refer to the following post for a quick-start guide that will help you
-develop a new k8s charm using the Operator Framework:
-
-https://discourse.charmhub.io/t/4208
-"""
+"""Dispatch logic for the calico-enterprise networking charm."""
 
 import binascii
 import ipaddress
@@ -22,7 +13,7 @@
 from base64 import b64decode
 from dataclasses import dataclass
 from subprocess import CalledProcessError, check_output
-from typing import Optional, Tuple
+from typing import Optional, Tuple, cast
 
 import yaml
 from jinja2 import Environment, FileSystemLoader
@@ -31,6 +22,7 @@
 from ops.main import main
 from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, StatusBase, WaitingStatus
 
 from peer import CalicoEnterprisePeer
+from tenacity import retry, stop_after_delay, wait_exponential
 
 VALID_LOG_LEVELS = ["info", "debug", "warning", "error", "critical"]
@@ -84,7 +76,6 @@ def __init__(self, *args):
         self.framework.observe(self.on.remove, self.on_remove)
         self.framework.observe(self.on.update_status, self.on_update_status)
         self.framework.observe(self.on.upgrade_charm, self.on_upgrade_charm)
-        self.framework.observe(self.on.cni_relation_joined, self.on_cni_relation_joined)
         self.framework.observe(self.on.cni_relation_changed, self.on_cni_relation_changed)
         self.framework.observe(self.on.start, self.on_config_changed)
         self.framework.observe(self.on.upgrade_charm, self.on_config_changed)
@@ -106,9 +97,18 @@ def __init__(self, *args):
     #######################
     ### Support Methods ###
     #######################
+    @retry(reraise=True, stop=stop_after_delay(180), wait=wait_exponential())
     def kubectl(self, *args):
-        """Kubectl command implementation."""
+        """Run a kubectl cli command with a config file.
+
+        Returns stdout or raises an error if the command fails.
+        """
+        # Retry this command for up to three minutes. The lifecycle of this charm is managed
+        # independently of our k8s api server; this means the k8s control plane may be doing
+        # something else (like restarting) when we invoke kubectl. Account for this with a
+        # sensible retry.
cmd = ["kubectl", "--kubeconfig", KUBECONFIG_PATH] + list(args) + log.info("Executing {}".format(cmd)) return check_output(cmd).decode("utf-8") def is_kubeconfig_available(self): @@ -166,7 +166,7 @@ def image_registry_secret(self) -> Tuple[Optional[RegistrySecret], str]: """Read Image Registry secret to username:password.""" value = self.model.config["image_registry_secret"] if not value: - return None, "Registry secret is required" + return None, "Missing required 'image_registry_secret' config" try: decoded = b64decode(value).decode() except (binascii.Error, UnicodeDecodeError): @@ -174,7 +174,7 @@ def image_registry_secret(self) -> Tuple[Optional[RegistrySecret], str]: decoded = value split = decoded.split(":") if len(split) != 2: - return None, "Registry secret isn't formatted as " + return None, "Registry secret isn't formatted as :" return RegistrySecret(*split), "" ####################### @@ -198,18 +198,18 @@ def preflight_checks(self): return False if not self.model.config["license"]: - self.unit.status = BlockedStatus("Missing license config") + self.unit.status = BlockedStatus("Missing required 'license' config") return False try: ipaddress.ip_network(self.model.config["pod_cidr"]) except ValueError: - self.unit.status = BlockedStatus("Pod-to-Pod configuration is not valid CIDR") + self.unit.status = BlockedStatus("'pod_cidr' config is not valid CIDR") return False try: ipaddress.ip_network(self.model.config["stable_ip_cidr"]) except ValueError: - self.unit.status = BlockedStatus("Stable IP network is not valid CIDR") + self.unit.status = BlockedStatus("'stable_ip_cidr' config is not valid CIDR") return False # TODO: Fix this logic check. # hostname_found = False @@ -257,34 +257,44 @@ def apply_tigera_operator(self): if not pathlib.Path(KUBECONFIG_PATH).exists(): self.unit.status = BlockedStatus("Waiting for Kubeconfig to become available") return False + installation_manifest = self.manifests / "tigera-operator.yaml" - crds_manifest = self.manifests / "custom-resources.yaml" try: self.kubectl("create", "-f", installation_manifest) + except CalledProcessError: + log.warning("Installation manifests failed - tigera operator may be incomplete.") + + crds_manifest = self.manifests / "custom-resources.yaml" + try: self.kubectl("create", "-f", crds_manifest) except CalledProcessError: - # TODO implement a check which checks for tigera resources - log.warning("Kubectl create failed - tigera operator may not be deployed.") - return True + log.warning("CRD manifests failed - tigera operator may be incomplete.") + + # TODO implement a check which checks for tigera resources + return True def tigera_operator_deployment_status(self) -> StatusBase: """Check if tigera operator is ready. returns error in the event of a failed state. 
""" - output = self.kubectl( - "get", - "pods", - "-l", - "k8s-app=tigera-operator", - "-n", - "tigera-operator", - "-o", - "jsonpath={.items}", - ) - pods = yaml.safe_load(output) + try: + output = self.kubectl( + "get", + "pods", + "-l", + "k8s-app=tigera-operator", + "-n", + "tigera-operator", + "-o", + "jsonpath={.items}", + ) + except CalledProcessError: + log.warning("Kubectl get pods failed - tigera operator may not be deployed.") + output = "" + pods = yaml.safe_load(output) if output else [] if len(pods) == 0: - return WaitingStatus("tigera-operator POD not found") + return WaitingStatus("tigera-operator POD not found yet") elif len(pods) > 1: return WaitingStatus(f"Too many tigera-operator PODs (num: {len(pods)})") status = pods[0]["status"] @@ -322,15 +332,19 @@ def check_tigera_status(self): Returns True if the tigera is ready. """ - output = self.kubectl( - "wait", - "-n", - "tigera-operator", - "--for=condition=ready", - "pod", - "-l", - "k8s-app=tigera-operator", - ) + try: + output = self.kubectl( + "wait", + "-n", + "tigera-operator", + "--for=condition=ready", + "pod", + "-l", + "k8s-app=tigera-operator", + ) + except CalledProcessError: + output = "" + return True if "met" in output else False def implement_early_network(self): @@ -353,18 +367,14 @@ def waiting_for_cni_relation(self): def pre_tigera_init_config(self): """Create required namespaces and label nodes before creating bgp_layout.""" if not pathlib.Path(KUBECONFIG_PATH).exists(): - self.unit.status = BlockedStatus("Waiting for Kubeconfig to become available") + self.unit.status = WaitingStatus("Waiting for Kubeconfig to become available") return False - - try: - self.kubectl("create", "ns", "tigera-operator") - self.kubectl("create", "ns", "calico-system") - except CalledProcessError: - pass if not self.peers.bgp_layout.nodes: - self.unit.status = WaitingStatus("bgp_parameters is required.") + self.unit.status = WaitingStatus("Waiting for BGP data from peers") return False + self.kubectl("create", "ns", "tigera-operator") + self.kubectl("create", "ns", "calico-system") for node in self.peers.bgp_layout.nodes: hostname = node.hostname if not hostname: @@ -415,7 +425,7 @@ def configure_bgp(self): def set_active_status(self): """Set active if cni is configured.""" - if self.stored.tigera_cni_configured: + if cast(bool, self.stored.tigera_cni_configured): self.unit.status = ActiveStatus() self.unit.set_workload_version(self.tigera_version) if self.unit.is_leader(): @@ -428,42 +438,31 @@ def set_active_status(self): def on_install(self, event): """Installation routine. - Execute the Early Network stage if requested. The config of Tigera itself depends on k8s deployed - and should be executed in the config-changed hook. + Execute the Early Network stage if requested. The config of Tigera itself depends on k8s + deployed and should be executed in the config-changed hook. """ if not self.model.config["disable_early_network"]: self.implement_early_network() def on_update_status(self, event): - """State machine of the charm. + """Update status. - 1) Only change the status if in active - 2) If kubeconfig and CNI relation exist + Unit must be in a configured state before status updates are made. 
""" - if isinstance(self.unit.status, ActiveStatus): - log.info(f"on_update_status: unit not in active status: {self.unit.status}") - if self.waiting_for_cni_relation(): - self.unit.status = WaitingStatus("Waiting for CNI relation") + if not cast(bool, self.stored.tigera_configured): + log.info("on_update_status: unit has not been configured yet; skipping status update.") return self.unit.status = self.tigera_operator_deployment_status() def on_cni_relation_changed(self, event): """Run CNI relation changed hook.""" - if not self.stored.tigera_configured: + if not cast(bool, self.stored.tigera_configured): self.on_config_changed(event) self.cni_to_calico_enterprise(event) self.configure_cni_relation() - self.set_active_status() - - def on_cni_relation_joined(self, event): - """Run CNI relation joined hook.""" - if not self.stored.tigera_configured: - self.on_config_changed(event) - self.cni_to_calico_enterprise(event) - self.configure_cni_relation() self.set_active_status() def on_remove(self, event): @@ -477,8 +476,8 @@ def on_upgrade_charm(self, event): def on_config_changed(self, event): # noqa C901, TODO: consider using reconciler """Config changed event processing. - The leader needs to know the BGP information about every node and only the leader should apply - the changes in the deployment. + The leader needs to know the BGP information about every node and only the leader should + apply the changes in the deployment. 1) Check if the CNI relation exists 2) Return if not leader 3) Apply tigera operator @@ -488,6 +487,11 @@ def on_config_changed(self, event): # noqa C901, TODO: consider using reconcile # TODO: Enters a defer loop # event.defer() return + + if self.waiting_for_cni_relation(): + self.unit.status = WaitingStatus("Waiting for CNI relation") + return + if not self.unit.is_leader(): # Only the leader should manage the operator setup log.info( @@ -502,17 +506,7 @@ def on_config_changed(self, event): # noqa C901, TODO: consider using reconcile # event.defer() return - if self.waiting_for_cni_relation(): - self.unit.status = WaitingStatus("Waiting for CNI relation") - return - - self.unit.status = MaintenanceStatus("Applying Tigera Operator") - if not self.apply_tigera_operator(): - # TODO: Enters a defer loop - # event.defer() - return - - self.unit.status = MaintenanceStatus("Configuring image secret and license file...") + self.unit.status = MaintenanceStatus("Configuring image secret") secret, err = self.image_registry_secret() if self.model.config["image_registry"] and secret: try: @@ -536,6 +530,14 @@ def on_config_changed(self, event): # noqa C901, TODO: consider using reconcile "-n", "tigera-operator", ) + + self.unit.status = MaintenanceStatus("Applying Tigera Operator") + if not self.apply_tigera_operator(): + # TODO: Enters a defer loop + # event.defer() + return + + self.unit.status = MaintenanceStatus("Configuring license") with tempfile.NamedTemporaryFile("w") as license: license.write(b64decode(self.model.config["license"]).rstrip().decode("utf-8")) license.flush() @@ -544,8 +546,6 @@ def on_config_changed(self, event): # noqa C901, TODO: consider using reconcile self.unit.status = MaintenanceStatus("Generating bgp yamls...") self.configure_bgp() - self.unit.status = MaintenanceStatus("Applying Installation CRD") - nic_autodetection = None if self.model.config["nic_autodetection_regex"]: if self.model.config["nic_autodetection_skip_interface"]: @@ -596,6 +596,10 @@ def on_config_changed(self, event): # noqa C901, TODO: consider using reconcile if 
self.check_tigera_status(): break time.sleep(24) + else: + self.unit.status = BlockedStatus("Tigera operator deployment failed: see debug logs") + return + self.unit.status = ActiveStatus("Node Configured") self.stored.tigera_configured = True diff --git a/src/peer.py b/src/peer.py index 3e7857b..6a29680 100644 --- a/src/peer.py +++ b/src/peer.py @@ -40,7 +40,7 @@ def _localhost_ips() -> List[ip_address]: class BGPPeer(BaseModel): - """Represents a host interface's bpg peer info.""" + """Represents a host interface's bgp peer info.""" ip: str = Field(alias="peerIP") asn: int = Field(alias="peerASNumber") @@ -61,7 +61,7 @@ def __hash__(self): class BGPLabels(BaseModel): - """Represents a host's bpg label info.""" + """Represents a host's bgp label info.""" rack: str @@ -74,7 +74,7 @@ class StableAddress(BaseModel): class BGPParameters(BaseModel): - """Represents a host's bpg label info.""" + """Represents a host's bgp label info.""" as_number: int = Field(alias="asNumber") interface_addresses: List[str] = Field(alias="interfaceAddresses") @@ -160,7 +160,7 @@ def __init__(self, parent: CharmBase, endpoint="calico-enterprise"): self.framework.observe(events.relation_joined, self.peer_change) self.framework.observe(events.relation_changed, self.peer_change) - def pubilsh_bgp_parameters(self): + def publish_bgp_parameters(self): """Publish bgp parameters to peer relation.""" if bgp_parameters := _early_service_cfg(): for relation in self.model.relations[self.endpoint]: @@ -171,7 +171,7 @@ def peer_change(self, event): """Respond to any changes in the peer data.""" if len(self._computed_bgp_layout(local_only=True).nodes) == 0: log.info(f"Sharing bgp params from {self.model.unit.name}") - self.pubilsh_bgp_parameters() + self.publish_bgp_parameters() self.on.bgp_parameters_changed.emit() def quorum_data(self, key: str) -> Optional[str]: diff --git a/tests/integration/setup.sh b/tests/integration/setup.sh index 9d0f175..ec94530 100755 --- a/tests/integration/setup.sh +++ b/tests/integration/setup.sh @@ -144,7 +144,7 @@ function juju_create_manual_model() { juju bootstrap manual-cloud fi - juju add-model ${JUJU_MODEL} --config="${MODEL_CONFIG}" + juju add-model ${JUJU_MODEL} --config="${MODEL_CONFIG}" --config logging-config='=DEBUG' --config automatically-retry-hooks=false juju add-space bgp juju add-space mgmt juju add-space tor-network @@ -199,4 +199,4 @@ if [ -z ${JUJU_MODEL} ]; then JUJU_MODEL="calico-enterprise" fi -$1 "${@:2}" \ No newline at end of file +$1 "${@:2}" diff --git a/tests/integration/test_calico_enterprise.py b/tests/integration/test_calico_enterprise.py index 7b46f57..222bcb7 100644 --- a/tests/integration/test_calico_enterprise.py +++ b/tests/integration/test_calico_enterprise.py @@ -41,7 +41,7 @@ async def test_build_and_deploy( charm = await ops_test.build_charm(".") overlays = [ - ops_test.Bundle("kubernetes-core", channel="edge"), + ops_test.Bundle("kubernetes-core", channel="stable"), Path("tests/data/charm.yaml"), ] diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 4be6ef3..1503aba 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -1,4 +1,4 @@ -# Copyright 2023 pguimaraes +# Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. 
 #
 # Learn more about testing at: https://juju.is/docs/sdk/testing
diff --git a/tests/unit/test_peer.py b/tests/unit/test_peer.py
index 21f4adf..a1d1305 100644
--- a/tests/unit/test_peer.py
+++ b/tests/unit/test_peer.py
@@ -1,4 +1,4 @@
-# Copyright 2023 pguimaraes
+# Copyright 2024 Canonical Ltd.
 # See LICENSE file for licensing details.
 #
 # Learn more about testing at: https://juju.is/docs/sdk/testing
@@ -159,7 +159,7 @@ def test_publish_bgp_parameters_no_service(harness, charm):
     harness.disable_hooks()
     with mock.patch("peer.CALICO_EARLY_SERVICE") as patched:
         patched.exists.return_value = False
-        charm.peers.pubilsh_bgp_parameters()
+        charm.peers.publish_bgp_parameters()
     for relation in charm.model.relations["calico-enterprise"]:
         assert relation.data[charm.model.unit].get("bgp-parameters") is None
 
@@ -178,7 +178,7 @@ def test_publish_bgp_parameters_invalid_calico_early(early_service, harness, cha
     elif failure == "Invalid Config":
         early_service.side_effect = ["CALICO_EARLY_NETWORKING=magic.yaml", "invalid"]
 
-    charm.peers.pubilsh_bgp_parameters()
+    charm.peers.publish_bgp_parameters()
     for relation in charm.model.relations["calico-enterprise"]:
         assert relation.data[charm.model.unit].get("bgp-parameters") is None
 
@@ -187,7 +187,7 @@ def test_publish_bgp_parameters_json_passthru(harness, charm):
     harness.disable_hooks()
     with mock.patch("peer._early_service_cfg") as patched:
         expected = patched.return_value.json.return_value = "expected"
-        charm.peers.pubilsh_bgp_parameters()
+        charm.peers.publish_bgp_parameters()
     for relation in charm.model.relations["calico-enterprise"]:
         assert relation.data[charm.model.unit]["bgp-parameters"] == expected
 
diff --git a/tox.ini b/tox.ini
index 855ea43..fc41065 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,3 @@
-# Copyright 2023 pguimaraes
-# See LICENSE file for licensing details.
-
 [tox]
 skipsdist=True
 skip_missing_interpreters = True
@@ -31,26 +28,19 @@ deps =
     black
     ruff
 commands =
-    black {[vars]all_path}
-    ruff --fix {[vars]all_path}
+    black -l 99 {[vars]all_path}
+    ruff check --fix {[vars]all_path}
 
 [testenv:lint]
 description = Check code against coding style standards
 deps =
     black
-    ruff
     codespell
+    ruff
 commands =
-    # uncomment the following line if this charm owns a lib
-    # codespell {[vars]lib_path}
-    codespell {toxinidir} \
-      --skip {toxinidir}/.git \
-      --skip {toxinidir}/.tox \
-      --skip {toxinidir}/upstream \
-      --skip {toxinidir}/icon.svg
-
-    ruff {[vars]all_path}
-    black --check --diff {[vars]all_path}
+    codespell {[vars]all_path}
+    black -l 99 --check --diff {[vars]all_path}
+    ruff check {[vars]all_path}
 
 [testenv:unit]
 description = Run unit tests
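
A note on the tenacity usage introduced above: the @retry(reraise=True, stop=stop_after_delay(180), wait=wait_exponential()) decorator added to kubectl() retries the wrapped call with exponential backoff for up to three minutes and then re-raises the final error. A minimal standalone sketch of that same pattern follows, for reference only; it is not part of the patch, and the KUBECONFIG_PATH value is a placeholder assumption rather than the path used by the charm.

    #!/usr/bin/env python3
    """Standalone sketch of the kubectl retry pattern used in this patch."""
    from subprocess import CalledProcessError, check_output

    from tenacity import retry, stop_after_delay, wait_exponential

    # Assumption for illustration only; the charm defines its own KUBECONFIG_PATH.
    KUBECONFIG_PATH = "/root/.kube/config"


    @retry(reraise=True, stop=stop_after_delay(180), wait=wait_exponential())
    def kubectl(*args):
        """Run kubectl, retrying with exponential backoff for up to 3 minutes."""
        cmd = ["kubectl", "--kubeconfig", KUBECONFIG_PATH] + list(args)
        return check_output(cmd).decode("utf-8")


    if __name__ == "__main__":
        try:
            print(kubectl("get", "nodes"))
        except CalledProcessError as err:
            # After 180s of failed attempts, the last error is re-raised (reraise=True).
            print(f"kubectl failed after retries: {err}")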