From 967804228179d51db7af45ea03926d8278723e61 Mon Sep 17 00:00:00 2001
From: Andreas Hellander
Date: Mon, 11 Mar 2024 12:58:52 +0100
Subject: [PATCH 01/48] Added load test based on shuffling numpy arrays

---
 .../docker-compose.override.yaml              |  4 +-
 examples/load-test/.gitignore                 |  6 ++
 examples/load-test/README.md                  | 56 +++++++++++
 examples/load-test/client/entrypoint          | 98 +++++++++++++++++++
 examples/load-test/client/fedn.yaml           |  5 +
 .../load-test/docker-compose.override.yaml    | 15 +++
 examples/load-test/init_fedn.py               |  8 ++
 examples/load-test/requirements.txt           |  1 +
 examples/load-test/run_clients.py             | 59 +++++++++++
 9 files changed, 250 insertions(+), 2 deletions(-)
 create mode 100644 examples/load-test/.gitignore
 create mode 100644 examples/load-test/README.md
 create mode 100644 examples/load-test/client/entrypoint
 create mode 100644 examples/load-test/client/fedn.yaml
 create mode 100644 examples/load-test/docker-compose.override.yaml
 create mode 100644 examples/load-test/init_fedn.py
 create mode 100644 examples/load-test/requirements.txt
 create mode 100644 examples/load-test/run_clients.py

diff --git a/examples/async-simulation/docker-compose.override.yaml b/examples/async-simulation/docker-compose.override.yaml
index 61034ce69..efb97345a 100644
--- a/examples/async-simulation/docker-compose.override.yaml
+++ b/examples/async-simulation/docker-compose.override.yaml
@@ -6,10 +6,10 @@ services:
   client:
     build:
       args:
-        REQUIREMENTS: examples/async-simulation/requirements.txt
+        REQUIREMENTS: examples/load-test/requirements.txt
     deploy:
       replicas: 2
     volumes:
       - ${HOST_REPO_DIR:-.}/fedn:/app/fedn
-      - ${HOST_REPO_DIR:-.}/examples/async-simulation/data:/var/data
+      - ${HOST_REPO_DIR:-.}/examples/load-test/data:/var/data
       - /var/run/docker.sock:/var/run/docker.sock
diff --git a/examples/load-test/.gitignore b/examples/load-test/.gitignore
new file mode 100644
index 000000000..4ab9fa59f
--- /dev/null
+++ b/examples/load-test/.gitignore
@@ -0,0 +1,6 @@
+data
+*.npz
+*.tgz
+*.tar.gz
+.async-simulation
+client.yaml
\ No newline at end of file
diff --git a/examples/load-test/README.md b/examples/load-test/README.md
new file mode 100644
index 000000000..dbc6c0103
--- /dev/null
+++ b/examples/load-test/README.md
@@ -0,0 +1,56 @@
+# LOAD TEST
+This example can be used as a load test for FEDn.
+
+No actual machine learning is being done - the clients generate a
+random array of a configurable size. In this way a developer can
+test the performance / scalability of a given FEDn network in a flexible
+way simply by shuffling around and aggregating numeric arrays.
+
+## Prerequisites
+- [Python 3.8, 3.9 or 3.10](https://www.python.org/downloads)
+- [Docker](https://docs.docker.com/get-docker)
+- [Docker Compose](https://docs.docker.com/compose/install)
+
+## Running the example (pseudo-distributed, single host)
+
+Clone FEDn and navigate into this directory.
+```sh
+git clone https://github.com/scaleoutsystems/fedn.git
+cd fedn/examples/load-test
+```
+
+### Preparing the environment, the local data, the compute package and seed model
+
+Install FEDn:
+```
+pip install fedn
+```
+
+Standing in `examples/load-test`, install the requirements:
+```
+pip install -r requirements.txt
+```
+
+Create the compute package and a seed model that you will be asked to upload in the next step.
+``` +tar -czvf package.tgz client +``` + +``` +python client/entrypoint init_seed +``` + +### Deploy FEDn and two clients +docker-compose -f ../../docker-compose.yaml -f docker-compose.override.yaml up + +### Initialize the FEDn network +Edit 'init_fedn.py' to configure the FEDn host (controller) to connect to, then +``` +python init_fedn.py +``` + +Launch clients +> **Note**: run with `--scale client=N` to start *N* clients. + +## Clean up +You can clean up by running `docker-compose down -v`. diff --git a/examples/load-test/client/entrypoint b/examples/load-test/client/entrypoint new file mode 100644 index 000000000..dd2216fc0 --- /dev/null +++ b/examples/load-test/client/entrypoint @@ -0,0 +1,98 @@ +# /bin/python +import time + +import fire +import numpy as np + +from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics + +HELPER_MODULE = 'numpyhelper' +ARRAY_SIZE = 1000000 + + +def save_model(weights, out_path): + """ Save model to disk. + + :param model: The model to save. + :type model: torch.nn.Module + :param out_path: The path to save to. + :type out_path: str + """ + helper = get_helper(HELPER_MODULE) + helper.save(weights, out_path) + + +def load_model(model_path): + """ Load model from disk. + + param model_path: The path to load from. + :type model_path: str + :return: The loaded model. + :rtype: torch.nn.Module + """ + helper = get_helper(HELPER_MODULE) + weights = helper.load(model_path) + return weights + + +def init_seed(out_path='seed.npz'): + """ Initialize seed model. + + :param out_path: The path to save the seed model to. + :type out_path: str + """ + # Init and save + weights = [np.random.rand(1, ARRAY_SIZE)] + save_model(weights, out_path) + + +def train(in_model_path, out_model_path): + """ Train model. + + """ + + # Load model + weights = load_model(in_model_path) + + # Train + time.sleep(np.random.randint(4, 15)) + + # Metadata needed for aggregation server side + metadata = { + 'num_examples': ARRAY_SIZE, + } + + # Save JSON metadata file + save_metadata(metadata, out_model_path) + + # Save model update + save_model(weights, out_model_path) + + +def validate(in_model_path, out_json_path): + """ Validate model. + + :param in_model_path: The path to the input model. + :type in_model_path: str + :param out_json_path: The path to save the output JSON to. + :type out_json_path: str + :param data_path: The path to the data file. 
+ :type data_path: str + """ + weights = load_model(in_model_path) + + # JSON schema + report = { + "mean": np.mean(weights), + } + + # Save JSON + save_metrics(report, out_json_path) + + +if __name__ == '__main__': + fire.Fire({ + 'init_seed': init_seed, + 'train': train, + 'validate': validate + }) diff --git a/examples/load-test/client/fedn.yaml b/examples/load-test/client/fedn.yaml new file mode 100644 index 000000000..68cb70cef --- /dev/null +++ b/examples/load-test/client/fedn.yaml @@ -0,0 +1,5 @@ +entry_points: + train: + command: /venv/bin/python entrypoint train $ENTRYPOINT_OPTS + validate: + command: /venv/bin/python entrypoint validate $ENTRYPOINT_OPTS \ No newline at end of file diff --git a/examples/load-test/docker-compose.override.yaml b/examples/load-test/docker-compose.override.yaml new file mode 100644 index 000000000..61034ce69 --- /dev/null +++ b/examples/load-test/docker-compose.override.yaml @@ -0,0 +1,15 @@ +# Compose schema version +version: '3.3' + +# Overriding requirements +services: + client: + build: + args: + REQUIREMENTS: examples/async-simulation/requirements.txt + deploy: + replicas: 2 + volumes: + - ${HOST_REPO_DIR:-.}/fedn:/app/fedn + - ${HOST_REPO_DIR:-.}/examples/async-simulation/data:/var/data + - /var/run/docker.sock:/var/run/docker.sock diff --git a/examples/load-test/init_fedn.py b/examples/load-test/init_fedn.py new file mode 100644 index 000000000..23078fcd9 --- /dev/null +++ b/examples/load-test/init_fedn.py @@ -0,0 +1,8 @@ +from fedn import APIClient + +DISCOVER_HOST = '127.0.0.1' +DISCOVER_PORT = 8092 + +client = APIClient(DISCOVER_HOST, DISCOVER_PORT) +client.set_package('package.tgz', 'numpyhelper') +client.set_initial_model('seed.npz') diff --git a/examples/load-test/requirements.txt b/examples/load-test/requirements.txt new file mode 100644 index 000000000..c6bceff1d --- /dev/null +++ b/examples/load-test/requirements.txt @@ -0,0 +1 @@ +fire==0.3.1 \ No newline at end of file diff --git a/examples/load-test/run_clients.py b/examples/load-test/run_clients.py new file mode 100644 index 000000000..780360b7b --- /dev/null +++ b/examples/load-test/run_clients.py @@ -0,0 +1,59 @@ +"""This scripts starts N_CLIENTS using the SDK. 
+
+If you are running with a local deploy of FEDn
+using docker compose, you need to make sure that clients
+are able to resolve the name "combiner" to 127.0.0.1
+
+One way to accomplish this is to edit your /etc/hosts,
+adding the line:
+
+127.0.0.1 combiner
+
+"""
+
+
+import copy
+import time
+
+from fedn import APIClient
+from fedn.network.clients.client import Client
+
+DISCOVER_HOST = '127.0.0.1'
+DISCOVER_PORT = 8092
+N_CLIENTS = 5
+CLIENTS_AVAILABLE_TIME = 120
+
+config = {'discover_host': DISCOVER_HOST, 'discover_port': DISCOVER_PORT, 'token': None, 'name': 'testclient',
+          'client_id': 1, 'remote_compute_context': True, 'force_ssl': False, 'dry_run': False, 'secure': False,
+          'preshared_cert': False, 'verify': False, 'preferred_combiner': False,
+          'validator': True, 'trainer': True, 'init': None, 'logfile': 'test.log', 'heartbeat_interval': 2,
+          'reconnect_after_missed_heartbeat': 30}
+
+if __name__ == '__main__':
+
+    # Start up N_CLIENTS clients
+    clients = []
+    for i in range(N_CLIENTS):
+        config_i = copy.deepcopy(config)
+        config_i['name'] = 'client{}'.format(i)
+        clients.append(Client(config_i))
+
+    # Run a session
+    client = APIClient(DISCOVER_HOST, DISCOVER_PORT)
+
+    session_config_fedavg = {
+        "helper": "numpyhelper",
+        "session_id": str(uuid.uuid4()),
+        "aggregator": "fedavg",
+        "round_timeout": 30,
+        "rounds": 5,
+    }
+
+    result_fedavg = client.start_session(**session_config_fedavg)
+    while not client.session_is_finished(session_id):
+        time.sleep(2)
+
+    # Disconnect clients
+    time.sleep(CLIENTS_AVAILABLE_TIME)
+    for client in clients:
+        client.detach()

From 52e4b8840cd53978af7c4114e658aa2a31bcb1f5 Mon Sep 17 00:00:00 2001
From: Andreas Hellander
Date: Mon, 11 Mar 2024 13:01:32 +0100
Subject: [PATCH 02/48] Added load test based on shuffling numpy arrays

---
 examples/load-test/README.md      | 13 +++++--------
 examples/load-test/run_clients.py |  4 ++--
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/examples/load-test/README.md b/examples/load-test/README.md
index dbc6c0103..20ec0ba77 100644
--- a/examples/load-test/README.md
+++ b/examples/load-test/README.md
@@ -40,17 +40,14 @@ tar -czvf package.tgz client
 ```
 python client/entrypoint init_seed
 ```
 
-### Deploy FEDn and two clients
-docker-compose -f ../../docker-compose.yaml -f docker-compose.override.yaml up
-
-### Initialize the FEDn network
+### Initialize the FEDn network and run an experiment
 Edit 'init_fedn.py' to configure the FEDn host (controller) to connect to, then
 ```
 python init_fedn.py
 ```
 
-Launch clients
-> **Note**: run with `--scale client=N` to start *N* clients.
+Launch clients and run a training session/experiment:
 
-## Clean up
-You can clean up by running `docker-compose down -v`.
+``` +python run_clients.py +``` diff --git a/examples/load-test/run_clients.py b/examples/load-test/run_clients.py index 780360b7b..a1db333f0 100644 --- a/examples/load-test/run_clients.py +++ b/examples/load-test/run_clients.py @@ -50,8 +50,8 @@ } result_fedavg = client.start_session(**session_config_fedavg) - while not client.session_is_finished(session_id): - time.sleep(2) + while not client.session_is_finished(session_config_fedavg['session_id']): + time.sleep(1) # Disconnect clients time.sleep(CLIENTS_AVAILABLE_TIME) From 58a5e1154aa5034b21abff72e617437ba6f06747 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 11 Mar 2024 13:13:21 +0100 Subject: [PATCH 03/48] complete example --- examples/async-simulation/requirements.txt | 3 ++- examples/load-test/.gitignore | 1 - examples/load-test/docker-compose.override.yaml | 15 --------------- examples/load-test/run_clients.py | 7 +++---- 4 files changed, 5 insertions(+), 21 deletions(-) delete mode 100644 examples/load-test/docker-compose.override.yaml diff --git a/examples/async-simulation/requirements.txt b/examples/async-simulation/requirements.txt index c6bceff1d..890d084ef 100644 --- a/examples/async-simulation/requirements.txt +++ b/examples/async-simulation/requirements.txt @@ -1 +1,2 @@ -fire==0.3.1 \ No newline at end of file +fire==0.3.1 +numpy \ No newline at end of file diff --git a/examples/load-test/.gitignore b/examples/load-test/.gitignore index 4ab9fa59f..3442476f9 100644 --- a/examples/load-test/.gitignore +++ b/examples/load-test/.gitignore @@ -2,5 +2,4 @@ data *.npz *.tgz *.tar.gz -.async-simulation client.yaml \ No newline at end of file diff --git a/examples/load-test/docker-compose.override.yaml b/examples/load-test/docker-compose.override.yaml deleted file mode 100644 index 61034ce69..000000000 --- a/examples/load-test/docker-compose.override.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Compose schema version -version: '3.3' - -# Overriding requirements -services: - client: - build: - args: - REQUIREMENTS: examples/async-simulation/requirements.txt - deploy: - replicas: 2 - volumes: - - ${HOST_REPO_DIR:-.}/fedn:/app/fedn - - ${HOST_REPO_DIR:-.}/examples/async-simulation/data:/var/data - - /var/run/docker.sock:/var/run/docker.sock diff --git a/examples/load-test/run_clients.py b/examples/load-test/run_clients.py index a1db333f0..3c9fca28a 100644 --- a/examples/load-test/run_clients.py +++ b/examples/load-test/run_clients.py @@ -20,8 +20,7 @@ DISCOVER_HOST = '127.0.0.1' DISCOVER_PORT = 8092 -N_CLIENTS = 5 -CLIENTS_AVAILABLE_TIME = 120 +N_CLIENTS = 3 config = {'discover_host': DISCOVER_HOST, 'discover_port': DISCOVER_PORT, 'token': None, 'name': 'testclient', 'client_id': 1, 'remote_compute_context': True, 'force_ssl': False, 'dry_run': False, 'secure': False, @@ -53,7 +52,7 @@ while not client.session_is_finished(session_config_fedavg['session_id']): time.sleep(1) - # Disconnect clients - time.sleep(CLIENTS_AVAILABLE_TIME) + print("Session ID: ", session_config_fedavg['session_id']) + for client in clients: client.detach() From 4e9dd7c2b5590d470a2d73022c277e295922bde1 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 25 Mar 2024 08:40:03 +0100 Subject: [PATCH 04/48] Code checks --- examples/load-test/README.md | 2 ++ examples/load-test/run_clients.py | 1 + 2 files changed, 3 insertions(+) diff --git a/examples/load-test/README.md b/examples/load-test/README.md index 20ec0ba77..f941cf461 100644 --- a/examples/load-test/README.md +++ b/examples/load-test/README.md @@ -21,6 +21,8 @@ cd fedn/examples/load-test 
### Preparing the environment, the local data, the compute package and seed model +We recommend that you use a virtual environment. + Install FEDn: ``` pip install fedn diff --git a/examples/load-test/run_clients.py b/examples/load-test/run_clients.py index 3c9fca28a..cb7ca1222 100644 --- a/examples/load-test/run_clients.py +++ b/examples/load-test/run_clients.py @@ -14,6 +14,7 @@ import copy import time +import uuid from fedn import APIClient from fedn.network.clients.client import Client From 8905724ffbf9be269f9417522b7b3ef124debe1b Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 4 Dec 2024 16:01:05 +0100 Subject: [PATCH 05/48] Script to save seed model to file --- examples/cifar100/init_seed.py | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 examples/cifar100/init_seed.py diff --git a/examples/cifar100/init_seed.py b/examples/cifar100/init_seed.py new file mode 100644 index 000000000..7fa67f2cf --- /dev/null +++ b/examples/cifar100/init_seed.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn +import torchvision.models as models + +from fedn.utils.helpers.helpers import get_helper + +HELPER_MODULE = "numpyhelper" +helper = get_helper(HELPER_MODULE) + + +# Function to replace BatchNorm layers with GroupNorm +def replace_bn_with_gn(module, num_groups=32): + for name, child in module.named_children(): + if isinstance(child, nn.BatchNorm2d): + num_channels = child.num_features + setattr(module, name, nn.GroupNorm(num_groups=num_groups, num_channels=num_channels)) + else: + replace_bn_with_gn(child, num_groups) # Apply recursively to nested modules + + +def compile_model(): + # Load ResNet-18 and replace BatchNorm with GroupNorm + resnet18 = models.resnet18(weights=None) + replace_bn_with_gn(resnet18) + # Modify final layer for CIFAR-100 (100 classes) + resnet18.fc = nn.Linear(512, 100) + return resnet18 + + +def save_parameters(model, out_path): + parameters_np = [val.cpu().numpy() for _, val in model.state_dict().items()] + helper.save(parameters_np, out_path) + + +def init_seed(out_path="seed.npz"): + model = compile_model() + save_parameters(model, out_path) + + +if __name__ == "__main__": + init_seed("seed.npz") From 74034f1d5cedc327a741661670431a434649f9f7 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 4 Dec 2024 16:02:19 +0100 Subject: [PATCH 06/48] Script to split dataset across clients --- examples/cifar100/data.py | 292 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 examples/cifar100/data.py diff --git a/examples/cifar100/data.py b/examples/cifar100/data.py new file mode 100644 index 000000000..597e4845b --- /dev/null +++ b/examples/cifar100/data.py @@ -0,0 +1,292 @@ +import os +import pickle +from typing import List, Tuple + +import numpy as np +import torch +import torchvision.transforms as transforms +from scipy.stats import dirichlet +from torch.utils.data import DataLoader, Dataset, Subset +from torchvision import datasets, transforms + +# Set a fixed random seed for reproducibility +RANDOM_SEED = 42 +np.random.seed(RANDOM_SEED) +# testloader = DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) + + +def fine_to_coarse_labels(fine_labels: np.ndarray) -> np.ndarray: + coarse = np.array( + [ + 4, + 1, + 14, + 8, + 0, + 6, + 7, + 7, + 18, + 3, + 3, + 14, + 9, + 18, + 7, + 11, + 3, + 9, + 7, + 11, + 6, + 11, + 5, + 10, + 7, + 6, + 13, + 15, + 3, + 15, + 0, + 11, + 1, + 10, + 12, + 14, + 16, + 9, + 11, + 5, + 5, + 19, + 8, + 8, + 15, + 13, + 14, + 17, + 
18, + 10, + 16, + 4, + 17, + 4, + 2, + 0, + 17, + 4, + 18, + 17, + 10, + 3, + 2, + 12, + 12, + 16, + 12, + 1, + 9, + 19, + 2, + 10, + 0, + 1, + 16, + 12, + 9, + 13, + 15, + 13, + 16, + 19, + 2, + 4, + 6, + 19, + 5, + 5, + 8, + 19, + 18, + 1, + 2, + 15, + 6, + 0, + 17, + 8, + 14, + 13, + ] + ) + return coarse[fine_labels] + + +class CIFAR100Federated: + def __init__(self, root_dir: str = "./data/splits"): + """Initialize the splitter + :param root_dir: Directory to save the split datasets + """ + self.root_dir = root_dir + self.splits = {} + os.makedirs(root_dir, exist_ok=True) + + # Load the full dataset + self.transform_train = transforms.Compose( + [ + transforms.RandomCrop(24), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)), + ] + ) + self.trainset = datasets.CIFAR100(root="./data", train=True, download=True, transform=self.transform_train) + + self.transform_test = transforms.Compose( + [transforms.CenterCrop(24), transforms.ToTensor(), transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))] + ) + self.testset = datasets.CIFAR100(root="./data", train=False, download=True, transform=self.transform_test) + + def create_splits(self, num_splits: int, balanced: bool, iid: bool) -> None: + """Create dataset splits based on specified parameters + :param num_splits: Number of splits to create + :param balanced: Whether splits should have equal size + :param iid: Whether splits should be IID + """ + config_key = f"splits_{num_splits}_bal_{balanced}_iid_{iid}" + + if iid: + indices = self._create_iid_splits(num_splits, balanced) + else: + indices = self._create_non_iid_splits(num_splits, balanced) + + # Save splits + for i, split_indices in enumerate(indices): + split_path = os.path.join(self.root_dir, f"{config_key}_split_{i}.pkl") + with open(split_path, "wb") as f: + pickle.dump(split_indices, f) + + self.splits[config_key] = indices + + def _create_iid_splits(self, num_splits: int, balanced: bool) -> List[np.ndarray]: + """Create IID splits of the dataset""" + indices = np.arange(len(self.trainset)) + np.random.shuffle(indices) + + if balanced: + # Equal size splits + split_size = len(indices) // num_splits + return [indices[i * split_size : (i + 1) * split_size] for i in range(num_splits)] + else: + # Random size splits + split_points = sorted(np.random.choice(len(indices) - 1, num_splits - 1, replace=False)) + return np.split(indices, split_points) + + def _create_non_iid_splits(self, num_splits: int, balanced: bool) -> List[np.ndarray]: + """Create non-IID splits using Pachinko Allocation Method (PAM)""" + # Initialize parameters + alpha = 0.1 # Root Dirichlet parameter + beta = 10.0 # Coarse-to-fine Dirichlet parameter + total_examples = len(self.trainset) + + # Calculate examples per split + if balanced: + examples_per_split = [total_examples // num_splits] * num_splits + else: + # Use Dirichlet to create unbalanced split sizes + split_ratios = np.random.dirichlet([0.5] * num_splits) # Lower alpha = more unbalanced + examples_per_split = np.round(split_ratios * total_examples).astype(int) + # Ensure we use exactly total_examples + examples_per_split[-1] = total_examples - examples_per_split[:-1].sum() + + # Get fine labels and map them to coarse labels + fine_labels = np.array(self.trainset.targets) + coarse_labels = fine_to_coarse_labels(fine_labels) + + # Initialize DAG structure (track available labels) + available_coarse = list(range(20)) # 20 coarse labels as list instead of 
set + available_fine = {c: set(np.where(coarse_labels == c)[0]) for c in available_coarse} + + indices_per_split = [] + for split_idx in range(num_splits): + split_indices = [] + N = examples_per_split[split_idx] # Use the pre-calculated split size + + # Sample root distribution over coarse labels + coarse_probs = dirichlet.rvs(alpha=[alpha] * len(available_coarse), size=1, random_state=RANDOM_SEED + split_idx)[0] + + # Sample fine label distributions for each available coarse label + fine_distributions = {} + for c in available_coarse: + if len(available_fine[c]) > 0: + fine_probs = dirichlet.rvs(alpha=[beta] * len(available_fine[c]), size=1, random_state=RANDOM_SEED + split_idx + c)[0] + fine_distributions[c] = fine_probs + + # Sample N examples for this split + for _ in range(N): + if len(available_coarse) == 0: + break + + # Sample coarse label + coarse_idx = np.random.choice(available_coarse, p=coarse_probs) + + if len(available_fine[coarse_idx]) == 0: + # Remove empty coarse label and renormalize + idx_to_remove = available_coarse.index(coarse_idx) + available_coarse.remove(coarse_idx) + coarse_probs = self._renormalize(coarse_probs, idx_to_remove) + continue + + # Sample fine label + fine_probs = fine_distributions[coarse_idx] + available_fine_indices = list(available_fine[coarse_idx]) + fine_probs = fine_probs[: len(available_fine_indices)] + fine_probs = fine_probs / fine_probs.sum() # Renormalize + fine_idx = np.random.choice(available_fine_indices, p=fine_probs) + + # Add example to split + split_indices.append(fine_idx) + + # Remove selected example + available_fine[coarse_idx].remove(fine_idx) + + # Renormalize if necessary + if len(available_fine[coarse_idx]) == 0: + idx_to_remove = available_coarse.index(coarse_idx) + available_coarse.remove(coarse_idx) + coarse_probs = self._renormalize(coarse_probs, idx_to_remove) + + indices_per_split.append(np.array(split_indices)) + + return indices_per_split + + def _renormalize(self, probs: np.ndarray, removed_idx: int) -> np.ndarray: + """Implementation of Algorithm 8 from the paper""" + # Create a list of valid indices (excluding the removed index) + valid_indices = [i for i in range(len(probs)) if i != removed_idx] + + # Select only the probabilities for valid indices + valid_probs = probs[valid_indices] + + # Normalize the remaining probabilities + return valid_probs / valid_probs.sum() + + def get_split(self, split_id: int, num_splits: int, balanced: bool, iid: bool) -> Dataset: + """Get a specific split of the dataset + :param split_id: ID of the split to retrieve + :param num_splits: Total number of splits + :param balanced: Whether splits are balanced + :param iid: Whether splits are IID + :return: Dataset split + """ + config_key = f"splits_{num_splits}_bal_{balanced}_iid_{iid}" + split_path = os.path.join(self.root_dir, f"{config_key}_split_{split_id}.pkl") + + if not os.path.exists(split_path): + self.create_splits(num_splits, balanced, iid) + + with open(split_path, "rb") as f: + indices = pickle.load(f) + + return Subset(self.trainset, indices) From 2f2f4c5820fe8acf2eafbb1a50d0006c276ec017 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 4 Dec 2024 16:02:45 +0100 Subject: [PATCH 07/48] Add gitignore --- examples/cifar100/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 examples/cifar100/.gitignore diff --git a/examples/cifar100/.gitignore b/examples/cifar100/.gitignore new file mode 100644 index 000000000..7a2a2b575 --- /dev/null +++ b/examples/cifar100/.gitignore @@ -0,0 +1,2 @@ +data/* 
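+# experiment outputs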
+results/* From 5483a93a1b5ab224a8c966d789cc10231f5f2d0e Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 4 Dec 2024 17:06:00 +0100 Subject: [PATCH 08/48] Function to get data loader for a given subset --- examples/cifar100/data.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/examples/cifar100/data.py b/examples/cifar100/data.py index 597e4845b..3833e7e73 100644 --- a/examples/cifar100/data.py +++ b/examples/cifar100/data.py @@ -290,3 +290,24 @@ def get_split(self, split_id: int, num_splits: int, balanced: bool, iid: bool) - indices = pickle.load(f) return Subset(self.trainset, indices) + + +def get_data_loader(num_splits: int, balanced: bool, iid: bool, batch_size: int = 100, is_train: bool = True): + """Get a data loader for the CIFAR-100 dataset + :param num_splits: Number of splits to create + :param balanced: Whether splits are balanced + :param iid: Whether splits are IID + :param batch_size: Batch size + :param is_train: Whether to get the training or test data loader + :return: Data loader + """ + split_id = os.environ.get("FEDN_DATA_SPLIT_ID", 0) + + cifar_data = CIFAR100Federated() + + if is_train: + dataset = cifar_data.get_split(split_id=split_id, num_splits=num_splits, balanced=balanced, iid=iid) + else: + dataset = cifar_data.testset + + return DataLoader(dataset, batch_size=batch_size, shuffle=is_train) From 3d4b38b0c8ade37f7f9924ce8ac54b3922c8eba1 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 4 Dec 2024 17:06:21 +0100 Subject: [PATCH 09/48] Script to upload seed model --- examples/cifar100/init_fedn.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 examples/cifar100/init_fedn.py diff --git a/examples/cifar100/init_fedn.py b/examples/cifar100/init_fedn.py new file mode 100644 index 000000000..e4547b5bc --- /dev/null +++ b/examples/cifar100/init_fedn.py @@ -0,0 +1,13 @@ +from config import settings +from fedn import APIClient + +client = APIClient( + host=settings["DISCOVER_HOST"], + port=settings["DISCOVER_PORT"], + secure=settings["SECURE"], + verify=settings["VERIFY"], + token=settings["ADMIN_TOKEN"], +) + +result = client.set_active_model("seed.npz") +print(result["message"]) From b7348ec59ef31d7817bf8aa45a155a965a9b52ea Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 4 Dec 2024 17:06:47 +0100 Subject: [PATCH 10/48] File to store configuration settings --- examples/cifar100/config.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 examples/cifar100/config.py diff --git a/examples/cifar100/config.py b/examples/cifar100/config.py new file mode 100644 index 000000000..ac3567e73 --- /dev/null +++ b/examples/cifar100/config.py @@ -0,0 +1,9 @@ +settings = { + "N_CLIENTS": 5, + "DISCOVER_HOST": "localhost", + "DISCOVER_PORT": 8092, + "SECURE": False, + "VERIFY": False, + "ADMIN_TOKEN": None, + "CLIENT_TOKEN": None, +} From 6f8adf854e97821946bc93b26ca6de1ffc7c7758 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Thu, 5 Dec 2024 17:05:08 +0100 Subject: [PATCH 11/48] Add training config to config.py --- examples/cifar100/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/cifar100/config.py b/examples/cifar100/config.py index ac3567e73..f2d5132df 100644 --- a/examples/cifar100/config.py +++ b/examples/cifar100/config.py @@ -6,4 +6,8 @@ "VERIFY": False, "ADMIN_TOKEN": None, "CLIENT_TOKEN": None, + "BATCH_SIZE": 128, + "EPOCHS": 1, + "BALANCED": True, + "IID": True, } From f3b17014415ed7aa35edf45b98da782aaed1b2b3 Mon Sep 17 00:00:00 2001 From: benjaminastrand 
Date: Thu, 5 Dec 2024 17:05:57 +0100 Subject: [PATCH 12/48] Default values for data split config --- examples/cifar100/data.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/cifar100/data.py b/examples/cifar100/data.py index 3833e7e73..762ea0fa5 100644 --- a/examples/cifar100/data.py +++ b/examples/cifar100/data.py @@ -292,7 +292,7 @@ def get_split(self, split_id: int, num_splits: int, balanced: bool, iid: bool) - return Subset(self.trainset, indices) -def get_data_loader(num_splits: int, balanced: bool, iid: bool, batch_size: int = 100, is_train: bool = True): +def get_data_loader(num_splits: int = 5, balanced: bool = True, iid: bool = True, batch_size: int = 100, is_train: bool = True): """Get a data loader for the CIFAR-100 dataset :param num_splits: Number of splits to create :param balanced: Whether splits are balanced @@ -301,13 +301,14 @@ def get_data_loader(num_splits: int, balanced: bool, iid: bool, batch_size: int :param is_train: Whether to get the training or test data loader :return: Data loader """ - split_id = os.environ.get("FEDN_DATA_SPLIT_ID", 0) - cifar_data = CIFAR100Federated() if is_train: + split_id = os.environ.get("FEDN_DATA_SPLIT_ID", 0) dataset = cifar_data.get_split(split_id=split_id, num_splits=num_splits, balanced=balanced, iid=iid) + print(f"Getting data loader for split {split_id} of trainset (size: {len(dataset)})") else: dataset = cifar_data.testset + print(f"Getting data loader for testset (size: {len(dataset)})") return DataLoader(dataset, batch_size=batch_size, shuffle=is_train) From ca28544d1268199ad17ebae2f3f45afba151d55b Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Thu, 5 Dec 2024 17:06:39 +0100 Subject: [PATCH 13/48] Add function to load params from npz file --- examples/cifar100/init_seed.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/cifar100/init_seed.py b/examples/cifar100/init_seed.py index 7fa67f2cf..477a4dcd3 100644 --- a/examples/cifar100/init_seed.py +++ b/examples/cifar100/init_seed.py @@ -1,3 +1,5 @@ +import collections + import torch import torch.nn as nn import torchvision.models as models @@ -37,5 +39,22 @@ def init_seed(out_path="seed.npz"): save_parameters(model, out_path) +def load_parameters(model_path): + """Load model parameters from file and populate model. + + param model_path: The path to load from. + :type model_path: str + :return: The loaded model. 
+ :rtype: torch.nn.Module + """ + model = compile_model() + parameters_np = helper.load(model_path) + + params_dict = zip(model.state_dict().keys(), parameters_np) + state_dict = collections.OrderedDict({key: torch.tensor(x) for key, x in params_dict}) + model.load_state_dict(state_dict, strict=True) + return model + + if __name__ == "__main__": init_seed("seed.npz") From ab7756724f579ac0ab1e4fe755272120fa420270 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Thu, 5 Dec 2024 17:07:48 +0100 Subject: [PATCH 14/48] Script to connect client, train and validate --- examples/cifar100/run_client.py | 204 ++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 examples/cifar100/run_client.py diff --git a/examples/cifar100/run_client.py b/examples/cifar100/run_client.py new file mode 100644 index 000000000..002cfdefd --- /dev/null +++ b/examples/cifar100/run_client.py @@ -0,0 +1,204 @@ +import argparse +import io +import os +import uuid + +import torch +import torch.nn as nn +import torch.optim as optim +from data import get_data_loader +from init_seed import load_parameters, save_parameters + +from config import settings +from fedn import FednClient +from fedn.network.clients.fedn_client import ConnectToApiResult +from fedn.utils.helpers.helpers import get_helper + +helper = get_helper("numpyhelper") + + +def get_api_url(api_url: str, api_port: int, secure: bool = False): + if secure: + url = f"https://{api_url}:{api_port}" if api_port else f"https://{api_url}" + else: + url = f"http://{api_url}:{api_port}" if api_port else f"http://{api_url}" + if not url.endswith("/"): + url += "/" + return url + + +def on_train(in_model, client_settings): + # Save model to temp file + inpath = helper.get_tmp_path() + with open(inpath, "wb") as fh: + fh.write(in_model.getbuffer()) + + # Load model from temp file + resnet18 = load_parameters(inpath) + os.unlink(inpath) + + # Move model to GPU if available + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + resnet18 = resnet18.to(device) + + # Define loss function and optimizer + criterion = nn.CrossEntropyLoss() + learning_rate = 0.001 + weight_decay = 5e-4 + optimizer = optim.Adam(resnet18.parameters(), lr=learning_rate, weight_decay=weight_decay) + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200) + + # Get data loader for trainset + trainloader = get_data_loader( + num_splits=settings["N_CLIENTS"], + balanced=settings["BALANCED"], + iid=settings["IID"], + is_train=True, + batch_size=settings["BATCH_SIZE"], + ) + + # Calculate number of batches + num_batches = len(trainloader) + + # Training loop + num_epochs = settings["EPOCHS"] + for epoch in range(num_epochs): + resnet18.train() + + for batch_idx, (inputs, labels) in enumerate(trainloader): + inputs, labels = inputs.to(device), labels.to(device) + + optimizer.zero_grad() + outputs = resnet18(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + if batch_idx % 10 == 0: + print(f"Epoch: {epoch}, Batch: {batch_idx}/{num_batches}, Loss: {loss.item():.4f}") + + scheduler.step() + + # Save model parameters + outpath = helper.get_tmp_path() + save_parameters(resnet18, outpath) + with open(outpath, "rb") as fr: + out_model = io.BytesIO(fr.read()) + os.unlink(outpath) + + # Return model and metadata + training_metadata = { + "num_examples": len(trainloader.dataset), + "batch_size": settings["BATCH_SIZE"], + "epochs": num_epochs, + "lr": learning_rate, + } + metadata = {"training_metadata": training_metadata} 
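+    # "num_examples" is the weight the server-side aggregator (e.g. FedAvg)
+    # applies to this update, so it should reflect the local dataset size.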
+ return out_model, metadata + + +def on_validate(in_model): + # Save model to temp file + inpath = helper.get_tmp_path() + with open(inpath, "wb") as fh: + fh.write(in_model.getbuffer()) + + # Load model from temp file + resnet18 = load_parameters(inpath) + os.unlink(inpath) + + # Move model to GPU if available + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + resnet18 = resnet18.to(device) + resnet18.eval() + + criterion = nn.CrossEntropyLoss() + + # Calculate training metrics + trainloader = get_data_loader( + num_splits=settings["N_CLIENTS"], + balanced=settings["BALANCED"], + iid=settings["IID"], + is_train=True, + batch_size=settings["BATCH_SIZE"], + ) + train_loss = 0 + train_correct = 0 + train_total = 0 + + with torch.no_grad(): + for inputs, labels in trainloader: + inputs, labels = inputs.to(device), labels.to(device) + outputs = resnet18(inputs) + loss = criterion(outputs, labels) + + train_loss += loss.item() + _, predicted = outputs.max(1) + train_total += labels.size(0) + train_correct += predicted.eq(labels).sum().item() + + train_accuracy = train_correct / train_total + train_loss = train_loss / len(trainloader) + + # Calculate test metrics + testloader = get_data_loader( + is_train=False, + batch_size=settings["BATCH_SIZE"], + ) + test_loss = 0 + test_correct = 0 + test_total = 0 + + with torch.no_grad(): + for inputs, labels in testloader: + inputs, labels = inputs.to(device), labels.to(device) + outputs = resnet18(inputs) + loss = criterion(outputs, labels) + + test_loss += loss.item() + _, predicted = outputs.max(1) + test_total += labels.size(0) + test_correct += predicted.eq(labels).sum().item() + + test_accuracy = test_correct / test_total + test_loss = test_loss / len(testloader) + + metrics = { + "test_accuracy": test_accuracy, + "test_loss": test_loss, + "train_accuracy": train_accuracy, + "train_loss": train_loss, + } + return metrics + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="CIFAR100 Client") + parser.add_argument("--split-id", type=int, required=True, help="The split ID") + args = parser.parse_args() + + client = FednClient(train_callback=on_train, validate_callback=on_validate) + url = get_api_url(settings["DISCOVER_HOST"], settings["DISCOVER_PORT"], settings["SECURE"]) + client.set_name(f"cifar100-client-{args.split_id}") + client.set_client_id(str(uuid.uuid4())) + + controller_config = { + "name": client.name, + "client_id": client.client_id, + "package": "local", + "preferred_combiner": "", + } + + result, combiner_config = client.connect_to_api(url=url, token=settings["CLIENT_TOKEN"], json=controller_config) + + if result != ConnectToApiResult.Assigned: + print("Failed to connect to API, exiting.") + exit(1) + + result = client.init_grpchandler(config=combiner_config, client_name=client.client_id, token=settings["CLIENT_TOKEN"]) + + if not result: + print("Failed to initialize gRPC handler, exiting.") + exit(1) + + client.run() From 89ec85bd5d11d32cf29e1426822cb8b6c7478298 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Thu, 5 Dec 2024 17:15:09 +0100 Subject: [PATCH 15/48] Add npz files to gitignore --- examples/cifar100/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/cifar100/.gitignore b/examples/cifar100/.gitignore index 7a2a2b575..158b11f47 100644 --- a/examples/cifar100/.gitignore +++ b/examples/cifar100/.gitignore @@ -1,2 +1,3 @@ data/* results/* +*.npz \ No newline at end of file From cbd86babe110f941e2a49a4bc8e7c583b12d8938 Mon Sep 17 00:00:00 2001 From: 
benjaminastrand Date: Thu, 5 Dec 2024 17:34:06 +0100 Subject: [PATCH 16/48] Ruff (bypass pickle warning since data comes from trusted source) --- examples/cifar100/.gitignore | 2 +- examples/cifar100/data.py | 6 ++---- examples/cifar100/init_seed.py | 4 ++-- examples/cifar100/run_client.py | 3 +-- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/cifar100/.gitignore b/examples/cifar100/.gitignore index 158b11f47..3fee66770 100644 --- a/examples/cifar100/.gitignore +++ b/examples/cifar100/.gitignore @@ -1,3 +1,3 @@ data/* results/* -*.npz \ No newline at end of file +*.npz diff --git a/examples/cifar100/data.py b/examples/cifar100/data.py index 762ea0fa5..450e9e5a5 100644 --- a/examples/cifar100/data.py +++ b/examples/cifar100/data.py @@ -1,10 +1,8 @@ import os import pickle -from typing import List, Tuple +from typing import List import numpy as np -import torch -import torchvision.transforms as transforms from scipy.stats import dirichlet from torch.utils.data import DataLoader, Dataset, Subset from torchvision import datasets, transforms @@ -287,7 +285,7 @@ def get_split(self, split_id: int, num_splits: int, balanced: bool, iid: bool) - self.create_splits(num_splits, balanced, iid) with open(split_path, "rb") as f: - indices = pickle.load(f) + indices = pickle.load(f) # noqa: S301 return Subset(self.trainset, indices) diff --git a/examples/cifar100/init_seed.py b/examples/cifar100/init_seed.py index 477a4dcd3..4f5fe6b5e 100644 --- a/examples/cifar100/init_seed.py +++ b/examples/cifar100/init_seed.py @@ -1,8 +1,8 @@ import collections import torch -import torch.nn as nn -import torchvision.models as models +from torch import nn +from torchvision import models from fedn.utils.helpers.helpers import get_helper diff --git a/examples/cifar100/run_client.py b/examples/cifar100/run_client.py index 002cfdefd..caf6c6cb6 100644 --- a/examples/cifar100/run_client.py +++ b/examples/cifar100/run_client.py @@ -4,10 +4,9 @@ import uuid import torch -import torch.nn as nn -import torch.optim as optim from data import get_data_loader from init_seed import load_parameters, save_parameters +from torch import nn, optim from config import settings from fedn import FednClient From fdbb46bedafc2931863d55d0709f5e67fe362980 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Fri, 6 Dec 2024 16:23:01 +0100 Subject: [PATCH 17/48] Added requirements file --- examples/cifar100/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 examples/cifar100/requirements.txt diff --git a/examples/cifar100/requirements.txt b/examples/cifar100/requirements.txt new file mode 100644 index 000000000..c1236222d --- /dev/null +++ b/examples/cifar100/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +scipy \ No newline at end of file From e1ce6f3761d51cb28d5ac4d64405447b2ebe76c8 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 9 Dec 2024 10:28:03 +0100 Subject: [PATCH 18/48] Refactored entrypoint --- examples/load-test/README.md | 10 ++++------ examples/load-test/client/entrypoint | 10 +++------- examples/load-test/client/fedn.yaml | 4 ++-- examples/load-test/requirements.txt | 2 +- examples/mnist-pytorch/client/model.py | 6 ++++-- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/examples/load-test/README.md b/examples/load-test/README.md index f941cf461..aca3e2b2c 100644 --- a/examples/load-test/README.md +++ b/examples/load-test/README.md @@ -1,15 +1,13 @@ # LOAD TEST -This example can be used as a load test for FEDn. 
+This example is intended to be used as a load test for FEDn. No actual machine learning is being done - the clients generate a random array of a configurable size. In this way a developer can -test the performance / scalability of a given FEDn network in a flexible -way simply by shuffling around and aggregating numeric arrays. +test the performance of FEDn deployment in a flexible +way by sending and aggregating numeric arrays of varying size. ## Prerequisites -- [Python 3.8, 3.9 or 3.10](https://www.python.org/downloads) -- [Docker](https://docs.docker.com/get-docker) -- [Docker Compose](https://docs.docker.com/compose/install) +- [Python 3.9 or 3.10](https://www.python.org/downloads) ## Running the example (pseudo-distributed, single host) diff --git a/examples/load-test/client/entrypoint b/examples/load-test/client/entrypoint index dd2216fc0..0b7de82ca 100644 --- a/examples/load-test/client/entrypoint +++ b/examples/load-test/client/entrypoint @@ -1,7 +1,7 @@ # /bin/python +import sys import time -import fire import numpy as np from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics @@ -90,9 +90,5 @@ def validate(in_model_path, out_json_path): save_metrics(report, out_json_path) -if __name__ == '__main__': - fire.Fire({ - 'init_seed': init_seed, - 'train': train, - 'validate': validate - }) +if __name__ == "__main__": + train(sys.argv[1], sys.argv[2]) diff --git a/examples/load-test/client/fedn.yaml b/examples/load-test/client/fedn.yaml index 68cb70cef..fe46a99dc 100644 --- a/examples/load-test/client/fedn.yaml +++ b/examples/load-test/client/fedn.yaml @@ -1,5 +1,5 @@ entry_points: train: - command: /venv/bin/python entrypoint train $ENTRYPOINT_OPTS + command: python train validate: - command: /venv/bin/python entrypoint validate $ENTRYPOINT_OPTS \ No newline at end of file + command: python validate \ No newline at end of file diff --git a/examples/load-test/requirements.txt b/examples/load-test/requirements.txt index c6bceff1d..296d65452 100644 --- a/examples/load-test/requirements.txt +++ b/examples/load-test/requirements.txt @@ -1 +1 @@ -fire==0.3.1 \ No newline at end of file +numpy \ No newline at end of file diff --git a/examples/mnist-pytorch/client/model.py b/examples/mnist-pytorch/client/model.py index 6ad344770..d400a56bc 100644 --- a/examples/mnist-pytorch/client/model.py +++ b/examples/mnist-pytorch/client/model.py @@ -40,7 +40,8 @@ def save_parameters(model, out_path): :param out_path: The path to save to. 
:type out_path: str """ - parameters_np = [val.cpu().numpy() for _, val in model.state_dict().items()] + parameters_np = [val.cpu().numpy() + for _, val in model.state_dict().items()] helper.save(parameters_np, out_path) @@ -56,7 +57,8 @@ def load_parameters(model_path): parameters_np = helper.load(model_path) params_dict = zip(model.state_dict().keys(), parameters_np) - state_dict = collections.OrderedDict({key: torch.tensor(x) for key, x in params_dict}) + state_dict = collections.OrderedDict( + {key: torch.tensor(x) for key, x in params_dict}) model.load_state_dict(state_dict, strict=True) return model From 30d7bc535b5c53ec3e69347421d879f8c5089f4f Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 9 Dec 2024 10:29:25 +0100 Subject: [PATCH 19/48] Refactored entrypoint --- examples/load-test/client/model.py | 50 +++++++++++++++++++++++++++ examples/load-test/client/train.py | 38 ++++++++++++++++++++ examples/load-test/client/validate.py | 36 +++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 examples/load-test/client/model.py create mode 100644 examples/load-test/client/train.py create mode 100644 examples/load-test/client/validate.py diff --git a/examples/load-test/client/model.py b/examples/load-test/client/model.py new file mode 100644 index 000000000..9bcd55048 --- /dev/null +++ b/examples/load-test/client/model.py @@ -0,0 +1,50 @@ +# /bin/python +import sys +import time + +import numpy as np + +from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics + +HELPER_MODULE = 'numpyhelper' +ARRAY_SIZE = 1000000 + + +def save_model(weights, out_path): + """ Save model to disk. + + :param model: The model to save. + :type model: torch.nn.Module + :param out_path: The path to save to. + :type out_path: str + """ + helper = get_helper(HELPER_MODULE) + helper.save(weights, out_path) + + +def load_model(model_path): + """ Load model from disk. + + param model_path: The path to load from. + :type model_path: str + :return: The loaded model. + :rtype: torch.nn.Module + """ + helper = get_helper(HELPER_MODULE) + weights = helper.load(model_path) + return weights + + +def init_seed(out_path='seed.npz'): + """ Initialize seed model. + + :param out_path: The path to save the seed model to. + :type out_path: str + """ + # Init and save + weights = [np.random.rand(1, ARRAY_SIZE)] + save_model(weights, out_path) + + +if __name__ == "__main__": + init_seed("../seed.npz") diff --git a/examples/load-test/client/train.py b/examples/load-test/client/train.py new file mode 100644 index 000000000..5bfe38ede --- /dev/null +++ b/examples/load-test/client/train.py @@ -0,0 +1,38 @@ +# /bin/python +import sys +import time + +import numpy as np +from model import load_model, save_model + +from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics + +HELPER_MODULE = 'numpyhelper' +ARRAY_SIZE = 10000 + + +def train(in_model_path, out_model_path): + """ Train model. 
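+
+    :param in_model_path: The path to the input model.
+    :type in_model_path: str
+    :param out_model_path: The path to save the model update to.
+    :type out_model_path: str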
+ + """ + + # Load model + weights = load_model(in_model_path) + + # Train + time.sleep(np.random.randint(4, 15)) + + # Metadata needed for aggregation server side + metadata = { + 'num_examples': ARRAY_SIZE, + } + + # Save JSON metadata file + save_metadata(metadata, out_model_path) + + # Save model update + save_model(weights, out_model_path) + + +if __name__ == "__main__": + train(sys.argv[1], sys.argv[2]) diff --git a/examples/load-test/client/validate.py b/examples/load-test/client/validate.py new file mode 100644 index 000000000..c79a0d169 --- /dev/null +++ b/examples/load-test/client/validate.py @@ -0,0 +1,36 @@ +# /bin/python +import sys +import time + +import numpy as np +from model import load_model, save_model + +from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics + +HELPER_MODULE = 'numpyhelper' +ARRAY_SIZE = 1000000 + + +def validate(in_model_path, out_json_path): + """ Validate model. + + :param in_model_path: The path to the input model. + :type in_model_path: str + :param out_json_path: The path to save the output JSON to. + :type out_json_path: str + :param data_path: The path to the data file. + :type data_path: str + """ + weights = load_model(in_model_path) + + # JSON schema + report = { + "mean": np.mean(weights), + } + + # Save JSON + save_metrics(report, out_json_path) + + +if __name__ == "__main__": + validate(sys.argv[1], sys.argv[2]) From 9e096b010aaf145e36df0ee5f31367c3265e8aa3 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 9 Dec 2024 10:35:52 +0100 Subject: [PATCH 20/48] clean up --- examples/load-test/client/entrypoint | 94 ---------------------------- 1 file changed, 94 deletions(-) delete mode 100644 examples/load-test/client/entrypoint diff --git a/examples/load-test/client/entrypoint b/examples/load-test/client/entrypoint deleted file mode 100644 index 0b7de82ca..000000000 --- a/examples/load-test/client/entrypoint +++ /dev/null @@ -1,94 +0,0 @@ -# /bin/python -import sys -import time - -import numpy as np - -from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics - -HELPER_MODULE = 'numpyhelper' -ARRAY_SIZE = 1000000 - - -def save_model(weights, out_path): - """ Save model to disk. - - :param model: The model to save. - :type model: torch.nn.Module - :param out_path: The path to save to. - :type out_path: str - """ - helper = get_helper(HELPER_MODULE) - helper.save(weights, out_path) - - -def load_model(model_path): - """ Load model from disk. - - param model_path: The path to load from. - :type model_path: str - :return: The loaded model. - :rtype: torch.nn.Module - """ - helper = get_helper(HELPER_MODULE) - weights = helper.load(model_path) - return weights - - -def init_seed(out_path='seed.npz'): - """ Initialize seed model. - - :param out_path: The path to save the seed model to. - :type out_path: str - """ - # Init and save - weights = [np.random.rand(1, ARRAY_SIZE)] - save_model(weights, out_path) - - -def train(in_model_path, out_model_path): - """ Train model. - - """ - - # Load model - weights = load_model(in_model_path) - - # Train - time.sleep(np.random.randint(4, 15)) - - # Metadata needed for aggregation server side - metadata = { - 'num_examples': ARRAY_SIZE, - } - - # Save JSON metadata file - save_metadata(metadata, out_model_path) - - # Save model update - save_model(weights, out_model_path) - - -def validate(in_model_path, out_json_path): - """ Validate model. - - :param in_model_path: The path to the input model. 
-    :type in_model_path: str
-    :param out_json_path: The path to save the output JSON to.
-    :type out_json_path: str
-    :param data_path: The path to the data file.
-    :type data_path: str
-    """
-    weights = load_model(in_model_path)
-
-    # JSON schema
-    report = {
-        "mean": np.mean(weights),
-    }
-
-    # Save JSON
-    save_metrics(report, out_json_path)
-
-
-if __name__ == "__main__":
-    train(sys.argv[1], sys.argv[2])

From a76b150227649a467fb39fe3ea702fcc7f19914e Mon Sep 17 00:00:00 2001
From: Andreas Hellander
Date: Mon, 9 Dec 2024 10:37:02 +0100
Subject: [PATCH 21/48] Clean

---
 examples/load-test/requirements.txt |  1 -
 examples/load-test/run_clients.py   | 59 -----------------------------
 2 files changed, 60 deletions(-)
 delete mode 100644 examples/load-test/requirements.txt
 delete mode 100644 examples/load-test/run_clients.py

diff --git a/examples/load-test/requirements.txt b/examples/load-test/requirements.txt
deleted file mode 100644
index 296d65452..000000000
--- a/examples/load-test/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-numpy
\ No newline at end of file
diff --git a/examples/load-test/run_clients.py b/examples/load-test/run_clients.py
deleted file mode 100644
index cb7ca1222..000000000
--- a/examples/load-test/run_clients.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""This scripts starts N_CLIENTS using the SDK.
-
-If you are running with a local deploy of FEDn
-using docker compose, you need to make sure that clients
-are able to resolve the name "combiner" to 127.0.0.1
-
-One way to accomplish this is to edit your /etc/hosts,
-adding the line:
-
-127.0.0.1 combiner
-
-"""
-
-
-import copy
-import time
-import uuid
-
-from fedn import APIClient
-from fedn.network.clients.client import Client
-
-DISCOVER_HOST = '127.0.0.1'
-DISCOVER_PORT = 8092
-N_CLIENTS = 3
-
-config = {'discover_host': DISCOVER_HOST, 'discover_port': DISCOVER_PORT, 'token': None, 'name': 'testclient',
-          'client_id': 1, 'remote_compute_context': True, 'force_ssl': False, 'dry_run': False, 'secure': False,
-          'preshared_cert': False, 'verify': False, 'preferred_combiner': False,
-          'validator': True, 'trainer': True, 'init': None, 'logfile': 'test.log', 'heartbeat_interval': 2,
-          'reconnect_after_missed_heartbeat': 30}
-
-if __name__ == '__main__':
-
-    # Start up N_CLIENTS clients
-    clients = []
-    for i in range(N_CLIENTS):
-        config_i = copy.deepcopy(config)
-        config_i['name'] = 'client{}'.format(i)
-        clients.append(Client(config_i))
-
-    # Run a session
-    client = APIClient(DISCOVER_HOST, DISCOVER_PORT)
-
-    session_config_fedavg = {
-        "helper": "numpyhelper",
-        "session_id": str(uuid.uuid4()),
-        "aggregator": "fedavg",
-        "round_timeout": 30,
-        "rounds": 5,
-    }
-
-    result_fedavg = client.start_session(**session_config_fedavg)
-    while not client.session_is_finished(session_config_fedavg['session_id']):
-        time.sleep(1)
-
-    print("Session ID: ", session_config_fedavg['session_id'])
-
-    for client in clients:
-        client.detach()

From 9292e526c58f5718e29c0ea404cbc8a8e99f3993 Mon Sep 17 00:00:00 2001
From: Andreas Hellander
Date: Mon, 9 Dec 2024 11:13:04 +0100
Subject: [PATCH 22/48] clean up

---
 examples/load-test/init_fedn.py | 8 --------
 1 file changed, 8 deletions(-)
 delete mode 100644 examples/load-test/init_fedn.py

diff --git a/examples/load-test/init_fedn.py b/examples/load-test/init_fedn.py
deleted file mode 100644
index 23078fcd9..000000000
--- a/examples/load-test/init_fedn.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from fedn import APIClient
-
-DISCOVER_HOST = '127.0.0.1'
-DISCOVER_PORT = 8092
-
-client = APIClient(DISCOVER_HOST,
DISCOVER_PORT) -client.set_package('package.tgz', 'numpyhelper') -client.set_initial_model('seed.npz') From d4a605890b36d722766fe65c19f6588b85da611a Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Tue, 10 Dec 2024 17:52:00 +0100 Subject: [PATCH 23/48] Fixed bug where client would fail if no env in fedn.yaml --- examples/load-test/client/fedn.yaml | 4 +- examples/load-test/client/model.py | 2 +- fedn/network/clients/client_v2.py | 25 +++++++----- fedn/network/clients/fedn_client.py | 59 +++++++++++++++++++---------- 4 files changed, 58 insertions(+), 32 deletions(-) diff --git a/examples/load-test/client/fedn.yaml b/examples/load-test/client/fedn.yaml index fe46a99dc..d2ced7255 100644 --- a/examples/load-test/client/fedn.yaml +++ b/examples/load-test/client/fedn.yaml @@ -1,5 +1,5 @@ entry_points: train: - command: python train + command: python train.py validate: - command: python validate \ No newline at end of file + command: python validate.py \ No newline at end of file diff --git a/examples/load-test/client/model.py b/examples/load-test/client/model.py index 9bcd55048..4d6b89a31 100644 --- a/examples/load-test/client/model.py +++ b/examples/load-test/client/model.py @@ -7,7 +7,7 @@ from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics HELPER_MODULE = 'numpyhelper' -ARRAY_SIZE = 1000000 +ARRAY_SIZE = 20000000 def save_model(weights, out_path): diff --git a/fedn/network/clients/client_v2.py b/fedn/network/clients/client_v2.py index 43edc9b79..444e29bbe 100644 --- a/fedn/network/clients/client_v2.py +++ b/fedn/network/clients/client_v2.py @@ -8,7 +8,8 @@ from fedn.common.config import FEDN_CUSTOM_URL_PREFIX from fedn.common.log_config import logger -from fedn.network.clients.fedn_client import ConnectToApiResult, FednClient, GrpcConnectionOptions +from fedn.network.clients.fedn_client import (ConnectToApiResult, FednClient, + GrpcConnectionOptions) from fedn.network.combiner.modelservice import get_tmp_path from fedn.utils.helpers.helpers import get_helper, save_metadata @@ -76,7 +77,8 @@ def _connect_to_api(self) -> Tuple[bool, dict]: if result == ConnectToApiResult.ComputePackageMissing: logger.info("Retrying in 3 seconds") time.sleep(3) - result, response = self.fedn_client.connect_to_api(self.fedn_api_url, self.token, self.client_obj.to_json()) + result, response = self.fedn_client.connect_to_api( + self.fedn_api_url, self.token, self.client_obj.to_json()) if result == ConnectToApiResult.Assigned: return True, response @@ -93,9 +95,9 @@ def start(self): result, combiner_config = self._connect_to_api() if not result: return - if self.client_obj.package == "remote": - result = self.fedn_client.init_remote_compute_package(url=self.fedn_api_url, token=self.token, package_checksum=self.package_checksum) + result = self.fedn_client.init_remote_compute_package( + url=self.fedn_api_url, token=self.token, package_checksum=self.package_checksum) if not result: return @@ -107,7 +109,8 @@ def start(self): self.set_helper(combiner_config) - result: bool = self.fedn_client.init_grpchandler(config=combiner_config, client_name=self.client_obj.client_id, token=self.token) + result: bool = self.fedn_client.init_grpchandler( + config=combiner_config, client_name=self.client_obj.client_id, token=self.token) if not result: return @@ -133,7 +136,8 @@ def set_helper(self, response: GrpcConnectionOptions = None): self.helper = get_helper(helper_type_to_use) def on_train(self, in_model, client_settings): - out_model, meta = self._process_training_request(in_model, 
client_settings) + out_model, meta = self._process_training_request( + in_model, client_settings) return out_model, meta def on_validation(self, in_model): @@ -162,7 +166,8 @@ def _process_training_request(self, in_model: BytesIO, client_settings: dict) -> tic = time.time() - self.fedn_client.dispatcher.run_cmd("train {} {}".format(inpath, outpath)) + self.fedn_client.dispatcher.run_cmd( + "train {} {}".format(inpath, outpath)) meta["exec_training"] = time.time() - tic @@ -175,7 +180,8 @@ def _process_training_request(self, in_model: BytesIO, client_settings: dict) -> with open(outpath + "-metadata", "r") as fh: training_metadata = json.loads(fh.read()) - logger.info("SETTING Training metadata: {}".format(training_metadata)) + logger.info("SETTING Training metadata: {}".format( + training_metadata)) meta["training_metadata"] = training_metadata os.unlink(inpath) @@ -183,7 +189,8 @@ def _process_training_request(self, in_model: BytesIO, client_settings: dict) -> os.unlink(outpath + "-metadata") except Exception as e: - logger.error("Could not process training request due to error: {}".format(e)) + logger.error( + "Could not process training request due to error: {}".format(e)) out_model = None meta = {"status": "failed", "error": str(e)} diff --git a/fedn/network/clients/fedn_client.py b/fedn/network/clients/fedn_client.py index c38347a7b..15fafe5e2 100644 --- a/fedn/network/clients/fedn_client.py +++ b/fedn/network/clients/fedn_client.py @@ -88,12 +88,14 @@ def connect_to_api(self, url: str, token: str, json: dict) -> Tuple[ConnectToApi ) if response.status_code == 200: - logger.info("Connect to FEDn Api - Client assinged to controller") + logger.info( + "Connect to FEDn Api - Client assigned to controller") json_response = response.json() return ConnectToApiResult.Assigned, json_response elif response.status_code == 203: json_response = response.json() - logger.info("Connect to FEDn Api - Remote compute package missing.") + logger.info( + "Connect to FEDn Api - Remote compute package missing.") return ConnectToApiResult.ComputePackageMissing, json_response elif response.status_code == 401: logger.warning("Connect to FEDn Api - Unauthorized") @@ -155,18 +157,20 @@ def set_dispatcher(self, path) -> bool: def get_or_set_environment(self) -> bool: try: - logger.info("Initiating Dispatcher with entrypoint set to: startup") + logger.info( + "Initiating Dispatcher with entrypoint set to: startup") activate_cmd = self.dispatcher._get_or_create_python_env() self.dispatcher.run_cmd("startup") except KeyError: logger.info("No startup command found in package. 
Continuing.") - return False + # return False except Exception as e: logger.error(f"Caught exception: {type(e).__name__}") return False if activate_cmd: - logger.info("To activate the virtual environment, run: {}".format(activate_cmd)) + logger.info( + "To activate the virtual environment, run: {}".format(activate_cmd)) return True @@ -181,7 +185,8 @@ def init_grpchandler(self, config: GrpcConnectionOptions, client_name: str, toke port = config["port"] combiner_name = config["host"] - self.grpc_handler = GrpcHandler(host=host, port=port, name=client_name, token=token, combiner_name=combiner_name) + self.grpc_handler = GrpcHandler( + host=host, port=port, name=client_name, token=token, combiner_name=combiner_name) logger.info("Successfully initialized GRPC connection") return True @@ -190,10 +195,12 @@ def init_grpchandler(self, config: GrpcConnectionOptions, client_name: str, toke return False def send_heartbeats(self, client_name: str, client_id: str, update_frequency: float = 2.0): - self.grpc_handler.send_heartbeats(client_name=client_name, client_id=client_id, update_frequency=update_frequency) + self.grpc_handler.send_heartbeats( + client_name=client_name, client_id=client_id, update_frequency=update_frequency) def listen_to_task_stream(self, client_name: str, client_id: str): - self.grpc_handler.listen_to_task_stream(client_name=client_name, client_id=client_id, callback=self._task_stream_callback) + self.grpc_handler.listen_to_task_stream( + client_name=client_name, client_id=client_id, callback=self._task_stream_callback) def _task_stream_callback(self, request): if request.type == fedn.StatusType.MODEL_UPDATE: @@ -208,10 +215,12 @@ def update_local_model(self, request): model_update_id = str(uuid.uuid4()) tic = time.time() - in_model = self.get_model_from_combiner(id=model_id, client_id=self.client_id) + in_model = self.get_model_from_combiner( + id=model_id, client_id=self.client_id) if in_model is None: - logger.error("Could not retrieve model from combiner. Aborting training request.") + logger.error( + "Could not retrieve model from combiner. Aborting training request.") return fetch_model_time = time.time() - tic @@ -241,7 +250,8 @@ def update_local_model(self, request): meta["fetch_model"] = fetch_model_time meta["config"] = request.data - update = self.create_update_message(model_id=model_id, model_update_id=model_update_id, meta=meta, request=request) + update = self.create_update_message( + model_id=model_id, model_update_id=model_update_id, meta=meta, request=request) self.send_model_update(update) @@ -265,10 +275,12 @@ def validate_global_model(self, request): type=fedn.StatusType.MODEL_VALIDATION, ) - in_model = self.get_model_from_combiner(id=model_id, client_id=self.client_id) + in_model = self.get_model_from_combiner( + id=model_id, client_id=self.client_id) if in_model is None: - logger.error("Could not retrieve model from combiner. Aborting validation request.") + logger.error( + "Could not retrieve model from combiner. 
Aborting validation request.") return if not self.validate_callback: @@ -280,7 +292,8 @@ def validate_global_model(self, request): if metrics is not None: # Send validation - validation = self.create_validation_message(metrics=metrics, request=request) + validation = self.create_validation_message( + metrics=metrics, request=request) result: bool = self.send_model_validation(validation) @@ -295,7 +308,8 @@ def validate_global_model(self, request): ) else: self.send_status( - "Client {} failed to complete model validation.".format(self.name), + "Client {} failed to complete model validation.".format( + self.name), log_level=fedn.LogLevel.WARNING, request=request, sesssion_id=request.session_id, @@ -304,10 +318,12 @@ def validate_global_model(self, request): def predict_global_model(self, request): model_id = request.model_id - model = self.get_model_from_combiner(id=model_id, client_id=self.client_id) + model = self.get_model_from_combiner( + id=model_id, client_id=self.client_id) if model is None: - logger.error("Could not retrieve model from combiner. Aborting prediction request.") + logger.error( + "Could not retrieve model from combiner. Aborting prediction request.") return if not self.predict_callback: @@ -317,7 +333,8 @@ def predict_global_model(self, request): logger.info(f"Running predict callback with model ID: {model_id}") prediction = self.predict_callback(model) - prediction_message = self.create_prediction_message(prediction=prediction, request=request) + prediction_message = self.create_prediction_message( + prediction=prediction, request=request) self.send_model_prediction(prediction_message) @@ -362,9 +379,11 @@ def set_client_id(self, client_id: str): self.client_id = client_id def run(self): - threading.Thread(target=self.send_heartbeats, kwargs={"client_name": self.name, "client_id": self.client_id}, daemon=True).start() + threading.Thread(target=self.send_heartbeats, kwargs={ + "client_name": self.name, "client_id": self.client_id}, daemon=True).start() try: - self.listen_to_task_stream(client_name=self.name, client_id=self.client_id) + self.listen_to_task_stream( + client_name=self.name, client_id=self.client_id) except KeyboardInterrupt: logger.info("Client stopped by user.") From a85fc13c50f80ee63fc51cb91debcb1318a1e179 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 11 Dec 2024 11:59:03 +0100 Subject: [PATCH 24/48] Train callback compatible with 0.19.0 --- examples/cifar100/run_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cifar100/run_client.py b/examples/cifar100/run_client.py index caf6c6cb6..e94a9cbed 100644 --- a/examples/cifar100/run_client.py +++ b/examples/cifar100/run_client.py @@ -26,7 +26,7 @@ def get_api_url(api_url: str, api_port: int, secure: bool = False): return url -def on_train(in_model, client_settings): +def on_train(in_model): # Save model to temp file inpath = helper.get_tmp_path() with open(inpath, "wb") as fh: From 8ef8e78aa3a2cfdc8124da6ea66d179882a63608 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 14:30:06 +0100 Subject: [PATCH 25/48] Fixed error in apidocs --- docs/apiclient.rst | 2 +- examples/load-test/client/fedn.yaml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/apiclient.rst b/docs/apiclient.rst index 360691d18..d1d03f7b7 100644 --- a/docs/apiclient.rst +++ b/docs/apiclient.rst @@ -60,7 +60,7 @@ using the default aggregator (FedAvg): .. code:: python >>> ... 
- >>> client.start_session(id="test-session", rounds=3) + >>> client.start_session(id="test-session", helper="numpyhelper", rounds=3) # Wait for training to complete, when controller is idle: >>> client.get_controller_status() # Show model trail: diff --git a/examples/load-test/client/fedn.yaml b/examples/load-test/client/fedn.yaml index d2ced7255..88af8dcc9 100644 --- a/examples/load-test/client/fedn.yaml +++ b/examples/load-test/client/fedn.yaml @@ -1,4 +1,7 @@ +python_env: env.yaml entry_points: + startup: + command: python startup.py train: command: python train.py validate: From 7ef5d51cdacd9d0c39619d73f10cb962bd77942f Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 14:48:07 +0100 Subject: [PATCH 26/48] Updated dev instructions --- docs/developer.rst | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/developer.rst b/docs/developer.rst index cd55a596b..c117b0e95 100644 --- a/docs/developer.rst +++ b/docs/developer.rst @@ -19,30 +19,32 @@ We provide Dockerfiles and docker-compose template for an all-in-one local sandb .. code-block:: - docker compose \ - -f ../../docker-compose.yaml \ - -f docker-compose.override.yaml \ - up + docker compose up -This starts up local services for MongoDB, Minio, the API Server, one Combiner and two clients. +This starts up local services for MongoDB, Minio, the API Server, and one Combiner. You can verify the deployment on localhost using these urls: - API Server: http://localhost:8092/get_controller_status - Minio: http://localhost:9000 - Mongo Express: http://localhost:8081 -This setup does not include any of the security and authentication features available in a Studio Project, -so we will not require authentication of clients (insecure mode) when using the APIClient: +To connect a native FEDn client to the sandbox deployment, first edit '/etc/hosts' and add the line '127.0.0.1 api-server combiner'. Then +create a file `client.yaml` with the following content: .. code-block:: - from fedn import APIClient - client = APIClient(host="localhost", port=8092) - client.set_active_package("package.tgz", helper="numpyhelper") - client.set_active_model("seed.npz") + network_id: fedn-network + discover_host: api-server + discover_port: 8092 + name: myclient -To connect a native FEDn client to the sandbox deployment, you need to make sure that the combiner service can be resolved by the client using the name "combiner". -One way to achieve this is to edit your '/etc/hosts' and add a line '127.0.0.1 combiner'. +Now you can start a client: + +.. code-block:: + fedn client start -in client.yaml --api-url=http://localhost --api-port=8090 + +If you are running the server on a remote machine/VM, simply replace 'localhost' with the IP address or hostname of that machine in the instructions above. +Make sure to open ports 8081, 8090, and 12080 on the server host.
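For reference, the programmatic route that the removed snippet above covered can still be exercised against this insecure sandbox. A minimal sketch, assuming the package.tgz and seed.npz produced by the load-test example and the APIClient methods shown elsewhere in this series:

```python
from fedn import APIClient

# The sandbox runs without client authentication, so no token is passed.
client = APIClient(host="localhost", port=8092)

# Upload the compute package and seed model prepared earlier.
client.set_active_package("package.tgz", helper="numpyhelper")
client.set_active_model("seed.npz")

# Confirm the controller is reachable before starting a session.
print(client.get_controller_status())
```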
Access message logs and validation data from MongoDB ------------------------------------------------------ From a1862f2e5e7050ebc71d79d5cb76da31772a1179 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 15:22:37 +0100 Subject: [PATCH 27/48] Experiment with chunk_size --- fedn/network/clients/fedn_client.py | 2 ++ fedn/network/combiner/modelservice.py | 28 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fedn/network/clients/fedn_client.py b/fedn/network/clients/fedn_client.py index 15fafe5e2..ec77f96ec 100644 --- a/fedn/network/clients/fedn_client.py +++ b/fedn/network/clients/fedn_client.py @@ -224,6 +224,7 @@ def update_local_model(self, request): return fetch_model_time = time.time() - tic + print("FETCH_MODEL: ", fetch_model_time) if not self.train_callback: logger.error("No train callback set") @@ -246,6 +247,7 @@ def update_local_model(self, request): tic = time.time() self.send_model_to_combiner(model=out_model, id=model_update_id) meta["upload_model"] = time.time() - tic + print("UPLOAD_MODEL ", meta["upload_model"]) meta["fetch_model"] = fetch_model_time meta["config"] = request.data diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py index 8600b8bab..ebf99c43a 100644 --- a/fedn/network/combiner/modelservice.py +++ b/fedn/network/combiner/modelservice.py @@ -9,7 +9,9 @@ from fedn.common.log_config import logger from fedn.network.storage.models.tempmodelstorage import TempModelStorage -CHUNK_SIZE = 1024 * 1024 +#CHUNK_SIZE = 1024 * 1024 + +CHUNK_SIZE = 2048 * 1024 def upload_request_generator(mdl, id): @@ -23,9 +25,11 @@ def upload_request_generator(mdl, id): while True: b = mdl.read(CHUNK_SIZE) if b: - result = fedn.ModelRequest(data=b, id=id, status=fedn.ModelStatus.IN_PROGRESS) + result = fedn.ModelRequest( + data=b, id=id, status=fedn.ModelStatus.IN_PROGRESS) else: - result = fedn.ModelRequest(id=id, data=None, status=fedn.ModelStatus.OK) + result = fedn.ModelRequest( + id=id, data=None, status=fedn.ModelStatus.OK) yield result if not b: break @@ -87,7 +91,8 @@ def unpack_model(request_iterator, helper): if request.data: model_buffer.write(request.data) except MemoryError as e: - logger.error(f"Memory error occured when loading model, reach out to the FEDn team if you need a solution to this. {e}") + logger.error( + f"Memory error occured when loading model, reach out to the FEDn team if you need a solution to this. 
{e}") raise except Exception as e: logger.error(f"Exception occured during model loading: {e}") @@ -209,12 +214,15 @@ def Upload(self, request_iterator, context): for request in request_iterator: if request.status == fedn.ModelStatus.IN_PROGRESS: self.temp_model_storage.get_ptr(request.id).write(request.data) - self.temp_model_storage.set_model_metadata(request.id, fedn.ModelStatus.IN_PROGRESS) + self.temp_model_storage.set_model_metadata( + request.id, fedn.ModelStatus.IN_PROGRESS) if request.status == fedn.ModelStatus.OK and not request.data: - result = fedn.ModelResponse(id=request.id, status=fedn.ModelStatus.OK, message="Got model successfully.") + result = fedn.ModelResponse( + id=request.id, status=fedn.ModelStatus.OK, message="Got model successfully.") # self.temp_model_storage_metadata.update({request.id: fedn.ModelStatus.OK}) - self.temp_model_storage.set_model_metadata(request.id, fedn.ModelStatus.OK) + self.temp_model_storage.set_model_metadata( + request.id, fedn.ModelStatus.OK) self.temp_model_storage.get_ptr(request.id).flush() self.temp_model_storage.get_ptr(request.id).close() return result @@ -229,11 +237,13 @@ def Download(self, request, context): :return: A model response iterator. :rtype: :class:`fedn.network.grpc.fedn_pb2.ModelResponse` """ - logger.info(f"grpc.ModelService.Download: {request.sender.role}:{request.sender.client_id} requested model {request.id}") + logger.info( + f"grpc.ModelService.Download: {request.sender.role}:{request.sender.client_id} requested model {request.id}") try: status = self.temp_model_storage.get_model_metadata(request.id) if status != fedn.ModelStatus.OK: - logger.error(f"model file is not ready: {request.id}, status: {status}") + logger.error( + f"model file is not ready: {request.id}, status: {status}") yield fedn.ModelResponse(id=request.id, data=None, status=status) except Exception: logger.error("Error file does not exist: {}".format(request.id)) From de7a5770952ec9f7763b0d335b5dd511b4d50516 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 15:32:58 +0100 Subject: [PATCH 28/48] Experiment with chunk_size --- fedn/network/combiner/modelservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py index ebf99c43a..8489de06d 100644 --- a/fedn/network/combiner/modelservice.py +++ b/fedn/network/combiner/modelservice.py @@ -11,7 +11,7 @@ #CHUNK_SIZE = 1024 * 1024 -CHUNK_SIZE = 2048 * 1024 +CHUNK_SIZE = 64 * 1024 def upload_request_generator(mdl, id): From 9d7b1372dcb68ebe752db6a206e687ee04ac8434 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 15:51:02 +0100 Subject: [PATCH 29/48] Reverting chunk_size, so significant difference --- fedn/network/combiner/modelservice.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py index 8489de06d..f9de18ef4 100644 --- a/fedn/network/combiner/modelservice.py +++ b/fedn/network/combiner/modelservice.py @@ -9,9 +9,7 @@ from fedn.common.log_config import logger from fedn.network.storage.models.tempmodelstorage import TempModelStorage -#CHUNK_SIZE = 1024 * 1024 - -CHUNK_SIZE = 64 * 1024 +CHUNK_SIZE = 1024 * 1024 def upload_request_generator(mdl, id): From 376698f3bced79d0279a45b115b7603c7dfc7a2c Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 23:06:36 +0100 Subject: [PATCH 30/48] test --- fedn/network/clients/grpc_handler.py | 74 
++++++++++++++++++--------- fedn/network/combiner/modelservice.py | 2 +- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/fedn/network/clients/grpc_handler.py b/fedn/network/clients/grpc_handler.py index a07854986..9858c530d 100644 --- a/fedn/network/clients/grpc_handler.py +++ b/fedn/network/clients/grpc_handler.py @@ -19,8 +19,10 @@ # Keepalive settings: these help keep the connection open for long-lived clients KEEPALIVE_TIME_MS = 1 * 1000 # send keepalive ping every 60 seconds -KEEPALIVE_TIMEOUT_MS = 30 * 1000 # wait 20 seconds for keepalive ping ack before considering connection dead -KEEPALIVE_PERMIT_WITHOUT_CALLS = True # allow keepalive pings even when there are no RPCs +# wait 20 seconds for keepalive ping ack before considering connection dead +KEEPALIVE_TIMEOUT_MS = 30 * 1000 +# allow keepalive pings even when there are no RPCs +KEEPALIVE_PERMIT_WITHOUT_CALLS = True MAX_CONNECTION_IDLE_MS = 30000 MAX_CONNECTION_AGE_GRACE_MS = "INT_MAX" # keep connection open indefinitely CLIENT_IDLE_TIMEOUT_MS = 30000 @@ -89,10 +91,12 @@ def _init_secure_channel(self, host: str, port: int, token: str): logger.info(f"Connecting (GRPC) to {url}") if os.getenv("FEDN_GRPC_ROOT_CERT_PATH"): - logger.info("Using root certificate from environment variable for GRPC channel.") + logger.info( + "Using root certificate from environment variable for GRPC channel.") with open(os.environ["FEDN_GRPC_ROOT_CERT_PATH"], "rb") as f: credentials = grpc.ssl_channel_credentials(f.read()) - self.channel = grpc.secure_channel("{}:{}".format(host, str(port)), credentials) + self.channel = grpc.secure_channel( + "{}:{}".format(host, str(port)), credentials) return logger.info(f"Fetching SSL certificate for {host}") @@ -119,11 +123,13 @@ def heartbeat(self, client_name: str, client_id: str): :return: Response from the combiner. 
:rtype: fedn.Response """ - heartbeat = fedn.Heartbeat(sender=fedn.Client(name=client_name, role=fedn.CLIENT, client_id=client_id)) + heartbeat = fedn.Heartbeat(sender=fedn.Client( + name=client_name, role=fedn.CLIENT, client_id=client_id)) try: - logger.info("Sending heartbeat to combiner") - response = self.connectorStub.SendHeartbeat(heartbeat, metadata=self.metadata) + #logger.info("Sending heartbeat to combiner") + response = self.connectorStub.SendHeartbeat( + heartbeat, metadata=self.metadata) except grpc.RpcError as e: logger.error(f"GRPC (SendHeartbeat): An error occurred: {e}") raise e @@ -142,7 +148,8 @@ def send_heartbeats(self, client_name: str, client_id: str, update_frequency: fl except Exception as e: return self._handle_unknown_error(e, "SendHeartbeat", lambda: self.send_heartbeats(client_name, client_id, update_frequency)) if isinstance(response, fedn.Response): - logger.info("Heartbeat successful.") + pass + #logger.info("Heartbeat successful.") else: logger.error("Heartbeat failed.") send_hearbeat = False @@ -172,7 +179,8 @@ def listen_to_task_stream(self, client_name: str, client_id: str, callback: Call sender_name=client_name, ) - logger.info(f"Received task request of type {request.type} for model_id {request.model_id}") + logger.info( + f"Received task request of type {request.type} for model_id {request.model_id}") callback(request) @@ -181,7 +189,8 @@ def listen_to_task_stream(self, client_name: str, client_id: str, callback: Call return self._handle_grpc_error(e, "TaskStream", lambda: self.listen_to_task_stream(client_name, client_id, callback)) except Exception as e: logger.error(f"GRPC (TaskStream): An error occurred: {e}") - self._handle_unknown_error(e, "TaskStream", lambda: self.listen_to_task_stream(client_name, client_id, callback)) + self._handle_unknown_error(e, "TaskStream", lambda: self.listen_to_task_stream( + client_name, client_id, callback)) def send_status(self, msg: str, log_level=fedn.LogLevel.INFO, type=None, request=None, sesssion_id: str = None, sender_name: str = None): """Send status message. @@ -216,7 +225,8 @@ def send_status(self, msg: str, log_level=fedn.LogLevel.INFO, type=None, request return self._handle_grpc_error(e, "SendStatus", lambda: self.send_status(msg, log_level, type, request, sesssion_id, sender_name)) except Exception as e: logger.error(f"GRPC (SendStatus): An error occurred: {e}") - self._handle_unknown_error(e, "SendStatus", lambda: self.send_status(msg, log_level, type, request, sesssion_id, sender_name)) + self._handle_unknown_error(e, "SendStatus", lambda: self.send_status( + msg, log_level, type, request, sesssion_id, sender_name)) def get_model_from_combiner(self, id: str, client_id: str, timeout: int = 20) -> BytesIO: """Fetch a model from the assigned combiner. 
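Both the download handled in the next hunk and the combiner-side upload move models as a stream of CHUNK_SIZE pieces; that constant is the knob the chunk-size experiments in the surrounding patches turn. A minimal sketch of the read loop, assuming the 1 MiB value the series settles on (iter_chunks is a hypothetical name, not part of FEDn):

```python
from io import BytesIO

CHUNK_SIZE = 1024 * 1024  # 1 MiB

def iter_chunks(mdl: BytesIO, chunk_size: int = CHUNK_SIZE):
    """Yield a serialized model in fixed-size pieces, the same read loop
    that upload_request_generator wraps in fedn.ModelRequest messages."""
    while True:
        b = mdl.read(chunk_size)
        if not b:
            break
        yield b
```

Larger chunks mean fewer gRPC messages per transfer, but each message must stay below the channel's maximum message length, which is why the MAX_MESSAGE_LENGTH experiments in the following patches accompany the CHUNK_SIZE changes.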
@@ -253,7 +263,8 @@ def get_model_from_combiner(self, id: str, client_id: str, timeout: int = 20) -> return self._handle_grpc_error(e, "Download", lambda: self.get_model_from_combiner(id, client_id, timeout)) except Exception as e: logger.error(f"GRPC (Download): An error occurred: {e}") - self._handle_unknown_error(e, "Download", lambda: self.get_model_from_combiner(id, client_id, timeout)) + self._handle_unknown_error( + e, "Download", lambda: self.get_model_from_combiner(id, client_id, timeout)) return data def send_model_to_combiner(self, model: BytesIO, id: str): @@ -279,12 +290,14 @@ def send_model_to_combiner(self, model: BytesIO, id: str): try: logger.info("Uploading model to combiner.") - result = self.modelStub.Upload(upload_request_generator(bt, id), metadata=self.metadata) + result = self.modelStub.Upload( + upload_request_generator(bt, id), metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error(e, "Upload", lambda: self.send_model_to_combiner(model, id)) except Exception as e: logger.error(f"GRPC (Upload): An error occurred: {e}") - self._handle_unknown_error(e, "Upload", lambda: self.send_model_to_combiner(model, id)) + self._handle_unknown_error( + e, "Upload", lambda: self.send_model_to_combiner(model, id)) return result def create_update_message( @@ -358,18 +371,21 @@ def create_prediction_message( def send_model_update(self, update: fedn.ModelUpdate): try: logger.info("Sending model update to combiner.") - _ = self.combinerStub.SendModelUpdate(update, metadata=self.metadata) + _ = self.combinerStub.SendModelUpdate( + update, metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error(e, "SendModelUpdate", lambda: self.send_model_update(update)) except Exception as e: logger.error(f"GRPC (SendModelUpdate): An error occurred: {e}") - self._handle_unknown_error(e, "SendModelUpdate", lambda: self.send_model_update(update)) + self._handle_unknown_error( + e, "SendModelUpdate", lambda: self.send_model_update(update)) return True def send_model_validation(self, validation: fedn.ModelValidation) -> bool: try: logger.info("Sending model validation to combiner.") - _ = self.combinerStub.SendModelValidation(validation, metadata=self.metadata) + _ = self.combinerStub.SendModelValidation( + validation, metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error( e, @@ -378,13 +394,15 @@ def send_model_validation(self, validation: fedn.ModelValidation) -> bool: ) except Exception as e: logger.error(f"GRPC (SendModelValidation): An error occurred: {e}") - self._handle_unknown_error(e, "SendModelValidation", lambda: self.send_model_validation(validation)) + self._handle_unknown_error( + e, "SendModelValidation", lambda: self.send_model_validation(validation)) return True def send_model_prediction(self, prediction: fedn.ModelPrediction) -> bool: try: logger.info("Sending model prediction to combiner.") - _ = self.combinerStub.SendModelPrediction(prediction, metadata=self.metadata) + _ = self.combinerStub.SendModelPrediction( + prediction, metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error( e, @@ -393,17 +411,20 @@ def send_model_prediction(self, prediction: fedn.ModelPrediction) -> bool: ) except Exception as e: logger.error(f"GRPC (SendModelPrediction): An error occurred: {e}") - self._handle_unknown_error(e, "SendModelPrediction", lambda: self.send_model_prediction(prediction)) + self._handle_unknown_error( + e, "SendModelPrediction", lambda: self.send_model_prediction(prediction)) 
return True def _handle_grpc_error(self, e, method_name: str, sender_function: Callable): status_code = e.code() if status_code == grpc.StatusCode.UNAVAILABLE: - logger.warning(f"GRPC ({method_name}): server unavailable. Retrying in 5 seconds.") + logger.warning( + f"GRPC ({method_name}): server unavailable. Retrying in 5 seconds.") time.sleep(5) return sender_function() elif status_code == grpc.StatusCode.CANCELLED: - logger.warning(f"GRPC ({method_name}): connection cancelled. Retrying in 5 seconds.") + logger.warning( + f"GRPC ({method_name}): connection cancelled. Retrying in 5 seconds.") time.sleep(5) return sender_function() elif status_code == grpc.StatusCode.UNAUTHENTICATED: @@ -412,10 +433,12 @@ def _handle_grpc_error(self, e, method_name: str, sender_function: Callable): logger.warning(f"GRPC ({method_name}): Token expired.") raise e elif status_code == grpc.StatusCode.UNKNOWN: - logger.warning(f"GRPC ({method_name}): An unknown error occurred: {e}.") + logger.warning( + f"GRPC ({method_name}): An unknown error occurred: {e}.") details = e.details() if details == "Stream removed": - logger.warning(f"GRPC ({method_name}): Stream removed. Reconnecting") + logger.warning( + f"GRPC ({method_name}): Stream removed. Reconnecting") self._disconnect() self._init_channel(self.host, self.port, self.token) self._init_stubs() @@ -427,7 +450,8 @@ def _handle_grpc_error(self, e, method_name: str, sender_function: Callable): def _handle_unknown_error(self, e, method_name: str, sender_function: Callable): # Try to reconnect - logger.warning(f"GRPC ({method_name}): An unknown error occurred: {e}.") + logger.warning( + f"GRPC ({method_name}): An unknown error occurred: {e}.") if isinstance(e, ValueError): # ValueError is raised when the channel is closed self._disconnect() diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py index f9de18ef4..76702e941 100644 --- a/fedn/network/combiner/modelservice.py +++ b/fedn/network/combiner/modelservice.py @@ -9,7 +9,7 @@ from fedn.common.log_config import logger from fedn.network.storage.models.tempmodelstorage import TempModelStorage -CHUNK_SIZE = 1024 * 1024 +CHUNK_SIZE = 4 * 1024 * 1024 def upload_request_generator(mdl, id): From 05f29fe4daf6b62e48785b20e319e6ec42376013 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 23:17:17 +0100 Subject: [PATCH 31/48] test --- fedn/network/clients/grpc_handler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fedn/network/clients/grpc_handler.py b/fedn/network/clients/grpc_handler.py index 9858c530d..e94f2d8f1 100644 --- a/fedn/network/clients/grpc_handler.py +++ b/fedn/network/clients/grpc_handler.py @@ -26,6 +26,7 @@ MAX_CONNECTION_IDLE_MS = 30000 MAX_CONNECTION_AGE_GRACE_MS = "INT_MAX" # keep connection open indefinitely CLIENT_IDLE_TIMEOUT_MS = 30000 +MAX_MESSAGE_LENGTH = 10 * 1024 * 1024 GRPC_OPTIONS = [ ("grpc.keepalive_time_ms", KEEPALIVE_TIME_MS), @@ -35,6 +36,8 @@ ("grpc.max_connection_idle_ms", MAX_CONNECTION_IDLE_MS), ("grpc.max_connection_age_grace_ms", MAX_CONNECTION_AGE_GRACE_MS), ("grpc.client_idle_timeout_ms", CLIENT_IDLE_TIMEOUT_MS), + ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), + ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), ] From 9bd128317cd721eef3e5a0088a8ec972962a7fde Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 23:22:49 +0100 Subject: [PATCH 32/48] test --- fedn/network/grpc/server.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git 
a/fedn/network/grpc/server.py b/fedn/network/grpc/server.py index 8523f4a64..a4e24f58a 100644 --- a/fedn/network/grpc/server.py +++ b/fedn/network/grpc/server.py @@ -5,7 +5,8 @@ from grpc_health.v1 import health, health_pb2_grpc import fedn.network.grpc.fedn_pb2_grpc as rpc -from fedn.common.log_config import logger, set_log_level_from_string, set_log_stream +from fedn.common.log_config import (logger, set_log_level_from_string, + set_log_stream) from fedn.network.combiner.shared import modelservice from fedn.network.grpc.auth import JWTInterceptor @@ -28,8 +29,11 @@ def __init__(self, servicer, config: ServerConfig): # Keepalive settings: these detect if the client is alive KEEPALIVE_TIME_MS = 60 * 1000 # send keepalive ping every 60 seconds - KEEPALIVE_TIMEOUT_MS = 20 * 1000 # wait 20 seconds for keepalive ping ack before considering connection dead - MAX_CONNECTION_IDLE_MS = 5 * 60 * 1000 # max idle time before server terminates the connection (5 minutes) + # wait 20 seconds for keepalive ping ack before considering connection dead + KEEPALIVE_TIMEOUT_MS = 20 * 1000 + # max idle time before server terminates the connection (5 minutes) + MAX_CONNECTION_IDLE_MS = 5 * 60 * 1000 + MAX_MESSAGE_LENGTH = 10 * 1024 * 1024 self.server = grpc.server( futures.ThreadPoolExecutor(max_workers=350), @@ -38,6 +42,8 @@ def __init__(self, servicer, config: ServerConfig): ("grpc.keepalive_time_ms", KEEPALIVE_TIME_MS), ("grpc.keepalive_timeout_ms", KEEPALIVE_TIMEOUT_MS), ("grpc.max_connection_idle_ms", MAX_CONNECTION_IDLE_MS), + ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), + ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), ], ) self.certificate = None @@ -54,7 +60,8 @@ def __init__(self, servicer, config: ServerConfig): if isinstance(servicer, rpc.CombinerServicer): rpc.add_ControlServicer_to_server(servicer, self.server) - health_pb2_grpc.add_HealthServicer_to_server(self.health_servicer, self.server) + health_pb2_grpc.add_HealthServicer_to_server( + self.health_servicer, self.server) if config["secure"]: logger.info("Creating secure gRPCS server using certificate") @@ -66,7 +73,8 @@ def __init__(self, servicer, config: ServerConfig): ), ) ) - self.server.add_secure_port("[::]:" + str(config["port"]), server_credentials) + self.server.add_secure_port( + "[::]:" + str(config["port"]), server_credentials) else: logger.info("Creating gRPC server") self.server.add_insecure_port("[::]:" + str(config["port"])) From ce258e68eaf1c80c4baaccedf7eb9e994381e84e Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 13 Dec 2024 23:54:21 +0100 Subject: [PATCH 33/48] test --- fedn/network/clients/grpc_handler.py | 3 --- fedn/network/combiner/modelservice.py | 2 +- fedn/network/grpc/server.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fedn/network/clients/grpc_handler.py b/fedn/network/clients/grpc_handler.py index e94f2d8f1..9858c530d 100644 --- a/fedn/network/clients/grpc_handler.py +++ b/fedn/network/clients/grpc_handler.py @@ -26,7 +26,6 @@ MAX_CONNECTION_IDLE_MS = 30000 MAX_CONNECTION_AGE_GRACE_MS = "INT_MAX" # keep connection open indefinitely CLIENT_IDLE_TIMEOUT_MS = 30000 -MAX_MESSAGE_LENGTH = 10 * 1024 * 1024 GRPC_OPTIONS = [ ("grpc.keepalive_time_ms", KEEPALIVE_TIME_MS), @@ -36,8 +35,6 @@ ("grpc.max_connection_idle_ms", MAX_CONNECTION_IDLE_MS), ("grpc.max_connection_age_grace_ms", MAX_CONNECTION_AGE_GRACE_MS), ("grpc.client_idle_timeout_ms", CLIENT_IDLE_TIMEOUT_MS), - ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), - ('grpc.max_receive_message_length', 
MAX_MESSAGE_LENGTH), ] diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py index 76702e941..ef5f9a75a 100644 --- a/fedn/network/combiner/modelservice.py +++ b/fedn/network/combiner/modelservice.py @@ -9,7 +9,7 @@ from fedn.common.log_config import logger from fedn.network.storage.models.tempmodelstorage import TempModelStorage -CHUNK_SIZE = 4 * 1024 * 1024 +CHUNK_SIZE = 2 * 1024 * 1024 def upload_request_generator(mdl, id): diff --git a/fedn/network/grpc/server.py b/fedn/network/grpc/server.py index a4e24f58a..a581c16bf 100644 --- a/fedn/network/grpc/server.py +++ b/fedn/network/grpc/server.py @@ -33,7 +33,7 @@ def __init__(self, servicer, config: ServerConfig): KEEPALIVE_TIMEOUT_MS = 20 * 1000 # max idle time before server terminates the connection (5 minutes) MAX_CONNECTION_IDLE_MS = 5 * 60 * 1000 - MAX_MESSAGE_LENGTH = 10 * 1024 * 1024 + MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 self.server = grpc.server( futures.ThreadPoolExecutor(max_workers=350), From e772691e4d41e4e3e3728fefe8c2f3d704b6e225 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 16 Dec 2024 11:25:32 +0100 Subject: [PATCH 34/48] Latest --- fedn/network/combiner/modelservice.py | 2 +- fedn/network/combiner/roundhandler.py | 84 ++++++++++++++++++--------- fedn/network/grpc/server.py | 3 - 3 files changed, 57 insertions(+), 32 deletions(-) diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py index ef5f9a75a..b2a11861d 100644 --- a/fedn/network/combiner/modelservice.py +++ b/fedn/network/combiner/modelservice.py @@ -9,7 +9,7 @@ from fedn.common.log_config import logger from fedn.network.storage.models.tempmodelstorage import TempModelStorage -CHUNK_SIZE = 2 * 1024 * 1024 +CHUNK_SIZE = 1 * 1024 * 1024 def upload_request_generator(mdl, id): diff --git a/fedn/network/combiner/roundhandler.py b/fedn/network/combiner/roundhandler.py index fa3d83e8f..604a77244 100644 --- a/fedn/network/combiner/roundhandler.py +++ b/fedn/network/combiner/roundhandler.py @@ -131,7 +131,8 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict) :return: an aggregated model and associated metadata :rtype: model, dict """ - logger.info("ROUNDHANDLER: Initiating training round, participating clients: {}".format(clients)) + logger.info( + "ROUNDHANDLER: Initiating training round, participating clients: {}".format(clients)) meta = {} meta["nr_expected_updates"] = len(clients) @@ -142,11 +143,14 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict) model_id = config["model_id"] if provided_functions.get("client_settings", False): - global_model_bytes = self.modelservice.temp_model_storage.get(model_id) - client_settings = self.hook_interface.client_settings(global_model_bytes) + global_model_bytes = self.modelservice.temp_model_storage.get( + model_id) + client_settings = self.hook_interface.client_settings( + global_model_bytes) config["client_settings"] = client_settings # Request model updates from all active clients. - self.server.request_model_update(session_id=session_id, model_id=model_id, config=config, clients=clients) + self.server.request_model_update( + session_id=session_id, model_id=model_id, config=config, clients=clients) # If buffer_size is -1 (default), the round terminates when/if all clients have completed. 
if int(config["buffer_size"]) == -1: @@ -161,7 +165,8 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict) data = None try: helper = get_helper(config["helper_type"]) - logger.info("Config delete_models_storage: {}".format(config["delete_models_storage"])) + logger.info("Config delete_models_storage: {}".format( + config["delete_models_storage"])) if config["delete_models_storage"] == "True": delete_models = True else: @@ -173,10 +178,13 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict) else: parameters = None if provided_functions.get("aggregate", False): - previous_model_bytes = self.modelservice.temp_model_storage.get(model_id) - model, data = self.hook_interface.aggregate(previous_model_bytes, self.update_handler, helper, delete_models=delete_models) + previous_model_bytes = self.modelservice.temp_model_storage.get( + model_id) + model, data = self.hook_interface.aggregate( + previous_model_bytes, self.update_handler, helper, delete_models=delete_models) else: - model, data = self.aggregator.combine_models(helper=helper, delete_models=delete_models, parameters=parameters) + model, data = self.aggregator.combine_models( + helper=helper, delete_models=delete_models, parameters=parameters) except Exception as e: logger.warning("AGGREGATION FAILED AT COMBINER! {}".format(e)) raise @@ -195,7 +203,8 @@ def _validation_round(self, session_id, model_id, clients): :param model_id: The ID of the model to validate :type model_id: str """ - self.server.request_model_validation(session_id, model_id, clients=clients) + self.server.request_model_validation( + session_id, model_id, clients=clients) def _prediction_round(self, prediction_id: str, model_id: str, clients: list): """Send model prediction requests to clients. @@ -207,7 +216,8 @@ def _prediction_round(self, prediction_id: str, model_id: str, clients: list): :param model_id: The ID of the model to use for prediction :type model_id: str """ - self.server.request_model_prediction(prediction_id, model_id, clients=clients) + self.server.request_model_prediction( + prediction_id, model_id, clients=clients) def stage_model(self, model_id, timeout_retry=3, retry=2): """Download a model from persistent storage and set in modelservice. @@ -221,7 +231,8 @@ def stage_model(self, model_id, timeout_retry=3, retry=2): """ # If the model is already in memory at the server we do not need to do anything. if self.modelservice.temp_model_storage.exist(model_id): - logger.info("Model already exists in memory, skipping model staging.") + logger.info( + "Model already exists in memory, skipping model staging.") return logger.info("Model Staging, fetching model from storage...") # If not, download it and stage it in memory at the combiner. 
@@ -232,11 +243,13 @@ def stage_model(self, model_id, timeout_retry=3, retry=2): if model: break except Exception: - logger.warning("Could not fetch model from storage backend, retrying.") + logger.warning( + "Could not fetch model from storage backend, retrying.") time.sleep(timeout_retry) tries += 1 if tries > retry: - logger.error("Failed to stage model {} from storage backend!".format(model_id)) + logger.error( + "Failed to stage model {} from storage backend!".format(model_id)) raise self.modelservice.set_model(model, model_id) @@ -256,7 +269,8 @@ def _assign_round_clients(self, n, type="trainers"): elif type == "trainers": clients = self.server.get_active_trainers() else: - logger.error("(ERROR): {} is not a supported type of client".format(type)) + logger.error( + "(ERROR): {} is not a supported type of client".format(type)) # If the number of requested trainers exceeds the number of available, use all available. n = min(n, len(clients)) @@ -278,7 +292,8 @@ def _check_nr_round_clients(self, config): """ active = self.server.nr_active_trainers() if active >= int(config["clients_required"]): - logger.info("Number of clients required ({0}) to start round met {1}.".format(config["clients_required"], active)) + logger.info("Number of clients required ({0}) to start round met {1}.".format( + config["clients_required"], active)) return True else: logger.info("Too few clients to start round.") @@ -290,9 +305,11 @@ def execute_validation_round(self, session_id, model_id): :param round_config: The round config object. :type round_config: dict """ - logger.info("COMBINER orchestrating validation of model {}".format(model_id)) + logger.info( + "COMBINER orchestrating validation of model {}".format(model_id)) self.stage_model(model_id) - validators = self._assign_round_clients(self.server.max_clients, type="validators") + validators = self._assign_round_clients( + self.server.max_clients, type="validators") self._validation_round(session_id, model_id, validators) def execute_prediction_round(self, prediction_id: str, model_id: str) -> None: @@ -301,10 +318,12 @@ def execute_prediction_round(self, prediction_id: str, model_id: str) -> None: :param round_config: The round config object. :type round_config: dict """ - logger.info("COMBINER orchestrating prediction using model {}".format(model_id)) + logger.info( + "COMBINER orchestrating prediction using model {}".format(model_id)) self.stage_model(model_id) # TODO: Implement prediction client type - clients = self._assign_round_clients(self.server.max_clients, type="validators") + clients = self._assign_round_clients( + self.server.max_clients, type="validators") self._prediction_round(prediction_id, model_id, clients) def execute_training_round(self, config): @@ -315,7 +334,8 @@ def execute_training_round(self, config): :return: metadata about the training round. :rtype: dict """ - logger.info("Processing training round, job_id {}".format(config["_job_id"])) + logger.info("Processing training round, job_id {}".format( + config["_job_id"])) data = {} data["config"] = config @@ -324,17 +344,20 @@ def execute_training_round(self, config): # Download model to update and set in temp storage. 
self.stage_model(config["model_id"]) - provided_functions = self.hook_interface.provided_functions(self.server_functions) + provided_functions = self.hook_interface.provided_functions( + self.server_functions) if provided_functions.get("client_selection", False): - clients = self.hook_interface.client_selection(clients=self.server.get_active_trainers()) + clients = self.hook_interface.client_selection( + clients=self.server.get_active_trainers()) else: clients = self._assign_round_clients(self.server.max_clients) model, meta = self._training_round(config, clients, provided_functions) data["data"] = meta if model is None: - logger.warning("\t Failed to update global model in round {0}!".format(config["round_id"])) + logger.warning( + "\t Failed to update global model in round {0}!".format(config["round_id"])) if model is not None: helper = get_helper(config["helper_type"]) @@ -343,7 +366,8 @@ def execute_training_round(self, config): a.close() data["model_id"] = model_id - logger.info("TRAINING ROUND COMPLETED. Aggregated model id: {}, Job id: {}".format(model_id, config["_job_id"])) + logger.info("TRAINING ROUND COMPLETED. Aggregated model id: {}, Job id: {}".format( + model_id, config["_job_id"])) # Delete temp model self.modelservice.temp_model_storage.delete(config["model_id"]) @@ -369,11 +393,14 @@ def run(self, polling_interval=1.0): session_id = round_config["session_id"] model_id = round_config["model_id"] tic = time.time() - round_meta = self.execute_training_round(round_config) - round_meta["time_exec_training"] = time.time() - tic + round_meta = self.execute_training_round( + round_config) + round_meta["time_exec_training"] = time.time() - \ + tic round_meta["status"] = "Success" round_meta["name"] = self.server.id - self.server.statestore.set_round_combiner_data(round_meta) + self.server.statestore.set_round_combiner_data( + round_meta) elif round_config["task"] == "validation": session_id = round_config["session_id"] model_id = round_config["model_id"] @@ -381,7 +408,8 @@ def run(self, polling_interval=1.0): elif round_config["task"] == "prediction": prediction_id = round_config["prediction_id"] model_id = round_config["model_id"] - self.execute_prediction_round(prediction_id, model_id) + self.execute_prediction_round( + prediction_id, model_id) else: logger.warning("config contains unkown task type.") else: diff --git a/fedn/network/grpc/server.py b/fedn/network/grpc/server.py index a581c16bf..7f6109324 100644 --- a/fedn/network/grpc/server.py +++ b/fedn/network/grpc/server.py @@ -33,7 +33,6 @@ def __init__(self, servicer, config: ServerConfig): KEEPALIVE_TIMEOUT_MS = 20 * 1000 # max idle time before server terminates the connection (5 minutes) MAX_CONNECTION_IDLE_MS = 5 * 60 * 1000 - MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 self.server = grpc.server( futures.ThreadPoolExecutor(max_workers=350), @@ -42,8 +41,6 @@ def __init__(self, servicer, config: ServerConfig): ("grpc.keepalive_time_ms", KEEPALIVE_TIME_MS), ("grpc.keepalive_timeout_ms", KEEPALIVE_TIMEOUT_MS), ("grpc.max_connection_idle_ms", MAX_CONNECTION_IDLE_MS), - ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), - ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), ], ) self.certificate = None From 608789067b52672e9de8ae939e8219cd08c60c92 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 16 Dec 2024 11:32:42 +0100 Subject: [PATCH 35/48] Added back time_model_load/aggregation --- fedn/network/combiner/aggregators/fedavg.py | 30 +++++++--- fedn/network/combiner/aggregators/fedopt.py | 66 
++++++++++++++------- 2 files changed, 68 insertions(+), 28 deletions(-) diff --git a/fedn/network/combiner/aggregators/fedavg.py b/fedn/network/combiner/aggregators/fedavg.py index 71dab273a..2bc9526c1 100644 --- a/fedn/network/combiner/aggregators/fedavg.py +++ b/fedn/network/combiner/aggregators/fedavg.py @@ -1,3 +1,4 @@ +import time import traceback from fedn.common.log_config import logger @@ -41,26 +42,37 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): nr_aggregated_models = 0 total_examples = 0 - logger.info("AGGREGATOR({}): Aggregating model updates... ".format(self.name)) + logger.info( + "AGGREGATOR({}): Aggregating model updates... ".format(self.name)) while not self.update_handler.model_updates.empty(): try: - logger.info("AGGREGATOR({}): Getting next model update from queue.".format(self.name)) + logger.info( + "AGGREGATOR({}): Getting next model update from queue.".format(self.name)) model_update = self.update_handler.next_model_update() # Load model parameters and metadata - logger.info("AGGREGATOR({}): Loading model metadata {}.".format(self.name, model_update.model_update_id)) - model_next, metadata = self.update_handler.load_model_update(model_update, helper) + logger.info("AGGREGATOR({}): Loading model metadata {}.".format( + self.name, model_update.model_update_id)) - logger.info("AGGREGATOR({}): Processing model update {}, metadata: {} ".format(self.name, model_update.model_update_id, metadata)) + tic = time.time() + model_next, metadata = self.update_handler.load_model_update( + model_update, helper) + data['time_model_load'] += time.time()-tic + + logger.info("AGGREGATOR({}): Processing model update {}, metadata: {} ".format( + self.name, model_update.model_update_id, metadata)) # Increment total number of examples total_examples += metadata["num_examples"] + tic = time.time() if nr_aggregated_models == 0: model = model_next else: - model = helper.increment_average(model, model_next, metadata["num_examples"], total_examples) + model = helper.increment_average( + model, model_next, metadata["num_examples"], total_examples) + data['time_model_aggregration'] += time.time()-tic nr_aggregated_models += 1 # Delete model from storage @@ -68,10 +80,12 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): self.update_handler.delete_model(model_update) except Exception as e: tb = traceback.format_exc() - logger.error(f"AGGREGATOR({self.name}): Error encoutered while processing model update: {e}") + logger.error( + f"AGGREGATOR({self.name}): Error encoutered while processing model update: {e}") logger.error(tb) data["nr_aggregated_models"] = nr_aggregated_models - logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format(self.name, nr_aggregated_models)) + logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format( + self.name, nr_aggregated_models)) return model, data diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index d91fe6d22..eb8e7cf4d 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -1,4 +1,5 @@ import math +import time from fedn.common.exceptions import InvalidParameterError from fedn.common.log_config import logger @@ -61,7 +62,8 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): try: parameters.validate(parameter_schema) except InvalidParameterError as e: - logger.error("Aggregator {} recieved invalid parameters. 
Reason {}".format(self.name, e)) + logger.error( + "Aggregator {} recieved invalid parameters. Reason {}".format(self.name, e)) return None, data # Default hyperparameters. Note that these may need fine tuning. @@ -78,10 +80,12 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): try: parameters.validate(parameter_schema) except InvalidParameterError as e: - logger.error("Aggregator {} recieved invalid parameters. Reason {}".format(self.name, e)) + logger.error( + "Aggregator {} recieved invalid parameters. Reason {}".format(self.name, e)) return None, data else: - logger.info("Aggregator {} using default parameteres.", format(self.name)) + logger.info("Aggregator {} using default parameteres.", + format(self.name)) parameters = self.default_parameters # Override missing paramters with defaults @@ -93,48 +97,67 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): nr_aggregated_models = 0 total_examples = 0 - logger.info("AGGREGATOR({}): Aggregating model updates... ".format(self.name)) + logger.info( + "AGGREGATOR({}): Aggregating model updates... ".format(self.name)) while not self.update_handler.model_updates.empty(): try: - logger.info("AGGREGATOR({}): Getting next model update from queue.".format(self.name)) + logger.info( + "AGGREGATOR({}): Getting next model update from queue.".format(self.name)) model_update = self.update_handler.next_model_update() # Load model paratmeters and metadata - model_next, metadata = self.update_handler.load_model_update(model_update, helper) - logger.info("AGGREGATOR({}): Processing model update {}".format(self.name, model_update.model_update_id)) + tic = time.time() + model_next, metadata = self.update_handler.load_model_update( + model_update, helper) + data['time_model_load'] += time.time()-tic + + logger.info("AGGREGATOR({}): Processing model update {}".format( + self.name, model_update.model_update_id)) # Increment total number of examples total_examples += metadata["num_examples"] + tic = time.time() if nr_aggregated_models == 0: - model_old = self.update_handler.load_model(helper, model_update.model_id) + model_old = self.update_handler.load_model( + helper, model_update.model_id) pseudo_gradient = helper.subtract(model_next, model_old) else: - pseudo_gradient_next = helper.subtract(model_next, model_old) - pseudo_gradient = helper.increment_average(pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) + pseudo_gradient_next = helper.subtract( + model_next, model_old) + pseudo_gradient = helper.increment_average( + pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) + data['time_model_aggregration'] += time.time()-tic nr_aggregated_models += 1 # Delete model from storage if delete_models: - self.update_handler.delete_model(model_update.model_update_id) - logger.info("AGGREGATOR({}): Deleted model update {} from storage.".format(self.name, model_update.model_update_id)) + self.update_handler.delete_model( + model_update.model_update_id) + logger.info("AGGREGATOR({}): Deleted model update {} from storage.".format( + self.name, model_update.model_update_id)) except Exception as e: - logger.error("AGGREGATOR({}): Error encoutered while processing model update {}, skipping this update.".format(self.name, e)) + logger.error( + "AGGREGATOR({}): Error encoutered while processing model update {}, skipping this update.".format(self.name, e)) if parameters["serveropt"] == "adam": - model = self.serveropt_adam(helper, pseudo_gradient, model_old, 
parameters) + model = self.serveropt_adam( + helper, pseudo_gradient, model_old, parameters) elif parameters["serveropt"] == "yogi": - model = self.serveropt_yogi(helper, pseudo_gradient, model_old, parameters) + model = self.serveropt_yogi( + helper, pseudo_gradient, model_old, parameters) elif parameters["serveropt"] == "adagrad": - model = self.serveropt_adagrad(helper, pseudo_gradient, model_old, parameters) + model = self.serveropt_adagrad( + helper, pseudo_gradient, model_old, parameters) else: logger.error("Unsupported server optimizer passed to FedOpt.") return None, data data["nr_aggregated_models"] = nr_aggregated_models - logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format(self.name, nr_aggregated_models)) + logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format( + self.name, nr_aggregated_models)) return model, data def serveropt_adam(self, helper, pseudo_gradient, model_old, parameters): @@ -160,7 +183,8 @@ def serveropt_adam(self, helper, pseudo_gradient, model_old, parameters): self.v = helper.ones(pseudo_gradient, math.pow(tau, 2)) if not self.m: - self.m = helper.multiply(pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) + self.m = helper.multiply( + pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) else: self.m = helper.add(self.m, pseudo_gradient, beta1, (1.0 - beta1)) @@ -196,7 +220,8 @@ def serveropt_yogi(self, helper, pseudo_gradient, model_old, parameters): self.v = helper.ones(pseudo_gradient, math.pow(tau, 2)) if not self.m: - self.m = helper.multiply(pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) + self.m = helper.multiply( + pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) else: self.m = helper.add(self.m, pseudo_gradient, beta1, (1.0 - beta1)) @@ -233,7 +258,8 @@ def serveropt_adagrad(self, helper, pseudo_gradient, model_old, parameters): self.v = helper.ones(pseudo_gradient, math.pow(tau, 2)) if not self.m: - self.m = helper.multiply(pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) + self.m = helper.multiply( + pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) else: self.m = helper.add(self.m, pseudo_gradient, beta1, (1.0 - beta1)) From 10a8ca44620e39a926b03195d0421766c6789fca Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 16 Dec 2024 12:56:37 +0100 Subject: [PATCH 36/48] Bugfix timing of aggregation --- fedn/network/combiner/aggregators/fedopt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index eb8e7cf4d..61a5858d3 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -128,7 +128,7 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): model_next, model_old) pseudo_gradient = helper.increment_average( pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) - data['time_model_aggregration'] += time.time()-tic + data['time_model_aggregation'] += time.time()-tic nr_aggregated_models += 1 # Delete model from storage From 5d188d118df43575d2f7dc76f5257a89d1622d08 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 16 Dec 2024 13:05:42 +0100 Subject: [PATCH 37/48] Fix bug in fedopt that causes model delete to fail in fedopt --- fedn/network/combiner/aggregators/fedopt.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py 
index 61a5858d3..5ce19384c 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -133,8 +133,7 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): nr_aggregated_models += 1 # Delete model from storage if delete_models: - self.update_handler.delete_model( - model_update.model_update_id) + self.update_handler.delete_model(model_update) logger.info("AGGREGATOR({}): Deleted model update {} from storage.".format( self.name, model_update.model_update_id)) except Exception as e: From c68c1afdead3bd9fa3f59ba2257747eb104a7294 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 16 Dec 2024 13:18:35 +0100 Subject: [PATCH 38/48] Improve error handling in fedopt --- fedn/network/combiner/aggregators/fedopt.py | 30 +++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index 5ce19384c..2c1ca6986 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -1,5 +1,6 @@ import math import time +import traceback from fedn.common.exceptions import InvalidParameterError from fedn.common.log_config import logger @@ -140,17 +141,24 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): logger.error( "AGGREGATOR({}): Error encoutered while processing model update {}, skipping this update.".format(self.name, e)) - if parameters["serveropt"] == "adam": - model = self.serveropt_adam( - helper, pseudo_gradient, model_old, parameters) - elif parameters["serveropt"] == "yogi": - model = self.serveropt_yogi( - helper, pseudo_gradient, model_old, parameters) - elif parameters["serveropt"] == "adagrad": - model = self.serveropt_adagrad( - helper, pseudo_gradient, model_old, parameters) - else: - logger.error("Unsupported server optimizer passed to FedOpt.") + try: + if parameters["serveropt"] == "adam": + model = self.serveropt_adam( + helper, pseudo_gradient, model_old, parameters) + elif parameters["serveropt"] == "yogi": + model = self.serveropt_yogi( + helper, pseudo_gradient, model_old, parameters) + elif parameters["serveropt"] == "adagrad": + model = self.serveropt_adagrad( + helper, pseudo_gradient, model_old, parameters) + else: + logger.error("Unsupported server optimizer passed to FedOpt.") + return None, data + except Exception as e: + tb = traceback.format_exc() + logger.error( + "AGGREGATOR({}): Error encountered while aggregating: {}".format(self.name, e)) + logger.error(tb) return None, data data["nr_aggregated_models"] = nr_aggregated_models From 500563ce98d05dc57c86cfc600494874f5d9d6a6 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 16 Dec 2024 13:28:19 +0100 Subject: [PATCH 39/48] Bugfix --- fedn/network/combiner/aggregators/fedavg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedn/network/combiner/aggregators/fedavg.py b/fedn/network/combiner/aggregators/fedavg.py index 2bc9526c1..d3e1c513f 100644 --- a/fedn/network/combiner/aggregators/fedavg.py +++ b/fedn/network/combiner/aggregators/fedavg.py @@ -72,7 +72,7 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): else: model = helper.increment_average( model, model_next, metadata["num_examples"], total_examples) - data['time_model_aggregration'] += time.time()-tic + data['time_model_aggregation'] += time.time()-tic nr_aggregated_models += 1 # Delete model from storage From
dd5db2f9d1639dc7ea17779c04d3e7377812c06c Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Mon, 16 Dec 2024 15:08:51 +0100 Subject: [PATCH 40/48] ruff linting --- examples/load-test/client/model.py | 8 +++----- examples/load-test/client/train.py | 7 +++---- examples/load-test/client/validate.py | 7 +++---- fedn/network/combiner/aggregators/fedavg.py | 4 ++-- fedn/network/combiner/aggregators/fedopt.py | 4 ++-- 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/examples/load-test/client/model.py b/examples/load-test/client/model.py index 4d6b89a31..6f61069c0 100644 --- a/examples/load-test/client/model.py +++ b/examples/load-test/client/model.py @@ -1,12 +1,10 @@ # /bin/python -import sys -import time import numpy as np -from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics +from fedn.utils.helpers.helpers import get_helper -HELPER_MODULE = 'numpyhelper' +HELPER_MODULE = "numpyhelper" ARRAY_SIZE = 20000000 @@ -35,7 +33,7 @@ def load_model(model_path): return weights -def init_seed(out_path='seed.npz'): +def init_seed(out_path="seed.npz"): """ Initialize seed model. :param out_path: The path to save the seed model to. diff --git a/examples/load-test/client/train.py b/examples/load-test/client/train.py index 5bfe38ede..bfd5f00b7 100644 --- a/examples/load-test/client/train.py +++ b/examples/load-test/client/train.py @@ -5,9 +5,9 @@ import numpy as np from model import load_model, save_model -from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics +from fedn.utils.helpers.helpers import save_metadata -HELPER_MODULE = 'numpyhelper' +HELPER_MODULE = "numpyhelper" ARRAY_SIZE = 10000 @@ -15,7 +15,6 @@ def train(in_model_path, out_model_path): """ Train model. """ - # Load model weights = load_model(in_model_path) @@ -24,7 +23,7 @@ def train(in_model_path, out_model_path): # Metadata needed for aggregation server side metadata = { - 'num_examples': ARRAY_SIZE, + "num_examples": ARRAY_SIZE, } # Save JSON metadata file diff --git a/examples/load-test/client/validate.py b/examples/load-test/client/validate.py index c79a0d169..8d710b546 100644 --- a/examples/load-test/client/validate.py +++ b/examples/load-test/client/validate.py @@ -1,13 +1,12 @@ # /bin/python import sys -import time import numpy as np -from model import load_model, save_model +from model import load_model -from fedn.utils.helpers.helpers import get_helper, save_metadata, save_metrics +from fedn.utils.helpers.helpers import save_metrics -HELPER_MODULE = 'numpyhelper' +HELPER_MODULE = "numpyhelper" ARRAY_SIZE = 1000000 diff --git a/fedn/network/combiner/aggregators/fedavg.py b/fedn/network/combiner/aggregators/fedavg.py index d3e1c513f..7734a7051 100644 --- a/fedn/network/combiner/aggregators/fedavg.py +++ b/fedn/network/combiner/aggregators/fedavg.py @@ -58,7 +58,7 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): tic = time.time() model_next, metadata = self.update_handler.load_model_update( model_update, helper) - data['time_model_load'] += time.time()-tic + data["time_model_load"] += time.time()-tic logger.info("AGGREGATOR({}): Processing model update {}, metadata: {} ".format( self.name, model_update.model_update_id, metadata)) @@ -72,7 +72,7 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): else: model = helper.increment_average( model, model_next, metadata["num_examples"], total_examples) - data['time_model_aggregation'] += time.time()-tic + data["time_model_aggregation"] += time.time()-tic 
nr_aggregated_models += 1 # Delete model from storage diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index 2c1ca6986..6ed62e078 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -111,7 +111,7 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): tic = time.time() model_next, metadata = self.update_handler.load_model_update( model_update, helper) - data['time_model_load'] += time.time()-tic + data["time_model_load"] += time.time()-tic logger.info("AGGREGATOR({}): Processing model update {}".format( self.name, model_update.model_update_id)) @@ -129,7 +129,7 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): model_next, model_old) pseudo_gradient = helper.increment_average( pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) - data['time_model_aggregation'] += time.time()-tic + data["time_model_aggregation"] += time.time()-tic nr_aggregated_models += 1 # Delete model from storage From f8f9732b817af1e3f805cff7912b62f408ed978d Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 8 Jan 2025 16:05:37 +0100 Subject: [PATCH 41/48] Train callback compatible with 0.20.0 --- examples/cifar100/run_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cifar100/run_client.py b/examples/cifar100/run_client.py index e94a9cbed..caf6c6cb6 100644 --- a/examples/cifar100/run_client.py +++ b/examples/cifar100/run_client.py @@ -26,7 +26,7 @@ def get_api_url(api_url: str, api_port: int, secure: bool = False): return url -def on_train(in_model): +def on_train(in_model, client_settings): # Save model to temp file inpath = helper.get_tmp_path() with open(inpath, "wb") as fh: From 0150ab91d25300bf6307e26ddb56a9b11f236e51 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Thu, 9 Jan 2025 16:31:53 +0100 Subject: [PATCH 42/48] wip --- fedn/network/combiner/roundhandler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedn/network/combiner/roundhandler.py b/fedn/network/combiner/roundhandler.py index 604a77244..dae885a1c 100644 --- a/fedn/network/combiner/roundhandler.py +++ b/fedn/network/combiner/roundhandler.py @@ -30,7 +30,7 @@ class RoundConfig(TypedDict): :type round_id: str :param round_timeout: The round timeout in seconds. Set by user interfaces or Controller. :type round_timeout: str - :param rounds: The number of rounds. Set by user interfaces. + :param rounds: The number of rounds. Set by user interfaces.w :param model_id: The model identifier. Set by user interfaces or Controller (get_latest_model). :type model_id: str :param model_version: The model version. Currently not used. 
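A reading aid before the next two commits, which restructure this exact aggregation loop: both FedAvg and FedOpt fold client updates into a running example-weighted mean through helper.increment_average(avg, next, num_examples, total_examples). Below is a minimal, self-contained numpy sketch of that folding. It assumes the helper computes avg + (n / N) * (next - avg) per parameter array; the increment_average defined here is illustrative only and is not FEDn's actual helper, so verify against fedn.utils.helpers before relying on it.

```python
# Sketch of the running example-weighted average used by the aggregators.
# Assumption: avg <- avg + (n / N) * (x - avg), where n is the update's
# example count and N is the running total of examples seen so far.
import numpy as np

def increment_average(avg, x, num_examples, total_examples):
    w = num_examples / total_examples
    return [a + w * (b - a) for a, b in zip(avg, x)]

# Three "clients" with different dataset sizes.
updates = [([np.full(3, 1.0)], 10), ([np.full(3, 4.0)], 30), ([np.full(3, 2.0)], 20)]

model, total_examples = None, 0
for params, n in updates:
    total_examples += n
    if model is None:
        model = params
    else:
        model = increment_average(model, params, n, total_examples)

print(model[0])  # [2.8333...] == (10*1 + 30*4 + 20*2) / 60
```

Folding one update at a time like this is what lets the combiner aggregate straight off the update queue without holding all client models in memory at once.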
From ef184fa5a11afed53978d71e07f409424f1c6aed Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Thu, 9 Jan 2025 22:30:15 +0100 Subject: [PATCH 43/48] Defined pseudo_gradient --- fedn/network/combiner/aggregators/fedopt.py | 50 +++++++++++---------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index 6ed62e078..ff83b0c76 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -94,9 +94,9 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): if key not in parameters: parameters[key] = value - model = None - nr_aggregated_models = 0 - total_examples = 0 + # Aggregation initialization + model, pseudo_gradient = None, None + nr_aggregated_models, total_examples = 0, 0 logger.info( "AGGREGATOR({}): Aggregating model updates... ".format(self.name)) @@ -139,30 +139,34 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): self.name, model_update.model_update_id)) except Exception as e: logger.error( - "AGGREGATOR({}): Error encountered while processing model update {}, skipping this update.".format(self.name, e)) + "AGGREGATOR({}): Error encountered while processing model update {}, skipping this update.".format(self.name, e)) + data["nr_aggregated_models"] = nr_aggregated_models + + if pseudo_gradient: + try: + if parameters["serveropt"] == "adam": + model = self.serveropt_adam( + helper, pseudo_gradient, model_old, parameters) + elif parameters["serveropt"] == "yogi": + model = self.serveropt_yogi( + helper, pseudo_gradient, model_old, parameters) + elif parameters["serveropt"] == "adagrad": + model = self.serveropt_adagrad( + helper, pseudo_gradient, model_old, parameters) + else: + logger.error( + "Unsupported server optimizer passed to FedOpt.") + return None, data + except Exception as e: + tb = traceback.format_exc() + logger.error( + "AGGREGATOR({}): Error encountered while aggregating: {}".format(self.name, e)) + logger.error(tb) return None, data + else: return None, data - data["nr_aggregated_models"] = nr_aggregated_models - logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format( self.name, nr_aggregated_models)) return model, data From aa11dc79e9ce24bbe8383135057dfb45d0bd0aa4 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 10 Jan 2025 12:10:09 +0100 Subject: [PATCH 44/48] Refactor FedOpt --- fedn/network/combiner/aggregators/fedopt.py | 193 ++++++++++---------- 1 file changed, 100 insertions(+), 93 deletions(-) diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index ff83b0c76..734b91dba 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -1,10 +1,13 @@ import
math import time import traceback +from typing import Any, Dict, Optional, Tuple from fedn.common.exceptions import InvalidParameterError from fedn.common.log_config import logger from fedn.network.combiner.aggregators.aggregatorbase import AggregatorBase +from fedn.utils.helpers.helperbase import HelperBase +from fedn.utils.parameters import Parameters class Aggregator(AggregatorBase): @@ -17,6 +20,9 @@ class Aggregator(AggregatorBase): A server-side scheme is then applied, currently supported schemes are "adam", "yogi", "adagrad". + Limitations: + - Only supports one combiner. + - Momentum is reset for each new invocation of a training session. :param control: A handle to the :class: `fedn.network.combiner.updatehandler.UpdateHandler` :type control: class: `fedn.network.combiner.updatehandler.UpdateHandler` @@ -31,43 +37,23 @@ def __init__(self, update_handler): self.v = None self.m = None - def combine_models(self, helper=None, delete_models=True, parameters=None): - """Compute pseudo gradients using model updates in the queue. - - :param helper: An instance of :class: `fedn.utils.helpers.helpers.HelperBase`, ML framework specific helper, defaults to None - :type helper: class: `fedn.utils.helpers.helpers.HelperBase`, optional - :param time_window: The time window for model aggregation, defaults to 180 - :type time_window: int, optional - :param max_nr_models: The maximum number of updates aggregated, defaults to 100 - :type max_nr_models: int, optional - :param delete_models: Delete models from storage after aggregation, defaults to True - :type delete_models: bool, optional - :param parameters: Aggregator hyperparameters. - :type parameters: `fedn.utils.parmeters.Parameters`, optional - :return: The global model and metadata - :rtype: tuple + def combine_models( + self, + helper: Optional[HelperBase] = None, + delete_models: bool = True, + parameters: Optional[Parameters] = None + ) -> Tuple[Optional[Any], Dict[str, float]]: """ - data = {} - data["time_model_load"] = 0.0 - data["time_model_aggregation"] = 0.0 + Compute pseudo gradients using model updates in the queue. - # Define parameter schema - parameter_schema = { - "serveropt": str, - "learning_rate": float, - "beta1": float, - "beta2": float, - "tau": float, - } - - try: - parameters.validate(parameter_schema) - except InvalidParameterError as e: - logger.error( - "Aggregator {} recieved invalid parameters. Reason {}".format(self.name, e)) - return None, data + :param helper: ML framework-specific helper, defaults to None. + :param delete_models: Delete models from storage after aggregation, defaults to True. + :param parameters: Aggregator hyperparameters, defaults to None. + :return: The global model and metadata. + """ + data = {"time_model_load": 0.0, "time_model_aggregation": 0.0} - # Default hyperparameters. Note that these may need fine tuning. + # Default hyperparameters default_parameters = { "serveropt": "adam", "learning_rate": 1e-3, @@ -76,101 +62,122 @@ def combine_models( "tau": 1e-4, } - # Validate parameters - if parameters: - try: - parameters.validate(parameter_schema) - except InvalidParameterError as e: - logger.error( - "Aggregator {} recieved invalid parameters. 
Reason {}".format(self.name, e)) - return None, data - else: - logger.info("Aggregator {} using default parameteres.", - format(self.name)) - parameters = self.default_parameters + # Validate and merge parameters + try: + parameters = self._validate_and_merge_parameters( + parameters, default_parameters) + except InvalidParameterError as e: + logger.error( + f"Aggregator {self.name} received invalid parameters: {e}") + return None, data - # Override missing paramters with defaults - for key, value in default_parameters.items(): - if key not in parameters: - parameters[key] = value + logger.info(f"Aggregator {self.name} starting model aggregation.") # Aggregation initialization - model, pseudo_gradient = None, None + pseudo_gradient, model_old = None, None nr_aggregated_models, total_examples = 0, 0 - logger.info( - "AGGREGATOR({}): Aggregating model updates... ".format(self.name)) - while not self.update_handler.model_updates.empty(): try: logger.info( - "AGGREGATOR({}): Getting next model update from queue.".format(self.name)) + f"Aggregator {self.name}: Fetching next model update.") model_update = self.update_handler.next_model_update() - # Load model paratmeters and metadata tic = time.time() model_next, metadata = self.update_handler.load_model_update( model_update, helper) - data["time_model_load"] += time.time()-tic + data["time_model_load"] += time.time() - tic - logger.info("AGGREGATOR({}): Processing model update {}".format( - self.name, model_update.model_update_id)) + logger.info( + f"Processing model update {model_update.model_update_id}") - # Increment total number of examples + # Increment total examples total_examples += metadata["num_examples"] tic = time.time() - if nr_aggregated_models == 0: - model_old = self.update_handler.load_model( - helper, model_update.model_id) - pseudo_gradient = helper.subtract(model_next, model_old) - else: - pseudo_gradient_next = helper.subtract( - model_next, model_old) - pseudo_gradient = helper.increment_average( - pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) - data["time_model_aggregation"] += time.time()-tic + pseudo_gradient, model_old = self._update_pseudo_gradient( + helper, pseudo_gradient, model_next, model_old, metadata, nr_aggregated_models, total_examples + ) + data["time_model_aggregation"] += time.time() - tic nr_aggregated_models += 1 - # Delete model from storage + if delete_models: self.update_handler.delete_model(model_update) - logger.info("AGGREGATOR({}): Deleted model update {} from storage.".format( - self.name, model_update.model_update_id)) + logger.info( + f"Deleted model update {model_update.model_update_id} from storage.") except Exception as e: logger.error( - "AGGREGATOR({}): Error encountered while processing model update {}, skipping this update.".format(self.name, e)) + f"Error processing model update: {e}. 
Skipping this update.") + logger.error(traceback.format_exc()) + continue data["nr_aggregated_models"] = nr_aggregated_models if pseudo_gradient: try: + model = self._apply_server_optimizer( + helper, pseudo_gradient, model_old, parameters) except Exception as e: - tb = traceback.format_exc() - logger.error( - "AGGREGATOR({}): Error encountered while aggregating: {}".format(self.name, e)) - logger.error(tb) + logger.error(f"Error during model aggregation: {e}") + logger.error(traceback.format_exc()) return None, data else: return None, data - logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format( - self.name, nr_aggregated_models)) + logger.info( + f"Aggregator {self.name} completed. Aggregated {nr_aggregated_models} models.") return model, data + def _validate_and_merge_parameters( + self, parameters: Optional[Parameters], default_parameters: Dict[str, Any] + ) -> Dict[str, Any]: + """Validate and merge default parameters.""" + parameter_schema = { + "serveropt": str, + "learning_rate": float, + "beta1": float, + "beta2": float, + "tau": float, + } + if parameters: + parameters.validate(parameter_schema) + else: + logger.info(f"Aggregator {self.name} using default parameters.") + parameters = {} + return {**default_parameters, **parameters} + + def _update_pseudo_gradient( + self, helper: HelperBase, pseudo_gradient: Any, model_next: Any, model_old: Any, + metadata: Dict[str, Any], nr_aggregated_models: int, total_examples: int + ) -> Tuple[Any, Any]: + """Update pseudo gradient based on the current model.""" + if nr_aggregated_models == 0: + model_old = self.update_handler.load_model( + helper, metadata["model_id"]) + pseudo_gradient = helper.subtract(model_next, model_old) + else: + pseudo_gradient_next = helper.subtract(model_next, model_old) + pseudo_gradient = helper.increment_average( + pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples + ) + return pseudo_gradient, model_old + + def _apply_server_optimizer( + self, helper: HelperBase, pseudo_gradient: Any, model_old: Any, parameters: Dict[str, Any] + ) -> Any: + """Apply the selected server optimizer to compute the new model.""" + optimizer_map = { + "adam": self.serveropt_adam, + "yogi": self.serveropt_yogi, + "adagrad": self.serveropt_adagrad, + } + optimizer = optimizer_map.get(parameters["serveropt"]) + if not optimizer: + raise ValueError( + f"Unsupported server optimizer: {parameters['serveropt']}") + return optimizer(helper, pseudo_gradient, model_old, parameters) + def serveropt_adam(self, helper, pseudo_gradient, model_old, parameters): """Server side optimization, FedAdam. 
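For orientation while reading the refactor above: the serveropt_* methods that _apply_server_optimizer dispatches to implement the server-side rules of FedOpt (Reddi et al., "Adaptive Federated Optimization"). What follows is a hedged numpy sketch of the three second-moment rules and the shared update step. It reuses the patch's hyperparameter names (serveropt, learning_rate, beta1, beta2, tau) but is not FEDn's helper API, and the exact forms should be checked against the serveropt_* bodies.

```python
# Illustrative FedOpt server step. The combiner treats the aggregated
# pseudo-gradient g as a "gradient" and applies an adaptive optimizer:
#   m <- beta1 * m + (1 - beta1) * g            (first moment, all schemes)
#   v <- scheme-specific second moment
#   x <- x_old + learning_rate * m / (sqrt(v) + tau)
import numpy as np

def server_step(x_old, g, m, v, p):
    m = p["beta1"] * m + (1.0 - p["beta1"]) * g
    if p["serveropt"] == "adam":
        v = p["beta2"] * v + (1.0 - p["beta2"]) * g**2
    elif p["serveropt"] == "yogi":
        v = v - (1.0 - p["beta2"]) * g**2 * np.sign(v - g**2)
    elif p["serveropt"] == "adagrad":
        v = v + g**2
    else:
        raise ValueError(f"Unsupported server optimizer: {p['serveropt']}")
    return x_old + p["learning_rate"] * m / (np.sqrt(v) + p["tau"]), m, v

p = {"serveropt": "adam", "learning_rate": 1e-3, "beta1": 0.9, "beta2": 0.99, "tau": 1e-4}
x = np.zeros(4)
m, v = np.zeros(4), np.full(4, p["tau"] ** 2)  # v seeded with tau^2, as in the patch
g = np.array([0.1, -0.2, 0.05, 0.0])           # aggregated pseudo-gradient
x, m, v = server_step(x, g, m, v, p)
print(x)
```

Keeping m and v on the aggregator instance, as the patch does, is also why the class docstring warns that momentum is reset whenever a new training session instantiates the aggregator.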
From 24e60c59ff7c880c78e84b4969c5bf26af5e33a8 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Fri, 10 Jan 2025 16:27:56 +0100 Subject: [PATCH 45/48] wip --- examples/cifar100/config.py | 2 +- fedn/network/combiner/aggregators/fedopt.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/cifar100/config.py b/examples/cifar100/config.py index f2d5132df..90cad5f71 100644 --- a/examples/cifar100/config.py +++ b/examples/cifar100/config.py @@ -1,6 +1,6 @@ settings = { "N_CLIENTS": 5, - "DISCOVER_HOST": "localhost", + "DISCOVER_HOST": "api-server", "DISCOVER_PORT": 8092, "SECURE": False, "VERIFY": False, diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index 734b91dba..ba2ce39ae 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -159,8 +159,7 @@ def _update_pseudo_gradient( else: pseudo_gradient_next = helper.subtract(model_next, model_old) pseudo_gradient = helper.increment_average( - pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples - ) + pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) return pseudo_gradient, model_old def _apply_server_optimizer( From e1cc4a1fd30e8359530cdab2b8aadf63844955de Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Sat, 11 Jan 2025 14:28:59 +0100 Subject: [PATCH 46/48] Fixed review comments --- examples/load-test/README.md | 2 +- examples/load-test/requirements.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 examples/load-test/requirements.txt diff --git a/examples/load-test/README.md b/examples/load-test/README.md index aca3e2b2c..4f855b7ec 100644 --- a/examples/load-test/README.md +++ b/examples/load-test/README.md @@ -7,7 +7,7 @@ test the performance of FEDn deployment in a flexible way by sending and aggregating numeric arrays of varying size. 
## Prerequisites -- [Python 3.9 or 3.10](https://www.python.org/downloads) +- [Python 3.9, 3.10, 3.11 or 3.12](https://www.python.org/downloads) ## Running the example (pseudo-distributed, single host) diff --git a/examples/load-test/requirements.txt b/examples/load-test/requirements.txt new file mode 100644 index 000000000..40c5843aa --- /dev/null +++ b/examples/load-test/requirements.txt @@ -0,0 +1,2 @@ +fedn +numpy \ No newline at end of file From 99608b6414a59651dc127ad9992eaaa6b1a97ea2 Mon Sep 17 00:00:00 2001 From: Andreas Hellander Date: Sat, 11 Jan 2025 14:45:06 +0100 Subject: [PATCH 47/48] fix --- examples/cifar100/config.py | 2 +- fedn/network/combiner/aggregators/fedopt.py | 28 ++++++++------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/examples/cifar100/config.py b/examples/cifar100/config.py index 90cad5f71..f2d5132df 100644 --- a/examples/cifar100/config.py +++ b/examples/cifar100/config.py @@ -1,6 +1,6 @@ settings = { "N_CLIENTS": 5, - "DISCOVER_HOST": "api-server", + "DISCOVER_HOST": "localhost", "DISCOVER_PORT": 8092, "SECURE": False, "VERIFY": False, diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index ba2ce39ae..be3d6a691 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -95,9 +95,16 @@ def combine_models( total_examples += metadata["num_examples"] tic = time.time() - pseudo_gradient, model_old = self._update_pseudo_gradient( - helper, pseudo_gradient, model_next, model_old, metadata, nr_aggregated_models, total_examples - ) + if nr_aggregated_models == 0: + model_old = self.update_handler.load_model( + helper, model_update.model_id) + pseudo_gradient = helper.subtract(model_next, model_old) + else: + pseudo_gradient_next = helper.subtract( + model_next, model_old) + pseudo_gradient = helper.increment_average( + pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) + data["time_model_aggregation"] += time.time() - tic nr_aggregated_models += 1 @@ -147,21 +154,6 @@ def _validate_and_merge_parameters( parameters = {} return {**default_parameters, **parameters} - def _update_pseudo_gradient( - self, helper: HelperBase, pseudo_gradient: Any, model_next: Any, model_old: Any, - metadata: Dict[str, Any], nr_aggregated_models: int, total_examples: int - ) -> Tuple[Any, Any]: - """Update pseudo gradient based on the current model.""" - if nr_aggregated_models == 0: - model_old = self.update_handler.load_model( - helper, metadata["model_id"]) - pseudo_gradient = helper.subtract(model_next, model_old) - else: - pseudo_gradient_next = helper.subtract(model_next, model_old) - pseudo_gradient = helper.increment_average( - pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) - return pseudo_gradient, model_old - def _apply_server_optimizer( self, helper: HelperBase, pseudo_gradient: Any, model_old: Any, parameters: Dict[str, Any] ) -> Any: From 94c6a53a311df10af1b642e595741d5fd8940da3 Mon Sep 17 00:00:00 2001 From: benjaminastrand Date: Wed, 15 Jan 2025 16:41:48 +0100 Subject: [PATCH 48/48] Ruff --- examples/mnist-pytorch/client/model.py | 6 +- examples/mnist-pytorch/client/train.py | 2 +- fedn/network/clients/client_v2.py | 24 ++---- fedn/network/clients/fedn_client.py | 57 +++++--------- fedn/network/clients/grpc_handler.py | 67 ++++++---------- fedn/network/combiner/aggregators/fedavg.py | 28 +++---- fedn/network/combiner/aggregators/fedopt.py | 64 +++++---------- 
fedn/network/combiner/modelservice.py | 24 ++---- fedn/network/combiner/roundhandler.py | 86 +++++++-------------- fedn/network/grpc/server.py | 9 +-- 10 files changed, 123 insertions(+), 244 deletions(-) diff --git a/examples/mnist-pytorch/client/model.py b/examples/mnist-pytorch/client/model.py index d400a56bc..6ad344770 100644 --- a/examples/mnist-pytorch/client/model.py +++ b/examples/mnist-pytorch/client/model.py @@ -40,8 +40,7 @@ def save_parameters(model, out_path): :param out_path: The path to save to. :type out_path: str """ - parameters_np = [val.cpu().numpy() - for _, val in model.state_dict().items()] + parameters_np = [val.cpu().numpy() for _, val in model.state_dict().items()] helper.save(parameters_np, out_path) @@ -57,8 +56,7 @@ def load_parameters(model_path): parameters_np = helper.load(model_path) params_dict = zip(model.state_dict().keys(), parameters_np) - state_dict = collections.OrderedDict( - {key: torch.tensor(x) for key, x in params_dict}) + state_dict = collections.OrderedDict({key: torch.tensor(x) for key, x in params_dict}) model.load_state_dict(state_dict, strict=True) return model diff --git a/examples/mnist-pytorch/client/train.py b/examples/mnist-pytorch/client/train.py index 9ac9cce61..919d95ee9 100644 --- a/examples/mnist-pytorch/client/train.py +++ b/examples/mnist-pytorch/client/train.py @@ -3,9 +3,9 @@ import sys import torch +from data import load_data from model import load_parameters, save_parameters -from data import load_data from fedn.utils.helpers.helpers import save_metadata dir_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/fedn/network/clients/client_v2.py b/fedn/network/clients/client_v2.py index 444e29bbe..3fa58e6fd 100644 --- a/fedn/network/clients/client_v2.py +++ b/fedn/network/clients/client_v2.py @@ -8,8 +8,7 @@ from fedn.common.config import FEDN_CUSTOM_URL_PREFIX from fedn.common.log_config import logger -from fedn.network.clients.fedn_client import (ConnectToApiResult, FednClient, - GrpcConnectionOptions) +from fedn.network.clients.fedn_client import ConnectToApiResult, FednClient, GrpcConnectionOptions from fedn.network.combiner.modelservice import get_tmp_path from fedn.utils.helpers.helpers import get_helper, save_metadata @@ -77,8 +76,7 @@ def _connect_to_api(self) -> Tuple[bool, dict]: if result == ConnectToApiResult.ComputePackageMissing: logger.info("Retrying in 3 seconds") time.sleep(3) - result, response = self.fedn_client.connect_to_api( - self.fedn_api_url, self.token, self.client_obj.to_json()) + result, response = self.fedn_client.connect_to_api(self.fedn_api_url, self.token, self.client_obj.to_json()) if result == ConnectToApiResult.Assigned: return True, response @@ -96,8 +94,7 @@ def start(self): if not result: return if self.client_obj.package == "remote": - result = self.fedn_client.init_remote_compute_package( - url=self.fedn_api_url, token=self.token, package_checksum=self.package_checksum) + result = self.fedn_client.init_remote_compute_package(url=self.fedn_api_url, token=self.token, package_checksum=self.package_checksum) if not result: return @@ -109,8 +106,7 @@ def start(self): self.set_helper(combiner_config) - result: bool = self.fedn_client.init_grpchandler( - config=combiner_config, client_name=self.client_obj.client_id, token=self.token) + result: bool = self.fedn_client.init_grpchandler(config=combiner_config, client_name=self.client_obj.client_id, token=self.token) if not result: return @@ -136,8 +132,7 @@ def set_helper(self, response: GrpcConnectionOptions = None): self.helper = 
get_helper(helper_type_to_use) def on_train(self, in_model, client_settings): - out_model, meta = self._process_training_request( - in_model, client_settings) + out_model, meta = self._process_training_request(in_model, client_settings) return out_model, meta def on_validation(self, in_model): @@ -166,8 +161,7 @@ def _process_training_request(self, in_model: BytesIO, client_settings: dict) -> tic = time.time() - self.fedn_client.dispatcher.run_cmd( - "train {} {}".format(inpath, outpath)) + self.fedn_client.dispatcher.run_cmd("train {} {}".format(inpath, outpath)) meta["exec_training"] = time.time() - tic @@ -180,8 +174,7 @@ def _process_training_request(self, in_model: BytesIO, client_settings: dict) -> with open(outpath + "-metadata", "r") as fh: training_metadata = json.loads(fh.read()) - logger.info("SETTING Training metadata: {}".format( - training_metadata)) + logger.info("SETTING Training metadata: {}".format(training_metadata)) meta["training_metadata"] = training_metadata os.unlink(inpath) @@ -189,8 +182,7 @@ def _process_training_request(self, in_model: BytesIO, client_settings: dict) -> os.unlink(outpath + "-metadata") except Exception as e: - logger.error( - "Could not process training request due to error: {}".format(e)) + logger.error("Could not process training request due to error: {}".format(e)) out_model = None meta = {"status": "failed", "error": str(e)} diff --git a/fedn/network/clients/fedn_client.py b/fedn/network/clients/fedn_client.py index ec77f96ec..81fefbd1d 100644 --- a/fedn/network/clients/fedn_client.py +++ b/fedn/network/clients/fedn_client.py @@ -88,14 +88,12 @@ def connect_to_api(self, url: str, token: str, json: dict) -> Tuple[ConnectToApi ) if response.status_code == 200: - logger.info( - "Connect to FEDn Api - Client assigned to controller") + logger.info("Connect to FEDn Api - Client assigned to controller") json_response = response.json() return ConnectToApiResult.Assigned, json_response elif response.status_code == 203: json_response = response.json() - logger.info( - "Connect to FEDn Api - Remote compute package missing.") + logger.info("Connect to FEDn Api - Remote compute package missing.") return ConnectToApiResult.ComputePackageMissing, json_response elif response.status_code == 401: logger.warning("Connect to FEDn Api - Unauthorized") @@ -157,8 +155,7 @@ def set_dispatcher(self, path) -> bool: def get_or_set_environment(self) -> bool: try: - logger.info( - "Initiating Dispatcher with entrypoint set to: startup") + logger.info("Initiating Dispatcher with entrypoint set to: startup") activate_cmd = self.dispatcher._get_or_create_python_env() self.dispatcher.run_cmd("startup") except KeyError: @@ -169,8 +166,7 @@ def get_or_set_environment(self) -> bool: return False if activate_cmd: - logger.info( - "To activate the virtual environment, run: {}".format(activate_cmd)) + logger.info("To activate the virtual environment, run: {}".format(activate_cmd)) return True @@ -185,8 +181,7 @@ def init_grpchandler(self, config: GrpcConnectionOptions, client_name: str, toke port = config["port"] combiner_name = config["host"] - self.grpc_handler = GrpcHandler( - host=host, port=port, name=client_name, token=token, combiner_name=combiner_name) + self.grpc_handler = GrpcHandler(host=host, port=port, name=client_name, token=token, combiner_name=combiner_name) logger.info("Successfully initialized GRPC connection") return True @@ -195,12 +190,10 @@ def init_grpchandler(self, config: GrpcConnectionOptions, client_name: str, toke return False def send_heartbeats(self, 
client_name: str, client_id: str, update_frequency: float = 2.0): - self.grpc_handler.send_heartbeats( - client_name=client_name, client_id=client_id, update_frequency=update_frequency) + self.grpc_handler.send_heartbeats(client_name=client_name, client_id=client_id, update_frequency=update_frequency) def listen_to_task_stream(self, client_name: str, client_id: str): - self.grpc_handler.listen_to_task_stream( - client_name=client_name, client_id=client_id, callback=self._task_stream_callback) + self.grpc_handler.listen_to_task_stream(client_name=client_name, client_id=client_id, callback=self._task_stream_callback) def _task_stream_callback(self, request): if request.type == fedn.StatusType.MODEL_UPDATE: @@ -215,12 +208,10 @@ def update_local_model(self, request): model_update_id = str(uuid.uuid4()) tic = time.time() - in_model = self.get_model_from_combiner( - id=model_id, client_id=self.client_id) + in_model = self.get_model_from_combiner(id=model_id, client_id=self.client_id) if in_model is None: - logger.error( - "Could not retrieve model from combiner. Aborting training request.") + logger.error("Could not retrieve model from combiner. Aborting training request.") return fetch_model_time = time.time() - tic @@ -252,8 +243,7 @@ def update_local_model(self, request): meta["fetch_model"] = fetch_model_time meta["config"] = request.data - update = self.create_update_message( - model_id=model_id, model_update_id=model_update_id, meta=meta, request=request) + update = self.create_update_message(model_id=model_id, model_update_id=model_update_id, meta=meta, request=request) self.send_model_update(update) @@ -277,12 +267,10 @@ def validate_global_model(self, request): type=fedn.StatusType.MODEL_VALIDATION, ) - in_model = self.get_model_from_combiner( - id=model_id, client_id=self.client_id) + in_model = self.get_model_from_combiner(id=model_id, client_id=self.client_id) if in_model is None: - logger.error( - "Could not retrieve model from combiner. Aborting validation request.") + logger.error("Could not retrieve model from combiner. Aborting validation request.") return if not self.validate_callback: @@ -294,8 +282,7 @@ def validate_global_model(self, request): if metrics is not None: # Send validation - validation = self.create_validation_message( - metrics=metrics, request=request) + validation = self.create_validation_message(metrics=metrics, request=request) result: bool = self.send_model_validation(validation) @@ -310,8 +297,7 @@ def validate_global_model(self, request): ) else: self.send_status( - "Client {} failed to complete model validation.".format( - self.name), + "Client {} failed to complete model validation.".format(self.name), log_level=fedn.LogLevel.WARNING, request=request, sesssion_id=request.session_id, @@ -320,12 +306,10 @@ def validate_global_model(self, request): def predict_global_model(self, request): model_id = request.model_id - model = self.get_model_from_combiner( - id=model_id, client_id=self.client_id) + model = self.get_model_from_combiner(id=model_id, client_id=self.client_id) if model is None: - logger.error( - "Could not retrieve model from combiner. Aborting prediction request.") + logger.error("Could not retrieve model from combiner. 
Aborting prediction request.") return if not self.predict_callback: @@ -335,8 +319,7 @@ def predict_global_model(self, request): logger.info(f"Running predict callback with model ID: {model_id}") prediction = self.predict_callback(model) - prediction_message = self.create_prediction_message( - prediction=prediction, request=request) + prediction_message = self.create_prediction_message(prediction=prediction, request=request) self.send_model_prediction(prediction_message) @@ -381,11 +364,9 @@ def set_client_id(self, client_id: str): self.client_id = client_id def run(self): - threading.Thread(target=self.send_heartbeats, kwargs={ - "client_name": self.name, "client_id": self.client_id}, daemon=True).start() + threading.Thread(target=self.send_heartbeats, kwargs={"client_name": self.name, "client_id": self.client_id}, daemon=True).start() try: - self.listen_to_task_stream( - client_name=self.name, client_id=self.client_id) + self.listen_to_task_stream(client_name=self.name, client_id=self.client_id) except KeyboardInterrupt: logger.info("Client stopped by user.") diff --git a/fedn/network/clients/grpc_handler.py b/fedn/network/clients/grpc_handler.py index 9858c530d..ba9e025a2 100644 --- a/fedn/network/clients/grpc_handler.py +++ b/fedn/network/clients/grpc_handler.py @@ -91,12 +91,10 @@ def _init_secure_channel(self, host: str, port: int, token: str): logger.info(f"Connecting (GRPC) to {url}") if os.getenv("FEDN_GRPC_ROOT_CERT_PATH"): - logger.info( - "Using root certificate from environment variable for GRPC channel.") + logger.info("Using root certificate from environment variable for GRPC channel.") with open(os.environ["FEDN_GRPC_ROOT_CERT_PATH"], "rb") as f: credentials = grpc.ssl_channel_credentials(f.read()) - self.channel = grpc.secure_channel( - "{}:{}".format(host, str(port)), credentials) + self.channel = grpc.secure_channel("{}:{}".format(host, str(port)), credentials) return logger.info(f"Fetching SSL certificate for {host}") @@ -123,13 +121,11 @@ def heartbeat(self, client_name: str, client_id: str): :return: Response from the combiner. 
:rtype: fedn.Response """ - heartbeat = fedn.Heartbeat(sender=fedn.Client( - name=client_name, role=fedn.CLIENT, client_id=client_id)) + heartbeat = fedn.Heartbeat(sender=fedn.Client(name=client_name, role=fedn.CLIENT, client_id=client_id)) try: - #logger.info("Sending heartbeat to combiner") - response = self.connectorStub.SendHeartbeat( - heartbeat, metadata=self.metadata) + # logger.info("Sending heartbeat to combiner") + response = self.connectorStub.SendHeartbeat(heartbeat, metadata=self.metadata) except grpc.RpcError as e: logger.error(f"GRPC (SendHeartbeat): An error occurred: {e}") raise e @@ -149,7 +145,7 @@ def send_heartbeats(self, client_name: str, client_id: str, update_frequency: fl return self._handle_unknown_error(e, "SendHeartbeat", lambda: self.send_heartbeats(client_name, client_id, update_frequency)) if isinstance(response, fedn.Response): pass - #logger.info("Heartbeat successful.") + # logger.info("Heartbeat successful.") else: logger.error("Heartbeat failed.") send_hearbeat = False @@ -179,8 +175,7 @@ def listen_to_task_stream(self, client_name: str, client_id: str, callback: Call sender_name=client_name, ) - logger.info( - f"Received task request of type {request.type} for model_id {request.model_id}") + logger.info(f"Received task request of type {request.type} for model_id {request.model_id}") callback(request) @@ -189,8 +184,7 @@ def listen_to_task_stream(self, client_name: str, client_id: str, callback: Call return self._handle_grpc_error(e, "TaskStream", lambda: self.listen_to_task_stream(client_name, client_id, callback)) except Exception as e: logger.error(f"GRPC (TaskStream): An error occurred: {e}") - self._handle_unknown_error(e, "TaskStream", lambda: self.listen_to_task_stream( - client_name, client_id, callback)) + self._handle_unknown_error(e, "TaskStream", lambda: self.listen_to_task_stream(client_name, client_id, callback)) def send_status(self, msg: str, log_level=fedn.LogLevel.INFO, type=None, request=None, sesssion_id: str = None, sender_name: str = None): """Send status message. @@ -225,8 +219,7 @@ def send_status(self, msg: str, log_level=fedn.LogLevel.INFO, type=None, request return self._handle_grpc_error(e, "SendStatus", lambda: self.send_status(msg, log_level, type, request, sesssion_id, sender_name)) except Exception as e: logger.error(f"GRPC (SendStatus): An error occurred: {e}") - self._handle_unknown_error(e, "SendStatus", lambda: self.send_status( - msg, log_level, type, request, sesssion_id, sender_name)) + self._handle_unknown_error(e, "SendStatus", lambda: self.send_status(msg, log_level, type, request, sesssion_id, sender_name)) def get_model_from_combiner(self, id: str, client_id: str, timeout: int = 20) -> BytesIO: """Fetch a model from the assigned combiner. 
@@ -263,8 +256,7 @@ def get_model_from_combiner(self, id: str, client_id: str, timeout: int = 20) -> return self._handle_grpc_error(e, "Download", lambda: self.get_model_from_combiner(id, client_id, timeout)) except Exception as e: logger.error(f"GRPC (Download): An error occurred: {e}") - self._handle_unknown_error( - e, "Download", lambda: self.get_model_from_combiner(id, client_id, timeout)) + self._handle_unknown_error(e, "Download", lambda: self.get_model_from_combiner(id, client_id, timeout)) return data def send_model_to_combiner(self, model: BytesIO, id: str): @@ -290,14 +282,12 @@ def send_model_to_combiner(self, model: BytesIO, id: str): try: logger.info("Uploading model to combiner.") - result = self.modelStub.Upload( - upload_request_generator(bt, id), metadata=self.metadata) + result = self.modelStub.Upload(upload_request_generator(bt, id), metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error(e, "Upload", lambda: self.send_model_to_combiner(model, id)) except Exception as e: logger.error(f"GRPC (Upload): An error occurred: {e}") - self._handle_unknown_error( - e, "Upload", lambda: self.send_model_to_combiner(model, id)) + self._handle_unknown_error(e, "Upload", lambda: self.send_model_to_combiner(model, id)) return result def create_update_message( @@ -371,21 +361,18 @@ def create_prediction_message( def send_model_update(self, update: fedn.ModelUpdate): try: logger.info("Sending model update to combiner.") - _ = self.combinerStub.SendModelUpdate( - update, metadata=self.metadata) + _ = self.combinerStub.SendModelUpdate(update, metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error(e, "SendModelUpdate", lambda: self.send_model_update(update)) except Exception as e: logger.error(f"GRPC (SendModelUpdate): An error occurred: {e}") - self._handle_unknown_error( - e, "SendModelUpdate", lambda: self.send_model_update(update)) + self._handle_unknown_error(e, "SendModelUpdate", lambda: self.send_model_update(update)) return True def send_model_validation(self, validation: fedn.ModelValidation) -> bool: try: logger.info("Sending model validation to combiner.") - _ = self.combinerStub.SendModelValidation( - validation, metadata=self.metadata) + _ = self.combinerStub.SendModelValidation(validation, metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error( e, @@ -394,15 +381,13 @@ def send_model_validation(self, validation: fedn.ModelValidation) -> bool: ) except Exception as e: logger.error(f"GRPC (SendModelValidation): An error occurred: {e}") - self._handle_unknown_error( - e, "SendModelValidation", lambda: self.send_model_validation(validation)) + self._handle_unknown_error(e, "SendModelValidation", lambda: self.send_model_validation(validation)) return True def send_model_prediction(self, prediction: fedn.ModelPrediction) -> bool: try: logger.info("Sending model prediction to combiner.") - _ = self.combinerStub.SendModelPrediction( - prediction, metadata=self.metadata) + _ = self.combinerStub.SendModelPrediction(prediction, metadata=self.metadata) except grpc.RpcError as e: return self._handle_grpc_error( e, @@ -411,20 +396,17 @@ def send_model_prediction(self, prediction: fedn.ModelPrediction) -> bool: ) except Exception as e: logger.error(f"GRPC (SendModelPrediction): An error occurred: {e}") - self._handle_unknown_error( - e, "SendModelPrediction", lambda: self.send_model_prediction(prediction)) + self._handle_unknown_error(e, "SendModelPrediction", lambda: self.send_model_prediction(prediction)) 
return True def _handle_grpc_error(self, e, method_name: str, sender_function: Callable): status_code = e.code() if status_code == grpc.StatusCode.UNAVAILABLE: - logger.warning( - f"GRPC ({method_name}): server unavailable. Retrying in 5 seconds.") + logger.warning(f"GRPC ({method_name}): server unavailable. Retrying in 5 seconds.") time.sleep(5) return sender_function() elif status_code == grpc.StatusCode.CANCELLED: - logger.warning( - f"GRPC ({method_name}): connection cancelled. Retrying in 5 seconds.") + logger.warning(f"GRPC ({method_name}): connection cancelled. Retrying in 5 seconds.") time.sleep(5) return sender_function() elif status_code == grpc.StatusCode.UNAUTHENTICATED: @@ -433,12 +415,10 @@ def _handle_grpc_error(self, e, method_name: str, sender_function: Callable): logger.warning(f"GRPC ({method_name}): Token expired.") raise e elif status_code == grpc.StatusCode.UNKNOWN: - logger.warning( - f"GRPC ({method_name}): An unknown error occurred: {e}.") + logger.warning(f"GRPC ({method_name}): An unknown error occurred: {e}.") details = e.details() if details == "Stream removed": - logger.warning( - f"GRPC ({method_name}): Stream removed. Reconnecting") + logger.warning(f"GRPC ({method_name}): Stream removed. Reconnecting") self._disconnect() self._init_channel(self.host, self.port, self.token) self._init_stubs() @@ -450,8 +430,7 @@ def _handle_grpc_error(self, e, method_name: str, sender_function: Callable): def _handle_unknown_error(self, e, method_name: str, sender_function: Callable): # Try to reconnect - logger.warning( - f"GRPC ({method_name}): An unknown error occurred: {e}.") + logger.warning(f"GRPC ({method_name}): An unknown error occurred: {e}.") if isinstance(e, ValueError): # ValueError is raised when the channel is closed self._disconnect() diff --git a/fedn/network/combiner/aggregators/fedavg.py b/fedn/network/combiner/aggregators/fedavg.py index 7734a7051..82a64b5fa 100644 --- a/fedn/network/combiner/aggregators/fedavg.py +++ b/fedn/network/combiner/aggregators/fedavg.py @@ -42,26 +42,21 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): nr_aggregated_models = 0 total_examples = 0 - logger.info( - "AGGREGATOR({}): Aggregating model updates... ".format(self.name)) + logger.info("AGGREGATOR({}): Aggregating model updates... ".format(self.name)) while not self.update_handler.model_updates.empty(): try: - logger.info( - "AGGREGATOR({}): Getting next model update from queue.".format(self.name)) + logger.info("AGGREGATOR({}): Getting next model update from queue.".format(self.name)) model_update = self.update_handler.next_model_update() # Load model parameters and metadata - logger.info("AGGREGATOR({}): Loading model metadata {}.".format( - self.name, model_update.model_update_id)) + logger.info("AGGREGATOR({}): Loading model metadata {}.".format(self.name, model_update.model_update_id)) tic = time.time() - model_next, metadata = self.update_handler.load_model_update( - model_update, helper) - data["time_model_load"] += time.time()-tic + model_next, metadata = self.update_handler.load_model_update(model_update, helper) + data["time_model_load"] += time.time() - tic - logger.info("AGGREGATOR({}): Processing model update {}, metadata: {} ".format( - self.name, model_update.model_update_id, metadata)) + logger.info("AGGREGATOR({}): Processing model update {}, metadata: {} ".format(self.name, model_update.model_update_id, metadata)) # Increment total number of examples total_examples += metadata["num_examples"] tic = time.time() if nr_aggregated_models == 0: model = model_next else: - model = helper.increment_average( - model, model_next, metadata["num_examples"], total_examples) - data["time_model_aggregation"] += time.time()-tic + model = helper.increment_average(model, model_next, metadata["num_examples"], total_examples) + data["time_model_aggregation"] += time.time() - tic nr_aggregated_models += 1 # Delete model from storage @@ -80,12 +74,10 @@ def combine_models(self, helper=None, delete_models=True, parameters=None): self.update_handler.delete_model(model_update) except Exception as e: tb = traceback.format_exc() - logger.error( - f"AGGREGATOR({self.name}): Error encountered while processing model update: {e}") + logger.error(f"AGGREGATOR({self.name}): Error encountered while processing model update: {e}") logger.error(tb) data["nr_aggregated_models"] = nr_aggregated_models - logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format( - self.name, nr_aggregated_models)) + logger.info("AGGREGATOR({}): Aggregation completed, aggregated {} models.".format(self.name, nr_aggregated_models)) return model, data diff --git a/fedn/network/combiner/aggregators/fedopt.py b/fedn/network/combiner/aggregators/fedopt.py index be3d6a691..157c4c85f 100644 --- a/fedn/network/combiner/aggregators/fedopt.py +++ b/fedn/network/combiner/aggregators/fedopt.py @@ -38,13 +38,9 @@ def __init__(self, update_handler): self.m = None def combine_models( - self, - helper: Optional[HelperBase] = None, - delete_models: bool = True, - parameters: Optional[Parameters] = None + self, helper: Optional[HelperBase] = None, delete_models: bool = True, parameters: Optional[Parameters] = None ) -> Tuple[Optional[Any], Dict[str, float]]: - """ - Compute pseudo gradients using model updates in the queue. + """Compute pseudo gradients using model updates in the queue. :param helper: ML framework-specific helper, defaults to None. :param delete_models: Delete models from storage after aggregation, defaults to True. 
@@ -64,11 +60,9 @@ def combine_models( # Validate and merge parameters try: - parameters = self._validate_and_merge_parameters( - parameters, default_parameters) + parameters = self._validate_and_merge_parameters(parameters, default_parameters) except InvalidParameterError as e: - logger.error( - f"Aggregator {self.name} received invalid parameters: {e}") + logger.error(f"Aggregator {self.name} received invalid parameters: {e}") return None, data logger.info(f"Aggregator {self.name} starting model aggregation.") @@ -79,31 +73,25 @@ def combine_models( while not self.update_handler.model_updates.empty(): try: - logger.info( - f"Aggregator {self.name}: Fetching next model update.") + logger.info(f"Aggregator {self.name}: Fetching next model update.") model_update = self.update_handler.next_model_update() tic = time.time() - model_next, metadata = self.update_handler.load_model_update( - model_update, helper) + model_next, metadata = self.update_handler.load_model_update(model_update, helper) data["time_model_load"] += time.time() - tic - logger.info( - f"Processing model update {model_update.model_update_id}") + logger.info(f"Processing model update {model_update.model_update_id}") # Increment total examples total_examples += metadata["num_examples"] tic = time.time() if nr_aggregated_models == 0: - model_old = self.update_handler.load_model( - helper, model_update.model_id) + model_old = self.update_handler.load_model(helper, model_update.model_id) pseudo_gradient = helper.subtract(model_next, model_old) else: - pseudo_gradient_next = helper.subtract( - model_next, model_old) - pseudo_gradient = helper.increment_average( - pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) + pseudo_gradient_next = helper.subtract(model_next, model_old) + pseudo_gradient = helper.increment_average(pseudo_gradient, pseudo_gradient_next, metadata["num_examples"], total_examples) data["time_model_aggregation"] += time.time() - tic @@ -111,11 +99,9 @@ def combine_models( if delete_models: self.update_handler.delete_model(model_update) - logger.info( - f"Deleted model update {model_update.model_update_id} from storage.") + logger.info(f"Deleted model update {model_update.model_update_id} from storage.") except Exception as e: - logger.error( - f"Error processing model update: {e}. Skipping this update.") + logger.error(f"Error processing model update: {e}. Skipping this update.") logger.error(traceback.format_exc()) continue @@ -123,8 +109,7 @@ def combine_models( if pseudo_gradient: try: - model = self._apply_server_optimizer( - helper, pseudo_gradient, model_old, parameters) + model = self._apply_server_optimizer(helper, pseudo_gradient, model_old, parameters) except Exception as e: logger.error(f"Error during model aggregation: {e}") logger.error(traceback.format_exc()) @@ -132,13 +117,10 @@ def combine_models( else: return None, data - logger.info( - f"Aggregator {self.name} completed. Aggregated {nr_aggregated_models} models.") + logger.info(f"Aggregator {self.name} completed. 
Aggregated {nr_aggregated_models} models.") return model, data - def _validate_and_merge_parameters( - self, parameters: Optional[Parameters], default_parameters: Dict[str, Any] - ) -> Dict[str, Any]: + def _validate_and_merge_parameters(self, parameters: Optional[Parameters], default_parameters: Dict[str, Any]) -> Dict[str, Any]: """Validate and merge default parameters.""" parameter_schema = { "serveropt": str, @@ -154,9 +136,7 @@ def _validate_and_merge_parameters( parameters = {} return {**default_parameters, **parameters} - def _apply_server_optimizer( - self, helper: HelperBase, pseudo_gradient: Any, model_old: Any, parameters: Dict[str, Any] - ) -> Any: + def _apply_server_optimizer(self, helper: HelperBase, pseudo_gradient: Any, model_old: Any, parameters: Dict[str, Any]) -> Any: """Apply the selected server optimizer to compute the new model.""" optimizer_map = { "adam": self.serveropt_adam, @@ -165,8 +145,7 @@ def _apply_server_optimizer( } optimizer = optimizer_map.get(parameters["serveropt"]) if not optimizer: - raise ValueError( - f"Unsupported server optimizer: {parameters['serveropt']}") + raise ValueError(f"Unsupported server optimizer: {parameters['serveropt']}") return optimizer(helper, pseudo_gradient, model_old, parameters) def serveropt_adam(self, helper, pseudo_gradient, model_old, parameters): @@ -192,8 +171,7 @@ def serveropt_adam(self, helper, pseudo_gradient, model_old, parameters): self.v = helper.ones(pseudo_gradient, math.pow(tau, 2)) if not self.m: - self.m = helper.multiply( - pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) + self.m = helper.multiply(pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) else: self.m = helper.add(self.m, pseudo_gradient, beta1, (1.0 - beta1)) @@ -229,8 +207,7 @@ def serveropt_yogi(self, helper, pseudo_gradient, model_old, parameters): self.v = helper.ones(pseudo_gradient, math.pow(tau, 2)) if not self.m: - self.m = helper.multiply( - pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) + self.m = helper.multiply(pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) else: self.m = helper.add(self.m, pseudo_gradient, beta1, (1.0 - beta1)) @@ -267,8 +244,7 @@ def serveropt_adagrad(self, helper, pseudo_gradient, model_old, parameters): self.v = helper.ones(pseudo_gradient, math.pow(tau, 2)) if not self.m: - self.m = helper.multiply( - pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) + self.m = helper.multiply(pseudo_gradient, [(1.0 - beta1)] * len(pseudo_gradient)) else: self.m = helper.add(self.m, pseudo_gradient, beta1, (1.0 - beta1)) diff --git a/fedn/network/combiner/modelservice.py b/fedn/network/combiner/modelservice.py index b2a11861d..97820f9e2 100644 --- a/fedn/network/combiner/modelservice.py +++ b/fedn/network/combiner/modelservice.py @@ -23,11 +23,9 @@ def upload_request_generator(mdl, id): while True: b = mdl.read(CHUNK_SIZE) if b: - result = fedn.ModelRequest( - data=b, id=id, status=fedn.ModelStatus.IN_PROGRESS) + result = fedn.ModelRequest(data=b, id=id, status=fedn.ModelStatus.IN_PROGRESS) else: - result = fedn.ModelRequest( - id=id, data=None, status=fedn.ModelStatus.OK) + result = fedn.ModelRequest(id=id, data=None, status=fedn.ModelStatus.OK) yield result if not b: break @@ -89,8 +87,7 @@ def unpack_model(request_iterator, helper): if request.data: model_buffer.write(request.data) except MemoryError as e: - logger.error( - f"Memory error occured when loading model, reach out to the FEDn team if you need a solution to this. 
{e}") + logger.error(f"Memory error occured when loading model, reach out to the FEDn team if you need a solution to this. {e}") raise except Exception as e: logger.error(f"Exception occured during model loading: {e}") @@ -212,15 +209,12 @@ def Upload(self, request_iterator, context): for request in request_iterator: if request.status == fedn.ModelStatus.IN_PROGRESS: self.temp_model_storage.get_ptr(request.id).write(request.data) - self.temp_model_storage.set_model_metadata( - request.id, fedn.ModelStatus.IN_PROGRESS) + self.temp_model_storage.set_model_metadata(request.id, fedn.ModelStatus.IN_PROGRESS) if request.status == fedn.ModelStatus.OK and not request.data: - result = fedn.ModelResponse( - id=request.id, status=fedn.ModelStatus.OK, message="Got model successfully.") + result = fedn.ModelResponse(id=request.id, status=fedn.ModelStatus.OK, message="Got model successfully.") # self.temp_model_storage_metadata.update({request.id: fedn.ModelStatus.OK}) - self.temp_model_storage.set_model_metadata( - request.id, fedn.ModelStatus.OK) + self.temp_model_storage.set_model_metadata(request.id, fedn.ModelStatus.OK) self.temp_model_storage.get_ptr(request.id).flush() self.temp_model_storage.get_ptr(request.id).close() return result @@ -235,13 +229,11 @@ def Download(self, request, context): :return: A model response iterator. :rtype: :class:`fedn.network.grpc.fedn_pb2.ModelResponse` """ - logger.info( - f"grpc.ModelService.Download: {request.sender.role}:{request.sender.client_id} requested model {request.id}") + logger.info(f"grpc.ModelService.Download: {request.sender.role}:{request.sender.client_id} requested model {request.id}") try: status = self.temp_model_storage.get_model_metadata(request.id) if status != fedn.ModelStatus.OK: - logger.error( - f"model file is not ready: {request.id}, status: {status}") + logger.error(f"model file is not ready: {request.id}, status: {status}") yield fedn.ModelResponse(id=request.id, data=None, status=status) except Exception: logger.error("Error file does not exist: {}".format(request.id)) diff --git a/fedn/network/combiner/roundhandler.py b/fedn/network/combiner/roundhandler.py index dae885a1c..fa3d83e8f 100644 --- a/fedn/network/combiner/roundhandler.py +++ b/fedn/network/combiner/roundhandler.py @@ -30,7 +30,7 @@ class RoundConfig(TypedDict): :type round_id: str :param round_timeout: The round timeout in seconds. Set by user interfaces or Controller. :type round_timeout: str - :param rounds: The number of rounds. Set by user interfaces.w + :param rounds: The number of rounds. Set by user interfaces. :param model_id: The model identifier. Set by user interfaces or Controller (get_latest_model). :type model_id: str :param model_version: The model version. Currently not used. 
@@ -131,8 +131,7 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
         :return: an aggregated model and associated metadata
         :rtype: model, dict
         """
-        logger.info(
-            "ROUNDHANDLER: Initiating training round, participating clients: {}".format(clients))
+        logger.info("ROUNDHANDLER: Initiating training round, participating clients: {}".format(clients))
         meta = {}
         meta["nr_expected_updates"] = len(clients)
 
@@ -143,14 +142,11 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
         model_id = config["model_id"]
 
         if provided_functions.get("client_settings", False):
-            global_model_bytes = self.modelservice.temp_model_storage.get(
-                model_id)
-            client_settings = self.hook_interface.client_settings(
-                global_model_bytes)
+            global_model_bytes = self.modelservice.temp_model_storage.get(model_id)
+            client_settings = self.hook_interface.client_settings(global_model_bytes)
             config["client_settings"] = client_settings
 
         # Request model updates from all active clients.
-        self.server.request_model_update(
-            session_id=session_id, model_id=model_id, config=config, clients=clients)
+        self.server.request_model_update(session_id=session_id, model_id=model_id, config=config, clients=clients)
 
         # If buffer_size is -1 (default), the round terminates when/if all clients have completed.
         if int(config["buffer_size"]) == -1:
@@ -165,8 +161,7 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
         data = None
         try:
             helper = get_helper(config["helper_type"])
-            logger.info("Config delete_models_storage: {}".format(
-                config["delete_models_storage"]))
+            logger.info("Config delete_models_storage: {}".format(config["delete_models_storage"]))
             if config["delete_models_storage"] == "True":
                 delete_models = True
             else:
@@ -178,13 +173,10 @@ def _training_round(self, config: dict, clients: list, provided_functions: dict)
             else:
                 parameters = None
             if provided_functions.get("aggregate", False):
-                previous_model_bytes = self.modelservice.temp_model_storage.get(
-                    model_id)
-                model, data = self.hook_interface.aggregate(
-                    previous_model_bytes, self.update_handler, helper, delete_models=delete_models)
+                previous_model_bytes = self.modelservice.temp_model_storage.get(model_id)
+                model, data = self.hook_interface.aggregate(previous_model_bytes, self.update_handler, helper, delete_models=delete_models)
             else:
-                model, data = self.aggregator.combine_models(
-                    helper=helper, delete_models=delete_models, parameters=parameters)
+                model, data = self.aggregator.combine_models(helper=helper, delete_models=delete_models, parameters=parameters)
         except Exception as e:
             logger.warning("AGGREGATION FAILED AT COMBINER! {}".format(e))
             raise
@@ -203,8 +195,7 @@ def _validation_round(self, session_id, model_id, clients):
         :param model_id: The ID of the model to validate
         :type model_id: str
         """
-        self.server.request_model_validation(
-            session_id, model_id, clients=clients)
+        self.server.request_model_validation(session_id, model_id, clients=clients)
 
     def _prediction_round(self, prediction_id: str, model_id: str, clients: list):
         """Send model prediction requests to clients.
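# ---------------------------------------------------------------------------
# [Editor's note: illustrative sketch, not part of the patch.]
# _training_round above blocks until enough client updates have arrived:
# with buffer_size == -1 (the default) it waits for all participating
# clients (bounded by the round timeout), otherwise aggregation may start
# once buffer_size updates are buffered. A minimal sketch of that policy;
# `count_ready` is a hypothetical callable standing in for the update
# handler's internal bookkeeping.
import time

def wait_for_updates(count_ready, n_clients, buffer_size, round_timeout, poll=1.0):
    """Return True once enough updates are in, False on timeout."""
    target = n_clients if int(buffer_size) == -1 else int(buffer_size)
    deadline = time.monotonic() + float(round_timeout)
    while time.monotonic() < deadline:
        if count_ready() >= target:
            return True
        time.sleep(poll)     # poll instead of busy-waiting
    return False

# Example: wait_for_updates(lambda: len(received), n_clients=5,
#                           buffer_size=-1, round_timeout=180)
# ---------------------------------------------------------------------------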
@@ -216,8 +207,7 @@ def _prediction_round(self, prediction_id: str, model_id: str, clients: list):
         :param model_id: The ID of the model to use for prediction
         :type model_id: str
         """
-        self.server.request_model_prediction(
-            prediction_id, model_id, clients=clients)
+        self.server.request_model_prediction(prediction_id, model_id, clients=clients)
 
     def stage_model(self, model_id, timeout_retry=3, retry=2):
         """Download a model from persistent storage and set in modelservice.
@@ -231,8 +221,7 @@ def stage_model(self, model_id, timeout_retry=3, retry=2):
         """
         # If the model is already in memory at the server we do not need to do anything.
         if self.modelservice.temp_model_storage.exist(model_id):
-            logger.info(
-                "Model already exists in memory, skipping model staging.")
+            logger.info("Model already exists in memory, skipping model staging.")
             return
         logger.info("Model Staging, fetching model from storage...")
         # If not, download it and stage it in memory at the combiner.
@@ -243,13 +232,11 @@ def stage_model(self, model_id, timeout_retry=3, retry=2):
                 if model:
                     break
             except Exception:
-                logger.warning(
-                    "Could not fetch model from storage backend, retrying.")
+                logger.warning("Could not fetch model from storage backend, retrying.")
                 time.sleep(timeout_retry)
                 tries += 1
                 if tries > retry:
-                    logger.error(
-                        "Failed to stage model {} from storage backend!".format(model_id))
+                    logger.error("Failed to stage model {} from storage backend!".format(model_id))
                     raise
 
         self.modelservice.set_model(model, model_id)
@@ -269,8 +256,7 @@ def _assign_round_clients(self, n, type="trainers"):
         elif type == "trainers":
             clients = self.server.get_active_trainers()
         else:
-            logger.error(
-                "(ERROR): {} is not a supported type of client".format(type))
+            logger.error("(ERROR): {} is not a supported type of client".format(type))
 
         # If the number of requested trainers exceeds the number of available, use all available.
         n = min(n, len(clients))
@@ -292,8 +278,7 @@ def _check_nr_round_clients(self, config):
         """
         active = self.server.nr_active_trainers()
         if active >= int(config["clients_required"]):
-            logger.info("Number of clients required ({0}) to start round met {1}.".format(
-                config["clients_required"], active))
+            logger.info("Number of clients required ({0}) to start round met {1}.".format(config["clients_required"], active))
             return True
         else:
             logger.info("Too few clients to start round.")
@@ -305,11 +290,9 @@ def execute_validation_round(self, session_id, model_id):
         :param round_config: The round config object.
         :type round_config: dict
         """
-        logger.info(
-            "COMBINER orchestrating validation of model {}".format(model_id))
+        logger.info("COMBINER orchestrating validation of model {}".format(model_id))
         self.stage_model(model_id)
-        validators = self._assign_round_clients(
-            self.server.max_clients, type="validators")
+        validators = self._assign_round_clients(self.server.max_clients, type="validators")
         self._validation_round(session_id, model_id, validators)
 
     def execute_prediction_round(self, prediction_id: str, model_id: str) -> None:
@@ -318,12 +301,10 @@ def execute_prediction_round(self, prediction_id: str, model_id: str) -> None:
         """Run a prediction round.
 
         :param round_config: The round config object.
         :type round_config: dict
         """
-        logger.info(
-            "COMBINER orchestrating prediction using model {}".format(model_id))
+        logger.info("COMBINER orchestrating prediction using model {}".format(model_id))
         self.stage_model(model_id)
         # TODO: Implement prediction client type
-        clients = self._assign_round_clients(
-            self.server.max_clients, type="validators")
+        clients = self._assign_round_clients(self.server.max_clients, type="validators")
         self._prediction_round(prediction_id, model_id, clients)
 
     def execute_training_round(self, config):
@@ -334,8 +315,7 @@ def execute_training_round(self, config):
         :return: metadata about the training round.
         :rtype: dict
         """
-        logger.info("Processing training round, job_id {}".format(
-            config["_job_id"]))
+        logger.info("Processing training round, job_id {}".format(config["_job_id"]))
 
         data = {}
         data["config"] = config
@@ -344,20 +324,17 @@ def execute_training_round(self, config):
         # Download model to update and set in temp storage.
         self.stage_model(config["model_id"])
 
-        provided_functions = self.hook_interface.provided_functions(
-            self.server_functions)
+        provided_functions = self.hook_interface.provided_functions(self.server_functions)
 
         if provided_functions.get("client_selection", False):
-            clients = self.hook_interface.client_selection(
-                clients=self.server.get_active_trainers())
+            clients = self.hook_interface.client_selection(clients=self.server.get_active_trainers())
         else:
             clients = self._assign_round_clients(self.server.max_clients)
         model, meta = self._training_round(config, clients, provided_functions)
         data["data"] = meta
 
         if model is None:
-            logger.warning(
-                "\t Failed to update global model in round {0}!".format(config["round_id"]))
+            logger.warning("\t Failed to update global model in round {0}!".format(config["round_id"]))
 
         if model is not None:
             helper = get_helper(config["helper_type"])
@@ -366,8 +343,7 @@ def execute_training_round(self, config):
             a.close()
             data["model_id"] = model_id
 
-            logger.info("TRAINING ROUND COMPLETED. Aggregated model id: {}, Job id: {}".format(
-                model_id, config["_job_id"]))
+            logger.info("TRAINING ROUND COMPLETED. Aggregated model id: {}, Job id: {}".format(model_id, config["_job_id"]))
 
             # Delete temp model
             self.modelservice.temp_model_storage.delete(config["model_id"])
@@ -393,14 +369,11 @@ def run(self, polling_interval=1.0):
                         session_id = round_config["session_id"]
                         model_id = round_config["model_id"]
                         tic = time.time()
-                        round_meta = self.execute_training_round(
-                            round_config)
-                        round_meta["time_exec_training"] = time.time() - \
-                            tic
+                        round_meta = self.execute_training_round(round_config)
+                        round_meta["time_exec_training"] = time.time() - tic
                         round_meta["status"] = "Success"
                         round_meta["name"] = self.server.id
-                        self.server.statestore.set_round_combiner_data(
-                            round_meta)
+                        self.server.statestore.set_round_combiner_data(round_meta)
                     elif round_config["task"] == "validation":
                         session_id = round_config["session_id"]
                         model_id = round_config["model_id"]
@@ -408,8 +381,7 @@ def run(self, polling_interval=1.0):
                     elif round_config["task"] == "prediction":
                         prediction_id = round_config["prediction_id"]
                         model_id = round_config["model_id"]
-                        self.execute_prediction_round(
-                            prediction_id, model_id)
+                        self.execute_prediction_round(prediction_id, model_id)
                     else:
                         logger.warning("config contains unkown task type.")
             else:
diff --git a/fedn/network/grpc/server.py b/fedn/network/grpc/server.py
index 7f6109324..76ef58b65 100644
--- a/fedn/network/grpc/server.py
+++ b/fedn/network/grpc/server.py
@@ -5,8 +5,7 @@
 from grpc_health.v1 import health, health_pb2_grpc
 
 import fedn.network.grpc.fedn_pb2_grpc as rpc
-from fedn.common.log_config import (logger, set_log_level_from_string,
-                                    set_log_stream)
+from fedn.common.log_config import logger, set_log_level_from_string, set_log_stream
 from fedn.network.combiner.shared import modelservice
 from fedn.network.grpc.auth import JWTInterceptor
 
@@ -57,8 +56,7 @@ def __init__(self, servicer, config: ServerConfig):
 
         if isinstance(servicer, rpc.CombinerServicer):
             rpc.add_ControlServicer_to_server(servicer, self.server)
-        health_pb2_grpc.add_HealthServicer_to_server(
-            self.health_servicer, self.server)
+        health_pb2_grpc.add_HealthServicer_to_server(self.health_servicer, self.server)
 
         if config["secure"]:
             logger.info("Creating secure gRPCS server using certificate")
@@ -70,8 +68,7 @@ def __init__(self, servicer, config: ServerConfig):
                     ),
                 )
             )
-            self.server.add_secure_port(
-                "[::]:" + str(config["port"]), server_credentials)
+            self.server.add_secure_port("[::]:" + str(config["port"]), server_credentials)
         else:
             logger.info("Creating gRPC server")
             self.server.add_insecure_port("[::]:" + str(config["port"]))
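# ---------------------------------------------------------------------------
# [Editor's note: illustrative sketch, not part of the patch.]
# The server.py hunk above only reflows the secure/insecure port binding.
# For reference, the underlying grpc API usage follows the pattern below;
# the config dict and the key_pem/cert_pem arguments are assumptions for
# illustration, not values taken from FEDn.
from concurrent import futures

import grpc

def build_server(config: dict, key_pem: bytes = b"", cert_pem: bytes = b""):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=8))
    if config.get("secure"):
        # TLS: bind with credentials built from a private key / cert chain pair.
        credentials = grpc.ssl_server_credentials(((key_pem, cert_pem),))
        server.add_secure_port("[::]:" + str(config["port"]), credentials)
    else:
        # Plaintext: suitable only for trusted networks or local development.
        server.add_insecure_port("[::]:" + str(config["port"]))
    return server

# Example: build_server({"secure": False, "port": 12080}).start()
# ---------------------------------------------------------------------------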