Merge with master
stefanhellander committed Mar 1, 2024
2 parents fb1f335 + eeb82ce commit fc0d5af
Showing 481 changed files with 9,418 additions and 17,351 deletions.
3 changes: 0 additions & 3 deletions .ci/tests/examples/print_logs.sh
@@ -5,9 +5,6 @@ docker logs "$(basename $PWD)_minio_1"
 echo "Mongo logs"
 docker logs "$(basename $PWD)_mongo_1"
 
-echo "Dashboard logs"
-docker logs "$(basename $PWD)_dashboard_1"
-
 echo "API-Server logs"
 docker logs "$(basename $PWD)_api-server_1"

1 change: 1 addition & 0 deletions .ci/tests/examples/run.sh
@@ -11,6 +11,7 @@ helper="$2"
 
 >&2 echo "Start FEDn"
 pushd "examples/$example"
+
 docker-compose \
   -f ../../docker-compose.yaml \
   -f docker-compose.override.yaml \
7 changes: 1 addition & 6 deletions .devcontainer/devcontainer.json.tpl
@@ -15,12 +15,6 @@
 "mounts": [
     "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind,consistency=default",
 ],
-"forwardPorts": [
-    8090,
-    9000,
-    9001,
-    8081
-],
 "runArgs": [
     "--net=host"
 ],
@@ -30,3 +24,4 @@
         }
     }
 }
+
2 changes: 1 addition & 1 deletion .github/workflows/branch-name-check.yaml
@@ -7,7 +7,7 @@ on:
       - master
 
 env:
-  BRANCH_REGEX: '^((feature|hotfix|bugfix|docs)\/.+)|(release\/v((([0-9]+)\.([0-9]+)\.([0-9]+)(?:-([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?))$'
+  BRANCH_REGEX: '^((feature|hotfix|bugfix|bug|docs|refactor)\/.+)|(release\/v((([0-9]+)\.([0-9]+)\.([0-9]+)(?:-([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?)(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?))$'
 
 jobs:
   branch-name-check:
10 changes: 5 additions & 5 deletions .github/workflows/integration-tests.yaml
@@ -15,8 +15,8 @@ jobs:
     strategy:
       matrix:
         to_test:
-          - "mnist-keras kerashelper"
-          - "mnist-pytorch pytorchhelper"
+          - "mnist-keras numpyhelper"
+          - "mnist-pytorch numpyhelper"
         python_version: ["3.8", "3.9","3.10"]
         os:
           - ubuntu-20.04
@@ -38,9 +38,9 @@
       - name: run ${{ matrix.to_test }}
         run: .ci/tests/examples/run.sh ${{ matrix.to_test }}
 
-      - name: run ${{ matrix.to_test }} inference
-        run: .ci/tests/examples/run_inference.sh ${{ matrix.to_test }}
-        if: ${{ matrix.os != 'macos-11' && matrix.to_test == 'mnist-keras keras' }} # example available for Keras
+      # - name: run ${{ matrix.to_test }} inference
+      #   run: .ci/tests/examples/run_inference.sh ${{ matrix.to_test }}
+      #   if: ${{ matrix.os != 'macos-11' && matrix.to_test == 'mnist-keras keras' }} # example available for Keras
 
       - name: print logs
         if: failure()
13 changes: 13 additions & 0 deletions Dockerfile
@@ -2,6 +2,8 @@
 ARG BASE_IMG=python:3.10-slim
 FROM $BASE_IMG
 
+ARG GRPC_HEALTH_PROBE_VERSION=""
+
 # Requirements (use MNIST Keras as default)
 ARG REQUIREMENTS=""
 
@@ -15,6 +17,17 @@ COPY $REQUIREMENTS /app/config/requirements.txt
 # Install developer tools (needed for psutil)
 RUN apt-get update && apt-get install -y python3-dev gcc
 
+# Install grpc health probe checker
+RUN if [ ! -z "$GRPC_HEALTH_PROBE_VERSION" ]; then \
+    apt-get install -y wget && \
+    wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 && \
+    chmod +x /bin/grpc_health_probe && \
+    apt-get remove -y wget && apt autoremove -y; \
+    else \
+    echo "No grpc_health_probe version specified, skipping installation"; \
+    fi
+
+
 # Create FEDn app directory
 SHELL ["/bin/bash", "-c"]
 RUN mkdir -p /app \
111 changes: 29 additions & 82 deletions README.rst
@@ -10,102 +10,42 @@
 .. image:: https://readthedocs.org/projects/fedn/badge/?version=latest&style=flat
    :target: https://fedn.readthedocs.io
 
-FEDn is a modular and model agnostic framework for hierarchical
-federated machine learning which scales from pseudo-distributed
-development to real-world production networks in distributed,
-heterogeneous environments. For more details see https://arxiv.org/abs/2103.00148.
+FEDn is a modular and model agnostic framework for
+federated machine learning. FEDn is designed to scale from pseudo-distributed
+development on your laptop to real-world production setups in geographically distributed environments.
 
 Core Features
 =============
 
-- **Scalable and resilient.** FEDn is highly scalable and resilient via a tiered
-  architecture where multiple aggregation servers (combiners) form a network to divide up the work to coordinate clients and aggregate models.
-  Recent benchmarks show high performance both for thousands of clients in a cross-device
-  setting and for large model updates (1GB) in a cross-silo setting.
-  FEDn has the ability to recover from failure in all critical components.
+- **Scalable and resilient.** FEDn is scalable and resilient via a tiered
+  architecture where multiple aggregation servers (combiners) divide up the work to coordinate clients and aggregate models.
+  Benchmarks show high performance both for thousands of clients in a cross-device
+  setting and for large model updates in a cross-silo setting.
+  FEDn has the ability to recover from failure in all critical components.
 
-- **Security**. FEDn is built using secure industry standard communication protocols (gRPC). A key feature is that
-  clients do not have to expose any ingress ports.
-
-- **Track events and training progress in real-time**. FEDn tracks events for clients and aggregation servers, logging to MongoDB. This
-  helps developers monitor traning progress in real-time, and to troubleshoot the distributed computation.
-  Tracking and model validation data can easily be retrieved using the API enabling development of custom dashboards and visualizations.
-
 - **Flexible handling of asynchronous clients**. FEDn supports flexible experimentation
   with clients coming in and dropping out during training sessions. Extend aggregators to experiment
   with different strategies to handle so called stragglers.
 
 - **ML-framework agnostic**. Model updates are treated as black-box
   computations. This means that it is possible to support any
   ML model type or framework. Support for Keras and PyTorch is
   available out-of-the-box.
 
+- **Security**. A key feature is that
+  clients do not have to expose any ingress ports.
+
+- **Track events and training progress**. FEDn logs events in the federation and tracks both training and validation progress in real time. Data is logged as JSON to MongoDB and a user can easily make custom dashboards and visualizations.
+
-- **UI.** A Flask UI lets users see client model validations in real time, as well as track client training time distributions and key performance metrics for clients and combiners.
-
 Getting started
 ===============
 
-Prerequisites
--------------
-
-- `Python 3.8, 3.9 or 3.10 <https://www.python.org/downloads>`__
-- `Docker <https://docs.docker.com/get-docker>`__
-- `Docker Compose <https://docs.docker.com/compose/install>`__
-
-Quick start
------------
-
-Clone this repository, locate into it and start a pseudo-distributed FEDn network using docker-compose:
-
-.. code-block::
-
-   docker-compose up
-
-Navigate to http://localhost:8090. You should see the FEDn UI, asking you to upload a compute package. The compute package is a tarball of a project. The project in turn implements the entrypoints used by clients to compute model updates and to validate a model.
-
-Locate into 'examples/mnist-pytorch'.
-
-Start by initializing a virtual enviroment with all of the required dependencies for this project.
-
-.. code-block::
-
-   bin/init_venv.sh
-
-Now create the compute package and a seed model:
-
-.. code-block::
-
-   bin/build.sh
-
-Uploade the generated files 'package.tar.gz' and 'seed.npz' in the FEDn UI.
-
-The next step is to configure and attach clients. For this we download data and make data partitions:
-
-Download the data:
-
-.. code-block::
-
-   bin/get_data
-
-Split the data in 2 parts for the clients:
-
-.. code-block::
-
-   bin/split_data
-
-Data partitions will be generated in the folder 'data/clients'.
-
-Now navigate to http://localhost:8090/network and download the client config file. Place it in the example working directory.
-
-To connect a client that uses the data partition 'data/clients/1/mnist.pt':
-
-.. code-block::
-
-   docker run \
-     -v $PWD/client.yaml:/app/client.yaml \
-     -v $PWD/data/clients/1:/var/data \
-     -e ENTRYPOINT_OPTS=--data_path=/var/data/mnist.pt \
-     --network=fedn_default \
-     ghcr.io/scaleoutsystems/fedn/fedn:master-mnist-pytorch run client -in client.yaml --name client1
-
-You are now ready to start training the model at http://localhost:8090/control.
-
-To scale up the experiment, refer to the README at 'examples/mnist-pytorch' (or the corresponding Keras version), where we explain how to use docker-compose to automate deployment of several clients.
+The best way to get started is to take the quickstart tutorial:
+
+- `Quickstart PyTorch <https://fedn.readthedocs.io/en/latest/quickstart.html>`__
 
 Documentation
 =============
@@ -115,6 +55,13 @@ You will find more details about the architecture, compute package and how to de
 
 - `Paper <https://arxiv.org/abs/2103.00148>`__
 
 
+FEDn Studio
+===============
+Scaleout also develops FEDn Studio, a web application that extends the FEDn SDK with a UI, production-grade deployment of the FEDn server side on Kubernetes, user authentication/authorization, client identity/API-token management, and project-based multitenancy for segmenting work and resources into collaboration workspaces. FEDn Studio is available as a fully managed service.
+There is also additional tooling and charts for self-managed deployment on Kubernetes including integration with several projects from the cloud native landscape.
+See `FEDn Framework <https://www.scaleoutsystems.com/framework>`__ for more information.
+
+
 Making contributions
 ====================

38 changes: 17 additions & 21 deletions docker-compose.yaml
@@ -34,7 +34,7 @@ services:
       - 9001:9001
 
   mongo:
-    image: mongo:5.0.2
+    image: mongo:7.0
     restart: always
     environment:
       - MONGO_INITDB_ROOT_USERNAME=fedn_admin
@@ -58,26 +58,6 @@
     ports:
       - 8081:8081
 
-  dashboard:
-    environment:
-      - GET_HOSTS_FROM=dns
-      - USER=test
-      - PROJECT=project
-      - FLASK_DEBUG=1
-      - STATESTORE_CONFIG=/app/config/settings-reducer.yaml
-    build:
-      context: .
-      args:
-        BASE_IMG: ${BASE_IMG:-python:3.10-slim}
-    working_dir: /app
-    volumes:
-      - ${HOST_REPO_DIR:-.}/fedn:/app/fedn
-    entrypoint: [ "sh", "-c" ]
-    command:
-      - "/venv/bin/pip install --no-cache-dir -e /app/fedn && /venv/bin/fedn run dashboard -n reducer --init=config/settings-reducer.yaml"
-    ports:
-      - 8090:8090
-
   api-server:
     environment:
       - GET_HOSTS_FROM=dns
@@ -111,6 +91,7 @@
       context: .
       args:
        BASE_IMG: ${BASE_IMG:-python:3.10-slim}
+        GRPC_HEALTH_PROBE_VERSION: v0.4.24
     working_dir: /app
     volumes:
       - ${HOST_REPO_DIR:-.}/fedn:/app/fedn
@@ -119,6 +100,18 @@
       - "/venv/bin/pip install --no-cache-dir -e /app/fedn && /venv/bin/fedn run combiner --init config/settings-combiner.yaml"
     ports:
       - 12080:12080
+    healthcheck:
+      test:
+        [
+          "CMD",
+          "/bin/grpc_health_probe",
+          "-addr=localhost:12080"
+        ]
+      interval: 2s
+      timeout: 10s
+      retries: 5
+    depends_on:
+      - api-server
 
   # Client
   client:
@@ -136,3 +129,6 @@
       - "/venv/bin/pip install --no-cache-dir -e /app/fedn && /venv/bin/fedn run client --init config/settings-client.yaml"
     deploy:
       replicas: 0
+    depends_on:
+      combiner:
+        condition: service_healthy
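
The combiner healthcheck above calls the grpc_health_probe binary baked into the image by the Dockerfile change earlier in this commit. The same check can be reproduced from the host in a few lines of Python — a minimal sketch, assuming the combiner exposes the standard grpc.health.v1 service, that port 12080 is published on localhost, and that the ``grpcio-health-checking`` package is installed:

.. code-block:: python

   # Probe the combiner's gRPC health endpoint (sketch; assumes the combiner
   # registers the standard grpc.health.v1 service on localhost:12080).
   import grpc
   from grpc_health.v1 import health_pb2, health_pb2_grpc

   channel = grpc.insecure_channel("localhost:12080")
   stub = health_pb2_grpc.HealthStub(channel)
   response = stub.Check(health_pb2.HealthCheckRequest(service=""))
   print(response)  # prints "status: SERVING" when the combiner is healthy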
45 changes: 45 additions & 0 deletions docs/aggregators.rst
@@ -0,0 +1,45 @@
.. _agg-label:

Aggregators
===========

Aggregators handle the combination of model updates received by the combiner into a combiner-level global model.
During a training session, the combiners will instantiate an Aggregator and use it to process the incoming model updates from clients.

.. image:: img/aggregators.png
   :alt: Aggregator overview
   :width: 100%
   :align: center

The above figure illustrates the overall flow. When a client completes a model update, the model parameters are streamed to the combiner, and a model update message is sent. The model parameters are written to a file on disk at a configurable storage location at the combiner (this avoids exhausting RAM at the combiner), and the model update message is passed to a callback function, on_model_update. The callback function validates the model update and, if successful, puts the update message on an aggregation queue. As multiple clients send updates, the aggregation queue builds up, and when a certain criterion is met, another method, combine_models, starts processing the queue, aggregating models according to the specifics of the scheme (FedAvg, FedAdam, etc.).

The user can configure several parameters that guide the general behavior of the aggregation flow:

- Round timeout: the maximal time the combiner waits before processing the update queue.
- Buffer size: the maximal allowed length of the queue before processing it.
- Whether to retain or delete model update files after they have been processed (the default is to delete them).



A developer can extend FEDn with their own Aggregator(s) by implementing the interface specified in
:py:mod:`fedn.network.combiner.aggregators.aggregatorbase.AggregatorBase`. The developer implements the following two methods:

- ``on_model_update`` (optional)
- ``combine_models``

on_model_update
----------------

The on_model_update callback has access to the complete model update, including the metadata passed on by the clients (as specified in the training entrypoint; see compute package). The base class implements a default callback that checks that all metadata assumed by the aggregation algorithms FedAvg and FedAdam is present. However, the callback can also be used to implement custom preprocessing and additional checks, including strategies to filter out updates that are suspected to be corrupted or malicious.

combine_models
--------------

This method is responsible for processing the model update queue and, in doing so, producing an aggregated model. This is the main extension point, where the numerical details of the aggregation scheme are implemented. The best way to understand how to implement this method is to study the already implemented algorithms:

- :py:mod:`fedn.network.combiner.aggregators.fedavg`
- :py:mod:`fedn.network.combiner.aggregators.fedopt`

To add an aggregator plugin “myaggregator”, the developer implements the interface and places a file called ‘myaggregator.py’ in the folder ‘fedn.network.combiner.aggregators’.
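
For orientation, a minimal plugin skeleton might look as follows — a sketch only: the two method names follow the docs above, while the constructor arguments and the helper calls (``load_model_update``, ``increment_average``) are assumptions modelled on the built-in FedAvg implementation, and should be verified against :py:mod:`fedn.network.combiner.aggregators.aggregatorbase` for your FEDn version.

.. code-block:: python

   # fedn/network/combiner/aggregators/myaggregator.py -- illustrative sketch.
   # Signatures below are assumptions based on the built-in FedAvg aggregator.
   from fedn.network.combiner.aggregators.aggregatorbase import AggregatorBase


   class Aggregator(AggregatorBase):
       """Toy aggregator: unweighted running average of all queued updates."""

       def __init__(self, storage, server, modelservice, control):
           super().__init__(storage, server, modelservice, control)
           self.name = "myaggregator"

       def on_model_update(self, model_update):
           # Optional callback: validate the incoming update message before
           # placing it on the aggregation queue.
           self.model_updates.put(model_update)

       def combine_models(self, helper=None, delete_models=True):
           # Drain the aggregation queue, folding each update into a running mean.
           model, nr_models = None, 0
           while not self.model_updates.empty():
               update = self.model_updates.get()
               parameters = self.load_model_update(update, helper)
               nr_models += 1
               if model is None:
                   model = parameters
               else:
                   model = helper.increment_average(model, parameters, 1, nr_models)
           return model, {"nr_aggregated_models": nr_models}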


21 changes: 21 additions & 0 deletions docs/apiclient.rst
@@ -0,0 +1,21 @@
APIClient
===============

FEDn comes with an *APIClient* for interacting with the FEDn network. The APIClient is a Python3 library that can be used to interact with the FEDn network programmatically.


The APIClient is available as a Python package on PyPI, and can be installed using pip:

.. code-block:: bash
$ pip install fedn
To initialize the APIClient, you need to provide the hostname and port of the FEDn API server. The default port is 8092. The following code snippet shows how to initialize the APIClient:

.. code-block:: python
from fedn import APIClient
client = APIClient("localhost", 8092)
For more information on how to use the APIClient, see the :py:mod:`fedn.network.api.client`, and the example `Notebooks <https://github.com/scaleoutsystems/fedn/blob/master/examples/mnist-pytorch/API_Example.ipynb>`_.
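
A short usage sketch follows; the method names (``set_package``, ``set_initial_model``, ``start_session``) are assumptions based on the linked example notebook and may differ between FEDn versions:

.. code-block:: python

   # Hedged usage sketch -- verify the method names against
   # fedn.network.api.client for your FEDn version.
   from fedn import APIClient

   client = APIClient("localhost", 8092)
   client.set_package("package.tgz", helper="numpyhelper")  # upload compute package
   client.set_initial_model("seed.npz")                     # upload seed model
   session = client.start_session(session_id="experiment-1", rounds=3)
   print(session)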
9 changes: 1 addition & 8 deletions docs/architecture.rst
@@ -51,13 +51,6 @@ Notes on aggregating algorithms
 FEDn is designed to allow customization of the FedML algorithm, following a specified pattern, or programming model.
 Model aggregation happens on two levels in the network. First, each Combiner can be configured with a custom orchestration and aggregation implementation, that reduces model updates from Clients into a single, *combiner level* model.
 Then, a configurable aggregation protocol on the *Controller* level is responsible for combining the combiner-level models into a global model. By varying the aggregation schemes on the two levels in the system,
-many different possible outcomes can be achieved. Good starting configurations are provided out-of-the-box to help the user get started. See API reference for more details.
-
-Hierarchical Federated Averaging
-................................
-
-The currently implemented default scheme uses a local SGD strategy on the Combiner level aggregation and a simple average of models on the reducer level.
-This results in a highly horizontally scalable FedAvg scheme. The strategy works well with most artificial neural network (ANNs) models,
-and can in general be applied to models where it is possible and makes sense to form mean values of model parameters (for example SVMs).
+many different possible outcomes can be achieved. Good starting configurations are provided out-of-the-box to help the user get started. See :ref:`agg-label` and API reference for more details.

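For intuition, if both levels use data-size weights, the two-level scheme can be read as nested weighted averages (the notation here is illustrative, not from the FEDn docs):

.. math::

   \bar{w}_k = \frac{\sum_{i \in C_k} n_i \, w_i}{\sum_{i \in C_k} n_i},
   \qquad
   w_{\mathrm{global}} = \frac{\sum_k N_k \, \bar{w}_k}{\sum_k N_k},
   \qquad
   N_k = \sum_{i \in C_k} n_i,

where :math:`w_i` is the update from client :math:`i`, :math:`n_i` its number of training examples, and :math:`C_k` the set of clients assigned to combiner :math:`k`. With these weights, the nested average reduces to flat FedAvg over all clients.
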
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -12,7 +12,7 @@
 author = 'Scaleout Systems AB'
 
 # The full version, including alpha/beta/rc tags
-release = '0.6.0'
+release = '0.8.0'
 
 # Add any Sphinx extension module names here, as strings
 extensions = [