new hugging face example: email spam detection
Jonas Frankemölle authored and Jonas Frankemölle committed May 2, 2024
1 parent 4e43279 commit 178e467
Showing 9 changed files with 295 additions and 459 deletions.
2 changes: 1 addition & 1 deletion examples/huggingface/.gitignore
@@ -2,5 +2,5 @@ data
*.npz
*.tgz
*.tar.gz
.mnist-pytorch
.huggingface_env
client.yaml
354 changes: 0 additions & 354 deletions examples/huggingface/API_Example.ipynb

This file was deleted.

152 changes: 151 additions & 1 deletion examples/huggingface/README.rst
@@ -1 +1,151 @@
Huggingface Example
Hugging Face Transformer Example
--------------------------------

This example project demonstrates how to use the Hugging Face Transformers library in FEDn.
A pre-trained BERT-tiny model from Hugging Face is fine-tuned to perform spam detection
on the Enron spam email dataset.

Email communication often contains personal and sensitive information, and privacy regulations can make it
impossible to collect the data in a central storage location for model training.
Federated learning is a privacy-preserving machine learning technique that enables training models on decentralized data sources.
Fine-tuning large language models (LLMs) across multiple data sources improves both accuracy and generalizability.
In this example, the Enron spam email dataset is split between two clients, and the BERT-tiny model is fine-tuned on the client data using
federated learning to predict whether an email is spam or not.
The sketch below illustrates the local fine-tuning step; the sections that follow walk through running the full example.
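
To give a feel for what each client does locally, here is a minimal sketch of fine-tuning BERT-tiny on a small slice of the
Enron spam data. It is illustrative only: the model and dataset identifiers match this example, but the training loop is a
simplified stand-in for the actual client code in the ``client/`` directory.

.. code-block::

   # Minimal sketch (not the example's client code): one gradient step of
   # fine-tuning BERT-tiny on a small slice of the Enron spam dataset.
   import torch
   from datasets import load_dataset
   from transformers import AutoModelForSequenceClassification, AutoTokenizer

   model_name = "google/bert_uncased_L-2_H-128_A-2"  # BERT-tiny
   tokenizer = AutoTokenizer.from_pretrained(model_name)
   model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

   data = load_dataset("SetFit/enron_spam", split="train[:64]")
   batch = tokenizer(list(data["text"]), truncation=True, padding=True,
                     max_length=128, return_tensors="pt")
   labels = torch.tensor(data["label"])

   optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
   model.train()
   loss = model(**batch, labels=labels).loss  # forward pass with labels returns the loss
   loss.backward()
   optimizer.step()
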

Prerequisites
-------------

Using FEDn Studio:

- `Python 3.8, 3.9, 3.10 or 3.11 <https://www.python.org/downloads>`__
- `A FEDn Studio account <https://fedn.scaleoutsystems.com/signup>`__

If using pseudo-distributed mode with docker-compose:

- `Docker <https://docs.docker.com/get-docker>`__
- `Docker Compose <https://docs.docker.com/compose/install>`__

Creating the compute package and seed model
-------------------------------------------

Install fedn:

.. code-block::

   pip install fedn

Clone this repository, then navigate to this directory:

.. code-block::

   git clone https://github.com/scaleoutsystems/fedn.git
   cd fedn/examples/huggingface

Create the compute package:

.. code-block::

   fedn package create --path client

This should create a file 'package.tgz' in the project folder.

Next, generate a seed model (the first model in a global model trail):

.. code-block::

   fedn run build --path client

This creates a seed model called 'seed.npz' in the root of the project. The step can take a few minutes, depending on hardware and internet connection, since it builds a virtualenv.
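
For context, the seed model is simply the initial model parameters serialized with FEDn's numpy helper.
Below is a rough sketch of that idea, assuming the helper's ``save`` interface used across the FEDn examples; the actual
build step is defined by the compute package in ``client/``.

.. code-block::

   # Sketch only: serialize initial BERT-tiny weights the way FEDn's
   # numpyhelper stores model parameters (a list of numpy arrays -> .npz).
   from fedn.utils.helpers.helpers import get_helper
   from transformers import AutoModelForSequenceClassification

   helper = get_helper("numpyhelper")
   model = AutoModelForSequenceClassification.from_pretrained(
       "google/bert_uncased_L-2_H-128_A-2", num_labels=2)

   parameters = [val.cpu().numpy() for val in model.state_dict().values()]
   helper.save(parameters, "seed.npz")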



Using FEDn Studio (recommended)
-------------------------------

Setting up the project in FEDn Studio:
--------------------------------------

Follow the instructions to register for FEDn Studio and start a project (https://fedn.readthedocs.io/en/stable/studio.html).

In your Studio project:

- Go to the 'Sessions' menu, click on 'New session', and upload the compute package (package.tgz) and seed model (seed.npz).
- In the 'Clients' menu, click on 'Connect client' and download the client configuration file (client.yaml).
- Save the client configuration file to the huggingface example directory (fedn/examples/huggingface).

To connect a client, run the following command in your terminal:

.. code-block::

   fedn run client -in client.yaml --secure=True --force-ssl

Alternatively, if you prefer to use Docker, run the following:

.. code-block::

   docker run \
     -v $PWD/client.yaml:/app/client.yaml \
     -e CLIENT_NUMBER=0 \
     -e FEDN_PACKAGE_EXTRACT_DIR=package \
     ghcr.io/scaleoutsystems/fedn/fedn:0.9.0 run client -in client.yaml --secure=True --force-ssl

Running the example
-------------------

After everything is set up, go to 'Sessions' and click on 'New Session'. Click on 'Start run' and the example
will execute. You can follow the training progress on 'Events' and 'Models', where you can view the calculated metrics.



Running FEDn in local development mode:
---------------------------------------

Create the compute package and seed model as explained above. Then run the following command:


.. code-block::

   docker-compose \
     -f ../../docker-compose.yaml \
     -f docker-compose.override.yaml \
     up

This starts up local services for MongoDB, Minio, the API Server, one Combiner and two clients. You can verify the deployment using these URLs:

- `API Server: <http://localhost:5000>`__
- `Minio: <http://localhost:9000>`__
- `Mongo Express: <http://localhost:27017>`__


Upload the package and seed model to the FEDn controller using the APIClient:

.. code-block::

   from fedn import APIClient
   client = APIClient(host="localhost", port=8092)
   client.set_active_package("package.tgz", helper="numpyhelper")
   client.set_active_model("seed.npz")

You can now start a training session with 5 rounds (default) using the API client:

.. code-block::

   client.start_session()

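``start_session`` also takes optional arguments, for example the number of rounds. A small hedged example (the keyword name
is assumed from recent FEDn releases; check ``help(client.start_session)`` against your installed version):

.. code-block::

   # 'rounds' is assumed to be the keyword for the number of training rounds.
   client.start_session(rounds=10)
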
Clean up
--------

You can clean up by running:

.. code-block::

   docker-compose \
     -f ../../docker-compose.yaml \
     -f docker-compose.override.yaml \
     down -v

104 changes: 32 additions & 72 deletions examples/huggingface/client/data.py
@@ -1,61 +1,26 @@
import os
import requests
import tarfile
from pathlib import Path
import torch
import numpy as np
from datasets import load_dataset
from math import floor


dir_path = os.path.dirname(os.path.realpath(__file__))
abs_path = os.path.abspath(dir_path)


def get_data(out_dir='data'):
# Make dir if necessary
if not os.path.exists(out_dir):
os.mkdir(out_dir)

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
save_path = f"{out_dir}/aclImdb_v1.tar.gz"

# Download the file
response = requests.get(url, stream=True)
if response.status_code == 200:
with open(save_path, 'wb') as f:
f.write(response.raw.read())
print("data download completed")
else:
print("could not download data")

# unzip file
with tarfile.open(save_path, "r:gz") as tar:
tar.extractall(path=out_dir)
print("data extraction completed")

def read_imdb_split(split_dir):
split_dir = Path(split_dir)
texts = []
labels = []
for label_dir in ["pos", "neg"]:
for text_file in (split_dir/label_dir).iterdir():
texts.append(text_file.read_text())
labels.append(0 if label_dir is "neg" else 1)

return texts, labels

def load_data(data_path):
def load_data(data_path=None, is_train=True):
if data_path is None:
data_path = os.environ.get("FEDN_DATA_PATH", abs_path+'/data/clients/1/imdb.pt')

data_path = os.environ.get("FEDN_DATA_PATH", abs_path+'/data/clients/1/enron_spam.pt')
data = torch.load(data_path)

train_texts = list(data['train_texts'])
train_labels = list(data['train_labels'])
test_texts = list(data['test_texts'])
test_labels = list(data['test_labels'])

return train_texts, train_labels, test_texts, test_labels
if is_train:
X = data['X_train']
y = data['y_train']
else:
X = data['X_test']
y = data['y_test']
return X, y


def splitset(dataset, parts):
@@ -72,30 +37,26 @@ def split(out_dir='data', n_splits=2):
if not os.path.exists(f'{out_dir}/clients'):
os.makedirs(f'{out_dir}/clients')

# Load and convert to dict
train_texts, train_labels = read_imdb_split(f'{out_dir}/aclImdb/train')
test_texts, test_labels = read_imdb_split(f'{out_dir}/aclImdb/test')

# Shuffle train data
perm = np.random.permutation(len(train_texts))
train_texts = np.array(train_texts)[perm]
train_labels = np.array(train_labels)[perm]
# shuffle test data
perm = np.random.permutation(len(train_texts))
test_texts = np.array(test_texts)[perm]
test_labels = np.array(test_labels)[perm]
dataset = load_dataset("SetFit/enron_spam")
train_data = dataset["train"].to_pandas()
test_data = dataset["test"].to_pandas()

X_train = train_data["text"].values
y_train = train_data["label"].values
X_test = test_data["text"].values
y_test = test_data["label"].values

# Reduce data size
train_texts = train_texts[:500]
train_labels = train_labels[:500]
test_texts = test_texts[:50]
test_labels = train_labels[:50]
X_train = X_train[:2000]
y_train = y_train[:2000]
X_test = X_test[:200]
y_test = y_test[:200]

data = {
'train_texts': splitset(train_texts, n_splits),
'train_labels': splitset(train_labels, n_splits),
'test_texts': splitset(test_texts, n_splits),
'test_labels': splitset(test_labels, n_splits),
'X_train': splitset(X_train, n_splits),
'y_train': splitset(y_train, n_splits),
'X_test': splitset(X_test, n_splits),
'y_test': splitset(y_test, n_splits),
}

# Make splits
@@ -104,16 +65,15 @@ def split(out_dir='data', n_splits=2):
if not os.path.exists(subdir):
os.mkdir(subdir)
torch.save({
'train_texts': data['train_texts'][i],
'train_labels': data['train_labels'][i],
'test_texts': data['test_texts'][i],
'test_labels': data['test_labels'][i],
'X_train': data['X_train'][i],
'y_train': data['y_train'][i],
'X_test': data['X_test'][i],
'y_test': data['y_test'][i],
},
f'{subdir}/imdb.pt')
f'{subdir}/enron_spam.pt')


if __name__ == '__main__':
# Prepare data if not already done
if not os.path.exists(abs_path+'/data/clients/1'):
get_data()
split()
6 changes: 3 additions & 3 deletions examples/huggingface/client/model.py
@@ -1,21 +1,21 @@
import collections

import torch
from transformers import DistilBertForSequenceClassification
from transformers import AutoModelForSequenceClassification

from fedn.utils.helpers.helpers import get_helper

HELPER_MODULE = 'numpyhelper'
helper = get_helper(HELPER_MODULE)


def compile_model(base_model="distilbert-base-uncased"):
def compile_model(base_model="google/bert_uncased_L-2_H-128_A-2"):
""" Compile the pytorch model.
:return: The compiled model.
:rtype: torch.nn.Module
"""
model = DistilBertForSequenceClassification.from_pretrained(base_model)
model = AutoModelForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=2)
return model


5 changes: 3 additions & 2 deletions examples/huggingface/client/python_env.yaml
@@ -1,4 +1,4 @@
name: mnist-pytorch
name: huggingface_env
build_dependencies:
- pip
- setuptools
@@ -7,4 +7,5 @@ dependencies:
- torch==2.2.1
- torchvision==0.17.1
- fedn==0.9.0
- transformers==4.39.3
- transformers==4.39.3
- datasets==2.19.0
