Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FedProx implementation #837

Draft
wants to merge 27 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1e05750
webapp/components/containers/testing: show image labels in all caps
JulienVig Nov 25, 2024
683ccb3
webapp/components/containers/ImageCard: improve image hover effect
JulienVig Nov 25, 2024
40de1e9
webapp/components/training/TrainingInformation: display training char…
JulienVig Nov 25, 2024
6916223
discojs/src/training/disco: handle validation split ratio equals zero
JulienVig Nov 25, 2024
4b22ef7
discojs/client/federated/federated_client: wait indefinitely for serv…
JulienVig Nov 25, 2024
7a457a4
disco/default_tasks: implement GDHF tinder dog task
JulienVig Nov 25, 2024
c573bb5
cli: setup tinder dog CLI support
JulienVig Nov 25, 2024
0733ae0
server/controllers/federated_controller: reset state when training se…
JulienVig Nov 26, 2024
bb6242d
webapp/components/dataset_input/LabeledImageDatasetInput/ByGroup: shu…
JulienVig Nov 27, 2024
da536eb
discojs/default_tasks/tinder_dog: improve textual description and lin…
JulienVig Nov 27, 2024
b0be745
discojs/models: use trainOnBatch instead of fit and fitDataset
JulienVig Nov 28, 2024
83a12d7
cli/src: support wikitext task
JulienVig Nov 28, 2024
c52ef11
discojs/models/gpt: compute logits only once, 10% faster
JulienVig Nov 28, 2024
d287795
fixup! discojs/models: use trainOnBatch instead of fit and fitDataset
JulienVig Nov 28, 2024
2391c1b
tmp: overriding weight update yields same as default
JulienVig Nov 28, 2024
a3dfaf4
tmp: sketch of fedprox implementation
JulienVig Nov 28, 2024
9ad8981
discojs/models: use trainOnBatch instead of fit and fitDataset
JulienVig Nov 28, 2024
c20ad82
discojs/models/gpt: compute logits only once, 10% faster
JulienVig Nov 28, 2024
abe7996
tmp: overriding weight update yields same as default
JulienVig Feb 25, 2025
76977e7
tmp: sketch of fedprox implementation
JulienVig Nov 28, 2024
d6952c8
Merge branch 'develop' of github.com:epfml/disco into 802-fedprox-julien
tomasoignons Mar 3, 2025
0e1f4e5
Added the FedAverage training
tomasoignons Mar 14, 2025
576a3ef
Begin to implement the choice between fedaverage and FedProx
tomasoignons Mar 14, 2025
c0d4196
Merge branch '802-fedprox-julien' of github.com:epfml/disco into 802-…
tomasoignons Mar 14, 2025
f88856f
added the fedprox by default if nothing is specified
tomasoignons Mar 14, 2025
de00d93
cli: cleanup node_modules
tharvik Mar 31, 2025
a5ecff3
cli: drop local immutable
tharvik Mar 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
"author": "",
"license": "ISC",
"dependencies": {
"server": "*",
"@epfml/discojs-node": "*",
"csv-parse": "^5.6.0",
"server": "*",
"tslib": "2"
},
"devDependencies": {
Expand Down
2 changes: 1 addition & 1 deletion cli/src/args.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ const unsafeArgs = parse<BenchmarkUnsafeArguments>(
)

const supportedTasks = Map(
Set.of<TaskProvider<"image"> | TaskProvider<"tabular">>(
Set.of<TaskProvider<"image"> | TaskProvider<"tabular"> | TaskProvider<"text">>(
defaultTasks.cifar10,
defaultTasks.lusCovid,
defaultTasks.simpleFace,
Expand Down
2 changes: 1 addition & 1 deletion cli/src/benchmark_gpt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,4 @@ async function main(args: Required<CLIArguments>): Promise<void> {
}

// You can run this example with "npm start" from this folder
main(args).catch(console.error)
main(args).catch(console.error)
2 changes: 1 addition & 1 deletion cli/src/train_gpt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ async function main(): Promise<void> {
}

// You can run this example with "npm run run_gpt" from this folder
main().catch(console.error)
main().catch(console.error)
2 changes: 1 addition & 1 deletion discojs/src/default_tasks/cifar10.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,6 @@ export const cifar10: TaskProvider<'image'> = {
metrics: ['accuracy']
})

return new models.TFJS('image', model)
return new models.TFJS('image', model, "fedprox")
}
}
19 changes: 10 additions & 9 deletions discojs/src/default_tasks/lus_covid.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ export const lusCovid: TaskProvider<'image'> = {

// Model architecture from tensorflow.js docs:
// https://codelabs.developers.google.com/codelabs/tfjs-training-classfication/index.html#4
async getModel (): Promise<Model<'image'>> {
async getModel(): Promise<Model<'image'>> {
const seed = 42
const imageHeight = 100
const imageWidth = 100
const imageChannels = 3
Expand All @@ -55,7 +56,7 @@ export const lusCovid: TaskProvider<'image'> = {
filters: 8,
strides: 1,
activation: 'relu',
kernelInitializer: 'varianceScaling'
kernelInitializer: tf.initializers.heNormal({ seed })
}))

// The MaxPooling layer acts as a sort of downsampling using max values
Expand All @@ -69,7 +70,7 @@ export const lusCovid: TaskProvider<'image'> = {
filters: 16,
strides: 1,
activation: 'relu',
kernelInitializer: 'varianceScaling'
kernelInitializer: tf.initializers.heNormal({ seed })
}))
model.add(tf.layers.maxPooling2d({ poolSize: [2, 2], strides: [2, 2] }))

Expand All @@ -82,16 +83,16 @@ export const lusCovid: TaskProvider<'image'> = {
// output class.
model.add(tf.layers.dense({
units: numOutputClasses,
kernelInitializer: 'varianceScaling',
activation: 'softmax'
activation: 'softmax',
kernelInitializer: tf.initializers.heNormal({ seed })
}))

model.compile({
optimizer: 'sgd',
optimizer: tf.train.sgd(0.001),
loss: 'binaryCrossentropy',
metrics: ['accuracy']
})

return Promise.resolve(new models.TFJS('image', model))
return Promise.resolve(new models.TFJS('image', model, "fedprox"))
}
}
}
2 changes: 1 addition & 1 deletion discojs/src/default_tasks/mnist.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,6 @@ export const mnist: TaskProvider<'image'> = {
metrics: ['accuracy']
})

return Promise.resolve(new models.TFJS('image', model))
return Promise.resolve(new models.TFJS('image', model, "fedprox"))
}
}
2 changes: 1 addition & 1 deletion discojs/src/default_tasks/simple_face.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,6 @@ export const simpleFace: TaskProvider<'image'> = {
metrics: ['accuracy']
})

return new models.TFJS('image', model)
return new models.TFJS('image', model, "fedprox")
}
}
2 changes: 1 addition & 1 deletion discojs/src/default_tasks/tinder_dog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,6 @@ export const tinderDog: TaskProvider<'image'> = {
metrics: ['accuracy']
})

return Promise.resolve(new models.TFJS('image', model))
return Promise.resolve(new models.TFJS('image', model, "fedprox"))
}
}
2 changes: 1 addition & 1 deletion discojs/src/default_tasks/titanic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,6 @@ export const titanic: TaskProvider<'tabular'> = {
metrics: ['accuracy']
})

return Promise.resolve(new models.TFJS('tabular', model))
return Promise.resolve(new models.TFJS('tabular', model, "fedprox"))
}
}
28 changes: 4 additions & 24 deletions discojs/src/models/gpt/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,30 +76,10 @@ export class GPT extends Model<"text"> {
async #runBatch(
batch: Batched<DataFormat.ModelEncoded["text"]>,
): Promise<BatchLogs> {
const tfBatch = this.#batchToTF(batch);

let logs: tf.Logs | undefined;
await this.model.fitDataset(tf.data.array([tfBatch]), {
epochs: 1,
verbose: 0, // don't pollute
callbacks: {
onEpochEnd: (_, cur) => {
logs = cur;
},
},
});
tf.dispose(tfBatch);
if (logs === undefined) throw new Error("batch didn't gave any logs");

const { loss, acc: accuracy } = logs;
if (loss === undefined || isNaN(loss))
throw new Error("training loss is undefined or NaN");

return {
accuracy,
loss,
memoryUsage: tf.memory().numBytes / 1024 / 1024 / 1024,
};
const {xs, ys} = this.#batchToTF(batch);
const logs = await this.model.trainOnBatch(xs, ys);
tf.dispose([xs, ys])
return this.getBatchLogs(logs)
}

async #evaluate(
Expand Down
136 changes: 43 additions & 93 deletions discojs/src/models/gpt/model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import * as tf from '@tensorflow/tfjs'
import type { GPTConfig } from './config.js'
import { getModelSizes, DefaultGPTConfig } from './config.js'
import { getCustomAdam, clipByGlobalNormObj } from './optimizers.js'
import evaluate from './evaluate.js'
import { GPTArchitecture } from './layers.js'

const debug = createDebug("discojs:models:gpt:model");
Expand Down Expand Up @@ -55,101 +54,52 @@ export class GPTModel extends tf.LayersModel {
: tf.train.adam(this.config.lr)
}

override async fitDataset<T>(dataset: Dataset<T>, trainingArgs: tf.ModelFitDatasetArgs<T>): Promise<tf.History> {
const callbacks = trainingArgs.callbacks as tf.CustomCallbackArgs
const evalDataset = trainingArgs.validationData as tf.data.Dataset<{ xs: tf.Tensor2D, ys: tf.Tensor3D }>
await callbacks.onTrainBegin?.()
override async trainOnBatch(x: tf.Tensor, y: tf.Tensor): Promise<number | number[]> {
let weightUpdateTime = performance.now()

for (let epoch = 1; epoch <= trainingArgs.epochs; epoch++) {
let accuracyFraction: [number, number] = [0, 0];
let averageLoss = 0
let iteration = 1
const iterator = await dataset.iterator()
let next = await iterator.next()
let preprocessingTime = performance.now()
await Promise.all([x.data(), y.data()])
preprocessingTime = performance.now() - preprocessingTime

while (next.done !== true && iteration <= this.config.maxIter) {
let weightUpdateTime = performance.now()
await callbacks.onEpochBegin?.(epoch)
const { xs, ys } = next.value as { xs: tf.Tensor2D, ys: tf.Tensor3D }
let logitsTensor: tf.Tensor<tf.Rank>;
const lossTensor = tf.tidy(() => {
const { grads, value: lossTensor } = this.optimizer.computeGradients(() => {
const logits = this.apply(x)
if (Array.isArray(logits))
throw new Error('model outputs too many tensor')
if (logits instanceof tf.SymbolicTensor)
throw new Error('model outputs symbolic tensor')
logitsTensor = tf.keep(logits)
return tf.losses.softmaxCrossEntropy(y, logits)
})
const gradsClipped = clipByGlobalNormObj(grads, 1)
this.optimizer.applyGradients(gradsClipped)
return lossTensor
})

let preprocessingTime = performance.now()
await Promise.all([xs.data(), ys.data()])
preprocessingTime = performance.now() - preprocessingTime

// TODO include as a tensor inside the model
const accTensor = tf.tidy(() => {
const logits = this.apply(xs)
if (Array.isArray(logits))
throw new Error('model outputs too many tensor')
if (logits instanceof tf.SymbolicTensor)
throw new Error('model outputs symbolic tensor')
return tf.metrics.categoricalAccuracy(ys, logits)
})
const accSize = accTensor.shape.reduce((l, r) => l * r, 1)
const accSumTensor = accTensor.sum()
const accSum = await accSumTensor.array()
tf.dispose(accSumTensor)
if (typeof accSum !== 'number')
throw new Error('got multiple accuracy sum')
accuracyFraction = [accuracyFraction[0] + accSum, accuracyFraction[1] + accSize];
tf.dispose([accTensor])
// @ts-expect-error Variable 'logitsTensor' is used before being assigned
const accTensor = tf.metrics.categoricalAccuracy(y, logitsTensor)
const accSize = accTensor.shape.reduce((l, r) => l * r, 1)
const accSumTensor = accTensor.sum()
const accSum = await accSumTensor.array()
if (typeof accSum !== 'number')
throw new Error('got multiple accuracy sum')
// @ts-expect-error Variable 'logitsTensor' is used before being assigned
tf.dispose([accTensor, accSumTensor, logitsTensor])

const loss = await lossTensor.array()
weightUpdateTime = performance.now() - weightUpdateTime

const lossTensor = tf.tidy(() => {
const { grads, value: lossTensor } = this.optimizer.computeGradients(() => {
const logits = this.apply(xs)
if (Array.isArray(logits))
throw new Error('model outputs too many tensor')
if (logits instanceof tf.SymbolicTensor)
throw new Error('model outputs symbolic tensor')
return tf.losses.softmaxCrossEntropy(ys, logits)
})
const gradsClipped = clipByGlobalNormObj(grads, 1)
this.optimizer.applyGradients(gradsClipped)
return lossTensor
})

const loss = await lossTensor.array()
averageLoss += loss
weightUpdateTime = performance.now() - weightUpdateTime

tf.dispose([xs, ys, lossTensor])

if (
evalDataset !== undefined &&
this.config.evaluateEvery !== undefined &&
iteration % this.config.evaluateEvery == 0
){
const iterationLogs = await evaluate(this, evalDataset, this.config.maxEvalBatches)
debug('evaluation metrics: %O', iterationLogs);
}
const memory = tf.memory().numBytes / 1024 / 1024 / 1024
debug("training metrics: %O", {
epoch,
iteration,
loss,
memory,
allocated: tf.memory().numTensors,
preprocessingTime,
weightUpdateTime,
});
iteration++
next = await iterator.next()
}
// Memory leak: If we reached the last iteration rather than the end of the dataset, cleanup the tensors
if (next.done !== true && iteration > this.config.maxIter) {
const { xs, ys } = next.value as { xs: tf.Tensor2D, ys: tf.Tensor3D }
tf.dispose([xs, ys])
}
let logs: tf.Logs = {
'loss': averageLoss / (iteration - 1), // -1 because iteration got incremented at the end of the loop
'acc': accuracyFraction[0] / accuracyFraction[1],
}
if (evalDataset !== undefined) {
logs = { ...logs, ...await evaluate(this, evalDataset, this.config.maxEvalBatches) }
}
await callbacks.onEpochEnd?.(epoch, logs)
}
await callbacks.onTrainEnd?.()
return new tf.History()
tf.dispose([x, y, lossTensor])

const memory = tf.memory().numBytes / 1024 / 1024 / 1024
debug("training metrics: %O", {
loss,
memory,
allocated: tf.memory().numTensors,
preprocessingTime,
weightUpdateTime,
});
return [loss, accSum / accSize]
}
}
26 changes: 26 additions & 0 deletions discojs/src/models/model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import type {
WeightsContainer,
} from "../index.js";

import * as tf from "@tensorflow/tfjs";

import type { BatchLogs, EpochLogs } from "./logs.js";

/**
Expand All @@ -15,12 +17,16 @@ import type { BatchLogs, EpochLogs } from "./logs.js";
**/
// TODO make it typesafe: same shape of data/input/weights
export abstract class Model<D extends DataType> implements Disposable {
protected prevRoundWeights: WeightsContainer | undefined;
// TODO don't allow external access but upgrade train to return weights on every epoch
/** Return training state */
abstract get weights(): WeightsContainer;
/** Set training state */
abstract set weights(ws: WeightsContainer);

// Record the weights from the previous federated round.
// NOTE(review): presumably consumed by the FedProx proximal term — confirm
// against the trainer that sets it. Write-only accessor (no matching getter);
// subclasses read the protected `prevRoundWeights` field directly.
// Passing `undefined` clears the stored weights.
set previousRoundWeights(ws: WeightsContainer | undefined) {
this.prevRoundWeights = ws
}
/**
* Improve predictor
*
Expand All @@ -39,6 +45,26 @@ export abstract class Model<D extends DataType> implements Disposable {
batch: Batched<DataFormat.ModelEncoded[D][0]>,
): Promise<Batched<DataFormat.ModelEncoded[D][1]>>;

/**
 * Convert the raw output of `tf.LayersModel.trainOnBatch` into structured
 * batch logs.
 *
 * @param logs output of `trainOnBatch`; expected to be a `[loss, accuracy]`
 *   pair — a bare number (loss only, i.e. a model compiled without metrics)
 *   is rejected
 * @returns the loss, the accuracy and the current TF.js memory usage in GiB
 * @throws Error if `logs` is not a two-element array of non-NaN numbers
 */
protected getBatchLogs(
  logs: number | number[],
): BatchLogs {
  // trainOnBatch returns a single number when the model has no metrics;
  // callers here are expected to compile with an accuracy metric.
  if (!Array.isArray(logs) || logs.length !== 2)
    throw new Error("training output has unexpected shape");

  const [loss, accuracy] = logs;

  // Number.isNaN avoids the implicit coercion of the global isNaN;
  // equivalent here since the typeof guard runs first.
  if (
    typeof loss !== "number" || Number.isNaN(loss) ||
    typeof accuracy !== "number" || Number.isNaN(accuracy)
  )
    throw new Error("training loss or accuracy is undefined or NaN");

  return {
    accuracy,
    loss,
    // tf.memory().numBytes is in bytes; convert to GiB
    memoryUsage: tf.memory().numBytes / 1024 / 1024 / 1024,
  };
}
/**
* This method is automatically called to cleanup the memory occupied by the model
* when leaving the definition scope if the instance has been defined with the `using` keyword.
Expand Down
Loading
Loading