
Commit 050a4b4

discojs-core/tasks: add wikitext

1 parent d928f74

2 files changed: +43 −2

@@ -1,7 +1,8 @@
 export { cifar10 } from './cifar10'
+export { geotags } from './geotags'
 export { lusCovid } from './lus_covid'
 export { mnist } from './mnist'
-export { titanic } from './titanic'
 export { simpleFace } from './simple_face'
-export { geotags } from './geotags'
 export { skinMnist } from './skin_mnist'
+export { titanic } from './titanic'
+export { wikitext } from './wikitext'
@@ -0,0 +1,40 @@
+import type { Model, Task, TaskProvider } from '..'
+import { TrainingSchemes, models } from '..'
+
+export const wikitext: TaskProvider = {
+  getTask (): Task {
+    return {
+      id: 'wikitext-103',
+      displayInformation: {
+        taskTitle: 'Language modelling on wikitext',
+        summary: {
+          preview: 'In this challenge, we ask you to do next word prediction on a dataset of Wikipedia articles.',
+          overview: 'Wikitext-103-raw is a dataset comprising unprocessed text excerpts from Wikipedia articles, designed for tasks related to natural language processing and language modeling.'
+        },
+        limitations: 'The dataset may contain noise, inconsistencies, and unstructured content due to its raw nature, potentially posing challenges for certain NLP tasks.',
+        tradeoffs: 'The raw format may lack structured annotations and may require additional preprocessing for specific applications.',
+        dataFormatInformation: 'The dataset is organized as a large text file, with each line representing a segment of raw text from Wikipedia articles.',
+        dataExampleText: 'An example excerpt from the dataset could be: "The history of artificial intelligence dates back to ancient times, with philosophical discussions on the nature of thought and reasoning."'
+      },
+      trainingInformation: {
+        dataType: 'text',
+        modelID: 'wikitext-103-raw-model',
+        validationSplit: 0.2, // TODO: is this used anywhere? train, eval and test splits already exist in the dataset
+        epochs: 10_000,
+        // constructing batches is taken care of automatically by the dataset to make things faster,
+        // so we use a fake batch size of 1 here
+        batchSize: 1,
+        scheme: TrainingSchemes.FEDERATED,
+        noiseScale: undefined,
+        decentralizedSecure: true,
+        minimumReadyPeers: 3,
+        maxShareValue: 100,
+        roundDuration: 10
+      }
+    }
+  },
+
+  async getModel (): Promise<Model> {
+    return new models.GPT()
+  }
+}
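
For context, the TaskProvider added here pairs a static task description with a model factory: getTask () returns the task metadata synchronously, while getModel () asynchronously constructs a fresh models.GPT. A minimal consumption sketch follows, assuming the task is imported directly from the new file; the import path, the demo function and the logged fields are illustrative assumptions based on the diff, not part of this commit:

import { wikitext } from './wikitext' // hypothetical relative path; the tasks index above also re-exports it

async function demo (): Promise<void> {
  const task = wikitext.getTask()
  console.log(task.id) // 'wikitext-103'
  console.log(task.trainingInformation.scheme) // TrainingSchemes.FEDERATED

  const model = await wikitext.getModel() // a fresh models.GPT instance
  console.log(model)
}

void demo()

Note the fake batchSize of 1: as the in-code comment explains, batches are assembled inside the dataset pipeline, so the value here is a placeholder rather than the effective batch size.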
