Skip to content

Commit 43ea7cc

Browse files
committed Mar 6, 2024
discojs-core/tasks: add wikitext
1 parent 26f9eb7 commit 43ea7cc

File tree

2 files changed

+49
-2
lines changed

2 files changed

+49
-2
lines changed
 

Diff for: ‎discojs/discojs-core/src/default_tasks/index.ts

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
export { cifar10 } from './cifar10'
2+
export { geotags } from './geotags'
23
export { lusCovid } from './lus_covid'
34
export { mnist } from './mnist'
4-
export { titanic } from './titanic'
55
export { simpleFace } from './simple_face'
6-
export { geotags } from './geotags'
76
export { skinMnist } from './skin_mnist'
7+
export { titanic } from './titanic'
8+
export { wikitext } from './wikitext'

Diff for: ‎discojs/discojs-core/src/default_tasks/wikitext.ts

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import type { Model, Task, TaskProvider } from '..'
2+
import { TrainingSchemes, models } from '..'
3+
4+
/**
 * Task provider for the Wikitext-103 task.
 *
 * Exposes two members: `getTask`, which returns the static task description
 * (identifier, user-facing display text and training settings), and
 * `getModel`, which returns a new `models.GPT` instance.
 */
export const wikitext: TaskProvider = {
5+
/**
 * Builds the task description.
 *
 * @returns a `Task` literal with id `'wikitext-103'`, its display
 * information and its training information.
 */
getTask (): Task {
6+
return {
7+
id: 'wikitext-103',
8+
// User-facing descriptive text for the task.
displayInformation: {
9+
taskTitle: 'Wikitext 103 Raw',
10+
summary: {
11+
preview:
12+
'In this challenge, we ask you to do next word prediction on a dataset of Wikipedia articles.',
13+
overview:
14+
'Wikitext-103-raw is a dataset comprising unprocessed text excerpts from Wikipedia articles, designed for tasks related to natural language processing and language modeling.'
15+
},
16+
limitations:
17+
'The dataset may contain noise, inconsistencies, and unstructured content due to its raw nature, potentially posing challenges for certain NLP tasks.',
18+
tradeoffs:
19+
'The raw format may lack structured annotations and may require additional preprocessing for specific applications.',
20+
dataFormatInformation:
21+
'The dataset is organized as a large text file, with each line representing a segment of raw text from Wikipedia articles.',
22+
dataExampleText:
23+
'An example excerpt from the dataset could be: "The history of artificial intelligence dates back to ancient times, with philosophical discussions on the nature of thought and reasoning."'
24+
},
25+
// Training settings for this task.
trainingInformation: {
26+
dataType: 'text',
27+
modelID: 'wikitext-103-raw-model',
28+
validationSplit: 0.2, // TODO: is this used somewhere? because train, eval and test are already split in dataset
29+
epochs: 10_000,
30+
// constructing a batch is taken care of automatically in the dataset to make things faster
31+
// so we fake a batch size of 1
32+
batchSize: 1,
33+
scheme: TrainingSchemes.DECENTRALIZED,
34+
// NOTE(review): explicitly left undefined; presumably equivalent to omitting the field — confirm against the Task type
noiseScale: undefined,
35+
decentralizedSecure: true,
36+
minimumReadyPeers: 3,
37+
maxShareValue: 100,
38+
roundDuration: 10
39+
}
40+
}
41+
},
42+
43+
/**
 * Creates the model for this task.
 *
 * Nothing is awaited in the body; the method is presumably `async` only to
 * match the `TaskProvider` signature (NOTE(review): confirm).
 *
 * @returns a promise resolving to a new `models.GPT`, constructed without arguments.
 */
async getModel (): Promise<Model> {
44+
return new models.GPT()
45+
}
46+
}

0 commit comments

Comments
 (0)