
Commit 050a4b4

discojs-core/tasks: add wikitext

1 parent d928f74

2 files changed: +43 −2

@@ -1,7 +1,8 @@
 export { cifar10 } from './cifar10'
+export { geotags } from './geotags'
 export { lusCovid } from './lus_covid'
 export { mnist } from './mnist'
-export { titanic } from './titanic'
 export { simpleFace } from './simple_face'
-export { geotags } from './geotags'
 export { skinMnist } from './skin_mnist'
+export { titanic } from './titanic'
+export { wikitext } from './wikitext'
@@ -0,0 +1,40 @@
+import type { Model, Task, TaskProvider } from '..'
+import { TrainingSchemes, models } from '..'
+
+export const wikitext: TaskProvider = {
+  getTask (): Task {
+    return {
+      id: 'wikitext-103',
+      displayInformation: {
+        taskTitle: 'Language modelling on wikitext',
+        summary: {
+          preview: 'In this challenge, we ask you to do next word prediction on a dataset of Wikipedia articles.',
+          overview: 'Wikitext-103-raw is a dataset comprising unprocessed text excerpts from Wikipedia articles, designed for tasks related to natural language processing and language modeling.'
+        },
+        limitations: 'The dataset may contain noise, inconsistencies, and unstructured content due to its raw nature, potentially posing challenges for certain NLP tasks.',
+        tradeoffs: 'The raw format may lack structured annotations and may require additional preprocessing for specific applications.',
+        dataFormatInformation: 'The dataset is organized as a large text file, with each line representing a segment of raw text from Wikipedia articles.',
+        dataExampleText: 'An example excerpt from the dataset could be: "The history of artificial intelligence dates back to ancient times, with philosophical discussions on the nature of thought and reasoning."'
+      },
+      trainingInformation: {
+        dataType: 'text',
+        modelID: 'wikitext-103-raw-model',
+        validationSplit: 0.2, // TODO: is this used anywhere? train, eval and test splits already exist in the dataset
+        epochs: 10_000,
+        // constructing batches is taken care of automatically by the dataset to make things faster,
+        // so we use a fake batch size of 1 here
+        batchSize: 1,
+        scheme: TrainingSchemes.FEDERATED,
+        noiseScale: undefined,
+        decentralizedSecure: true,
+        minimumReadyPeers: 3,
+        maxShareValue: 100,
+        roundDuration: 10
+      }
+    }
+  },
+
+  async getModel (): Promise<Model> {
+    return new models.GPT()
+  }
+}
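
For context, the TaskProvider added here pairs a static task description with a model factory: getTask () returns the task metadata synchronously, while getModel () asynchronously constructs a fresh models.GPT. A minimal consumption sketch follows, assuming the task is imported directly from the new file; the import path, the demo function and the logged fields are illustrative assumptions based on the diff, not part of this commit:

import { wikitext } from './wikitext' // hypothetical relative path; the tasks index above also re-exports it

async function demo (): Promise<void> {
  const task = wikitext.getTask()
  console.log(task.id) // 'wikitext-103'
  console.log(task.trainingInformation.scheme) // TrainingSchemes.FEDERATED

  const model = await wikitext.getModel() // a fresh models.GPT instance
  console.log(model)
}

void demo()

Note the fake batchSize of 1: as the in-code comment explains, batches are assembled inside the dataset pipeline, so the value here is a placeholder rather than the effective batch size.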
