Skip to content

Commit c891efe

Browse files
committed
datasets: initial
1 parent 43ea7cc commit c891efe

File tree

18 files changed

+112
-207
lines changed

18 files changed

+112
-207
lines changed

.github/workflows/lint-test-build.yml

+50-33
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,21 @@ on:
66

77
env:
88
node_version: 16
9+
datasets-cache-key: datasets-${{ hashFiles('datasets/**') }}
910

1011
jobs:
11-
download-training-data:
12+
download-datasets:
1213
runs-on: ubuntu-latest
1314
steps:
1415
- uses: actions/checkout@v3
16+
with:
17+
lfs: true
18+
submodules: true
1519
- uses: actions/cache@v3
1620
with:
17-
path: example_training_data
18-
key: training_data
19-
- run: ./get_training_data.sh
20-
working-directory: ./
21+
path: datasets
22+
key: ${{ env.datasets-cache-key }}
23+
- run: ./datasets/populate
2124

2225
lint-lib-core:
2326
needs: [build-lib-core, build-lib-node]
@@ -213,19 +216,17 @@ jobs:
213216

214217
test-lib-core:
215218
needs:
216-
[
217-
build-lib-core,
218-
build-lib-node,
219-
build-server-docker,
220-
download-training-data,
221-
]
219+
[build-lib-core, build-lib-node, build-server-docker, download-datasets]
222220
runs-on: ubuntu-latest
223221
steps:
224222
- uses: actions/checkout@v3
223+
with:
224+
lfs: true
225+
submodules: true
225226
- uses: actions/cache@v3
226227
with:
227-
path: example_training_data
228-
key: training_data
228+
path: datasets
229+
key: ${{ env.datasets-cache-key }}
229230
- uses: actions/setup-node@v3
230231
with:
231232
node-version: ${{ env.node_version }}
@@ -235,14 +236,17 @@ jobs:
235236
- run: ./with_server npm --workspace=./discojs/discojs-core test
236237

237238
test-lib-node:
238-
needs: [build-lib-core, build-server-docker, download-training-data]
239+
needs: [build-lib-core, build-server-docker, download-datasets]
239240
runs-on: ubuntu-latest
240241
steps:
241242
- uses: actions/checkout@v3
243+
with:
244+
lfs: true
245+
submodules: true
242246
- uses: actions/cache@v3
243247
with:
244-
path: example_training_data
245-
key: training_data
248+
path: datasets
249+
key: ${{ env.datasets-cache-key }}
246250
- uses: actions/setup-node@v3
247251
with:
248252
node-version: ${{ env.node_version }}
@@ -252,14 +256,17 @@ jobs:
252256
- run: ./with_server npm --workspace=./discojs/discojs-node test
253257

254258
test-lib-web:
255-
needs: [build-lib-core, build-server-docker, download-training-data]
259+
needs: [build-lib-core, build-server-docker, download-datasets]
256260
runs-on: ubuntu-latest
257261
steps:
258262
- uses: actions/checkout@v3
263+
with:
264+
lfs: true
265+
submodules: true
259266
- uses: actions/cache@v3
260267
with:
261-
path: example_training_data
262-
key: training_data
268+
path: datasets
269+
key: ${{ env.datasets-cache-key }}
263270
- uses: actions/setup-node@v3
264271
with:
265272
node-version: ${{ env.node_version }}
@@ -269,14 +276,17 @@ jobs:
269276
- run: ./with_server npm --workspace=./discojs/discojs-web test
270277

271278
test-server:
272-
needs: [build-lib-core, build-lib-node, download-training-data]
279+
needs: [build-lib-core, build-lib-node, download-datasets]
273280
runs-on: ubuntu-latest
274281
steps:
275282
- uses: actions/checkout@v3
283+
with:
284+
lfs: true
285+
submodules: true
276286
- uses: actions/cache@v3
277287
with:
278-
path: example_training_data
279-
key: training_data
288+
path: datasets
289+
key: ${{ env.datasets-cache-key }}
280290
- uses: actions/setup-node@v3
281291
with:
282292
node-version: ${{ env.node_version }}
@@ -286,14 +296,17 @@ jobs:
286296
- run: npm --workspace=./server test
287297

288298
test-web-client:
289-
needs: [build-lib-core, build-lib-web, download-training-data]
299+
needs: [build-lib-core, build-lib-web, download-datasets]
290300
runs-on: ubuntu-latest
291301
steps:
292302
- uses: actions/checkout@v3
303+
with:
304+
lfs: true
305+
submodules: true
293306
- uses: actions/cache@v3
294307
with:
295-
path: example_training_data
296-
key: training_data
308+
path: datasets
309+
key: ${{ env.datasets-cache-key }}
297310
- uses: actions/setup-node@v3
298311
with:
299312
node-version: ${{ env.node_version }}
@@ -310,15 +323,17 @@ jobs:
310323
config: baseUrl=http://localhost:8081/#/
311324

312325
test-cli:
313-
needs:
314-
[build-lib-core, build-lib-node, build-server, download-training-data]
326+
needs: [build-lib-core, build-lib-node, build-server, download-datasets]
315327
runs-on: ubuntu-latest
316328
steps:
317329
- uses: actions/checkout@v3
330+
with:
331+
lfs: true
332+
submodules: true
318333
- uses: actions/cache@v3
319334
with:
320-
path: example_training_data
321-
key: training_data
335+
path: datasets
336+
key: ${{ env.datasets-cache-key }}
322337
- uses: actions/setup-node@v3
323338
with:
324339
node-version: ${{ env.node_version }}
@@ -328,15 +343,17 @@ jobs:
328343
- run: npm --workspace=./cli start -- -t cifar10 -u 1 -e 1
329344

330345
test-docs-examples:
331-
needs:
332-
[build-lib-core, build-lib-node, build-server, download-training-data]
346+
needs: [build-lib-core, build-lib-node, build-server, download-datasets]
333347
runs-on: ubuntu-latest
334348
steps:
335349
- uses: actions/checkout@v3
350+
with:
351+
lfs: true
352+
submodules: true
336353
- uses: actions/cache@v3
337354
with:
338-
path: example_training_data
339-
key: training_data
355+
path: datasets
356+
key: ${{ env.datasets-cache-key }}
340357
- uses: actions/setup-node@v3
341358
with:
342359
node-version: ${{ env.node_version }}

.gitignore

+5-144
Original file line numberDiff line numberDiff line change
@@ -1,150 +1,11 @@
1-
# Byte-compiled / optimized / DLL files
2-
__pycache__/
3-
*.py[cod]
4-
*$py.class
1+
# dependencies
2+
/node_modules/
53

6-
# C extensions
7-
*.so
8-
9-
# Distribution / packaging
10-
.Python
11-
build/
12-
develop-eggs/
4+
# built
135
dist/
14-
downloads/
15-
eggs/
16-
.eggs/
17-
lib/
18-
lib64/
19-
parts/
20-
sdist/
21-
var/
22-
wheels/
23-
pip-wheel-metadata/
24-
share/python-wheels/
25-
*.egg-info/
26-
.installed.cfg
27-
*.egg
28-
MANIFEST
29-
30-
# PyInstaller
31-
# Usually these files are written by a python script from a template
32-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
33-
*.manifest
34-
*.spec
35-
36-
# Installer logs
37-
pip-log.txt
38-
pip-delete-this-directory.txt
39-
40-
# Unit test / coverage reports
41-
htmlcov/
42-
.tox/
43-
.nox/
44-
.coverage
45-
.coverage.*
46-
.cache
47-
nosetests.xml
48-
coverage.xml
49-
*.cover
50-
*.py,cover
51-
.hypothesis/
52-
.pytest_cache/
53-
54-
# Translations
55-
*.mo
56-
*.pot
57-
58-
# Django stuff:
59-
*.log
60-
local_settings.py
61-
db.sqlite3
62-
db.sqlite3-journal
63-
64-
# Flask stuff:
65-
instance/
66-
.webassets-cache
67-
68-
# Scrapy stuff:
69-
.scrapy
70-
71-
# Sphinx documentation
72-
docs/_build/
73-
74-
# PyBuilder
75-
target/
76-
77-
# Jupyter Notebook
78-
.ipynb_checkpoints
79-
80-
# IPython
81-
profile_default/
82-
ipython_config.py
83-
84-
# pyenv
85-
.python-version
86-
87-
# pipenv
88-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
90-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
91-
# install all needed dependencies.
92-
#Pipfile.lock
93-
94-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
95-
__pypackages__/
96-
97-
# Celery stuff
98-
celerybeat-schedule
99-
celerybeat.pid
1006

101-
# SageMath parsed files
102-
*.sage.py
103-
104-
# Environments
105-
.env
106-
.venv
107-
env/
108-
venv/
109-
ENV/
110-
env.bak/
111-
venv.bak/
112-
113-
# Spyder project settings
114-
.spyderproject
115-
.spyproject
116-
117-
# Rope project settings
118-
.ropeproject
119-
120-
# mkdocs documentation
121-
/site
122-
123-
# mypy
124-
.mypy_cache/
125-
.dmypy.json
126-
dmypy.json
127-
128-
# Pyre type checker
129-
.pyre/
130-
131-
132-
UI/public/.DS_Store
133-
UI/.DS_Store
134-
135-
*.DS_Store
136-
137-
node_modules/
138-
139-
# model files on server
140-
weights.bin
141-
model.json
142-
143-
# example training data
144-
example_training_data/
145-
example_training_data.tar.gz
146-
147-
# IDE files
7+
# system-specific files
1488
.metals/
1499
.idea/
15010
.vscode/
11+
*.DS_Store

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "datasets/wikitext"]
2+
path = datasets/wikitext
3+
url = https://huggingface.co/datasets/wikitext

DEV.md

+7-2
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,13 @@ npm --version
7575

7676
**3.** Clone the repository
7777

78+
As we are using some huge files in the repository, you'll need to install [`git-lfs`](https://git-lfs.com).
79+
Be sure to run `git lfs install` after installing it.
80+
81+
We are using git submodules, so you'll need to clone these as well.
82+
7883
```
79-
git clone git@github.com:epfml/disco.git
84+
git clone --recurse-submodules git@github.com:epfml/disco.git
8085
cd disco
8186
```
8287

@@ -101,7 +106,7 @@ npm -ws run build
101106
**6.** Download and extract the sample training datasets. These datasets are used in the automated tests.
102107

103108
```
104-
./get_training_data.sh
109+
./datasets/populate
105110
```
106111

107112
**7.** Launch DISCO

cli/src/data.ts

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ function filesFromFolder (dir: string, folder: string, fractionToKeep: number):
1212
}
1313

1414
async function simplefaceData (task: Task): Promise<data.DataSplit> {
15-
const dir = '../example_training_data/simple_face/'
15+
const dir = '../datasets/simple_face/'
1616
const youngFolders = ['child']
1717
const oldFolders = ['adult']
1818

@@ -39,15 +39,15 @@ async function simplefaceData (task: Task): Promise<data.DataSplit> {
3939
}
4040

4141
async function cifar10Data (cifar10: Task): Promise<data.DataSplit> {
42-
const dir = '../example_training_data/CIFAR10/'
42+
const dir = '../datasets/CIFAR10/'
4343
const files = (await fs_promises.readdir(dir)).map((file) => path.join(dir, file))
4444
const labels = Range(0, 24).map((label) => (label % 10).toString()).toArray()
4545

4646
return await new NodeImageLoader(cifar10).loadAll(files, { labels })
4747
}
4848

4949
async function titanicData (titanic: Task): Promise<data.DataSplit> {
50-
const dir = '../example_training_data/titanic_train.csv'
50+
const dir = '../datasets/titanic_train.csv'
5151

5252
const data = await (new NodeTabularLoader(titanic, ',').loadAll(
5353
['file://'.concat(dir)],

0 commit comments

Comments
 (0)