
Commit aec0ddc

datasets: initial

1 parent 050a4b4 commit aec0ddc

16 files changed, +109 -208 lines changed

.github/workflows/lint-test-build.yml (+49 -33)
```diff
@@ -8,16 +8,18 @@ env:
   node_version: 16

 jobs:
-  download-training-data:
+  download-datasets:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
-      - run: ./get_training_data.sh
-        working-directory: ./
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
+      - run: ./datasets/populate

   lint-lib-core:
     needs: [build-lib-core, build-lib-node]
@@ -213,19 +215,17 @@ jobs:

   test-lib-core:
     needs:
-      [
-        build-lib-core,
-        build-lib-node,
-        build-server-docker,
-        download-training-data,
-      ]
+      [build-lib-core, build-lib-node, build-server-docker, download-datasets]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
       - uses: actions/setup-node@v3
         with:
           node-version: ${{ env.node_version }}
@@ -235,14 +235,17 @@ jobs:
       - run: ./with_server npm --workspace=./discojs/discojs-core test

   test-lib-node:
-    needs: [build-lib-core, build-server-docker, download-training-data]
+    needs: [build-lib-core, build-server-docker, download-datasets]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
       - uses: actions/setup-node@v3
         with:
           node-version: ${{ env.node_version }}
@@ -252,14 +255,17 @@ jobs:
       - run: ./with_server npm --workspace=./discojs/discojs-node test

   test-lib-web:
-    needs: [build-lib-core, build-server-docker, download-training-data]
+    needs: [build-lib-core, build-server-docker, download-datasets]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
       - uses: actions/setup-node@v3
         with:
           node-version: ${{ env.node_version }}
@@ -269,14 +275,17 @@ jobs:
       - run: ./with_server npm --workspace=./discojs/discojs-web test

   test-server:
-    needs: [build-lib-core, build-lib-node, download-training-data]
+    needs: [build-lib-core, build-lib-node, download-datasets]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
       - uses: actions/setup-node@v3
         with:
           node-version: ${{ env.node_version }}
@@ -286,14 +295,17 @@ jobs:
       - run: npm --workspace=./server test

   test-web-client:
-    needs: [build-lib-core, build-lib-web, download-training-data]
+    needs: [build-lib-core, build-lib-web, download-datasets]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
       - uses: actions/setup-node@v3
         with:
           node-version: ${{ env.node_version }}
@@ -310,15 +322,17 @@ jobs:
         config: baseUrl=http://localhost:8081/#/

   test-cli:
-    needs:
-      [build-lib-core, build-lib-node, build-server, download-training-data]
+    needs: [build-lib-core, build-lib-node, build-server, download-datasets]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
       - uses: actions/setup-node@v3
         with:
           node-version: ${{ env.node_version }}
@@ -328,15 +342,17 @@ jobs:
       - run: npm --workspace=./cli start -- -t cifar10 -u 1 -e 1

   test-docs-examples:
-    needs:
-      [build-lib-core, build-lib-node, build-server, download-training-data]
+    needs: [build-lib-core, build-lib-node, build-server, download-datasets]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          lfs: true
+          submodules: true
       - uses: actions/cache@v3
         with:
-          path: example_training_data
-          key: training_data
+          path: datasets
+          key: datasets-${{ hashFiles('datasets/**') }}
       - uses: actions/setup-node@v3
         with:
           node-version: ${{ env.node_version }}
```
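Every job that previously depended on `download-training-data` now checks out with `lfs: true` and `submodules: true` and restores the cache under the key `datasets-${{ hashFiles('datasets/**') }}`. Unlike the old fixed `training_data` key, this key changes whenever any file under `datasets/` changes, so a stale cache is never reused. A conceptual TypeScript sketch of what such a content hash computes (illustration only, not GitHub's exact `hashFiles` implementation):

```ts
// Conceptual sketch of hashFiles('datasets/**'): one digest over the
// contents of every file under datasets/, walked in sorted order so the
// result is deterministic. Illustration only, not GitHub's implementation.
import { createHash } from 'crypto'
import { promises as fs } from 'fs'
import path from 'path'

async function hashTree (root: string): Promise<string> {
  const hash = createHash('sha256')
  async function walk (dir: string): Promise<void> {
    const entries = (await fs.readdir(dir, { withFileTypes: true }))
      .sort((a, b) => a.name.localeCompare(b.name))
    for (const entry of entries) {
      const full = path.join(dir, entry.name)
      if (entry.isDirectory()) {
        await walk(full)
      } else {
        hash.update(await fs.readFile(full))
      }
    }
  }
  await walk(root)
  return hash.digest('hex')
}

hashTree('datasets')
  .then((digest) => console.log(`cache key: datasets-${digest}`))
  .catch(console.error)
```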

.gitignore (+5 -144)
```diff
@@ -1,150 +1,11 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
+# dependencies
+/node_modules/

-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
+# built
 dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid

-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-
-UI/public/.DS_Store
-UI/.DS_Store
-
-*.DS_Store
-
-node_modules/
-
-# model files on server
-weights.bin
-model.json
-
-# example training data
-example_training_data/
-example_training_data.tar.gz
-
-# IDE files
+# system specifics files
 .metals/
 .idea/
 .vscode/
+*.DS_Store
```

DEV.md (+1 -1)
````diff
@@ -101,7 +101,7 @@ npm -ws run build
 **6.** Download and extract the sample training datasets. These datasets are used in the automated tests.

 ```
-./get_training_data.sh
+./datasets/populate
 ```

 **7.** Launch DISCO
````

cli/src/data.ts (+3 -3)
```diff
@@ -12,7 +12,7 @@ function filesFromFolder (dir: string, folder: string, fractionToKeep: number):
 }

 async function simplefaceData (task: Task): Promise<data.DataSplit> {
-  const dir = '../example_training_data/simple_face/'
+  const dir = '../datasets/simple_face/'
   const youngFolders = ['child']
   const oldFolders = ['adult']

@@ -39,15 +39,15 @@ async function simplefaceData (task: Task): Promise<data.DataSplit> {
 }

 async function cifar10Data (cifar10: Task): Promise<data.DataSplit> {
-  const dir = '../example_training_data/CIFAR10/'
+  const dir = '../datasets/CIFAR10/'
   const files = (await fs_promises.readdir(dir)).map((file) => path.join(dir, file))
   const labels = Range(0, 24).map((label) => (label % 10).toString()).toArray()

   return await new NodeImageLoader(cifar10).loadAll(files, { labels })
 }

 async function titanicData (titanic: Task): Promise<data.DataSplit> {
-  const dir = '../example_training_data/titanic_train.csv'
+  const dir = '../datasets/titanic_train.csv'

   const data = await (new NodeTabularLoader(titanic, ',').loadAll(
     ['file://'.concat(dir)],
```
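All three loaders now read from `../datasets/` relative to the CLI workspace, so they fail at runtime if the directory has not been populated. As a hedged sketch (not part of this commit), a pre-flight check along these lines could fail fast with a clearer message; the expected entry names are taken from `datasets/.gitignore` below:

```ts
// Hypothetical pre-flight check, not part of this commit: verify the
// datasets the CLI loaders expect before any loader touches the disk.
// Uses only the Node standard library.
import { promises as fs } from 'fs'
import path from 'path'

// entries read by cli/src/data.ts, as listed in datasets/.gitignore
const REQUIRED = ['CIFAR10', 'simple_face', 'titanic_train.csv']

async function assertDatasetsPopulated (root = '../datasets'): Promise<void> {
  for (const entry of REQUIRED) {
    try {
      await fs.access(path.join(root, entry))
    } catch {
      throw new Error(`missing ${entry} under ${root}; run ./datasets/populate first`)
    }
  }
}
```

Calling this at CLI startup would turn a cryptic `ENOENT` from `readdir` into an actionable hint.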

datasets/.gitignore (+16)
```diff
@@ -0,0 +1,16 @@
+# example_training_data.tar.gz
+/2_QAID_1.masked.reshaped.squared.224.png
+/9-mnist-example.png
+/CIFAR10/
+/cifar10-agents
+/cifar10-example.png
+/cifar10-labels.csv
+/simple_face
+/simple_face-example.png
+/titanic_test.csv
+/titanic_train.csv
+/titanic_wrong_number_columns.csv
+/titanic_wrong_passengerID.csv
+
+# wikitext
+/wikitext/
```

datasets/README.md (+5)
```diff
@@ -0,0 +1,5 @@
+# Raw datasets
+
+This directory contains a selection of raw datasets.
+
+Run `./populate` to get all.
```
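The `./datasets/populate` script itself is not included in this diff, so its contents are unknown here; given the `lfs: true` and `submodules: true` added to the checkout steps, it may simply pull LFS objects or submodules. For orientation only, a TypeScript sketch of the archive-download variant such a script could implement (the URL is a placeholder, not the project's real source):

```ts
// Illustration only; the real ./datasets/populate is not shown in this
// commit. Downloads an archive and unpacks it into datasets/.
// ARCHIVE_URL is a placeholder, not the project's actual data source.
import { execFile } from 'child_process'
import { promises as fs } from 'fs'
import { promisify } from 'util'

const run = promisify(execFile)
const ARCHIVE_URL = 'https://example.org/example_training_data.tar.gz' // placeholder

async function populate (dest = 'datasets'): Promise<void> {
  await fs.mkdir(dest, { recursive: true })
  // assumes curl and tar are available, as on the ubuntu-latest runners
  await run('curl', ['-fsSL', '-o', 'datasets.tar.gz', ARCHIVE_URL])
  await run('tar', ['-xzf', 'datasets.tar.gz', '-C', dest, '--strip-components=1'])
}

populate().catch((err) => { console.error(err); process.exit(1) })
```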
