-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb.sh
executable file
·272 lines (269 loc) · 8.93 KB
/
db.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# Corpus location table (default values).
#
# Set the path of each corpus you intend to use.
#   - "downloads" means the recipe can download the corpus automatically.
#   - An empty value means no location is configured here; fill in the path
#     on your own machine before running a recipe that needs that corpus.
# Site-specific sections further down in this file may override these values.
AIDATATANG_200ZH=downloads
AISHELL=downloads
AISHELL3=downloads
AISHELL4=downloads
ALFFA=downloads
AN4=downloads
DIRHA_ENGLISH_PHDEV=
DIRHA_WSJ=
DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
DNS=
DSING=downloads
WSJ0=
WSJ1=
WSJCAM0=
REVERB=
REVERB_OUT="${PWD}/REVERB" # Output file path
CHIME3=
CHIME4=
CSJDATATOP=
CSJVER=dvd ## Set your CSJ format (dvd or usb).
## Usage :
## Case DVD : We assume the CSJ DVDs are copied into this directory with the names dvd1, dvd2,...,dvd17.
## The required directories are dvd3 - dvd17.
## e.g. $ ls $CSJDATATOP(DVD) => 00README.txt dvd1 dvd2 ... dvd17
##
## Case USB : The required directories are MORPH/SDB and WAV.
## e.g. $ ls $CSJDATATOP(USB) => 00README.txt DOC MORPH ... WAV fileList.csv
## Case merl : MERL setup. The required directories are WAV and sdb.
CSMSC=downloads
CSS10=
HKUST1=
HKUST2=
HUI_ACG=downloads
HUB4_SPANISH=
LABOROTV=
TEDXJP=
LIBRISPEECH=
LIBRILIGHT_LIMITED=
FSC=
SLURP=
VOXCELEB=
MINI_LIBRISPEECH=downloads
MISP2021=
LIBRIMIX=downloads
LIBRITTS=
LJSPEECH=downloads
NSC=
JMD=downloads
JSSS=downloads
JSUT=downloads
JTUBESPEECH=downloads
JVS=downloads
KSS=
SNIPS= # smart-light-en-closed-field data path
SPGISPEECH=
SWBD=
SWBD_NXT=
THCHS30=downloads
# Resolved relative to the current working directory at the time this file is sourced.
TIMIT=$(realpath ../../../../TIMIT)
TSUKUYOMI=downloads
VOXFORGE=downloads
AMI=
COMMONVOICE=downloads
BABEL_101=
BABEL_102=
BABEL_103=
BABEL_104=
BABEL_105=
BABEL_106=
BABEL_107=
BABEL_201=
BABEL_202=
BABEL_203=
BABEL_204=
BABEL_205=
BABEL_206=
BABEL_207=
BABEL_301=
BABEL_302=
BABEL_303=
BABEL_304=
BABEL_305=
BABEL_306=
BABEL_307=
BABEL_401=
BABEL_402=
BABEL_403=
BABEL_404=
PUEBLA_NAHUATL=downloads
TEDLIUM2=downloads
TEDLIUM3=downloads
VCTK=downloads
VIVOS=downloads
YESNO=downloads
YOLOXOCHITL_MIXTEC=downloads
HOW2_TEXT=downloads/how2-300h-v1
HOW2_FEATS=downloads/fbank_pitch_181516
ZEROTH_KOREAN=downloads
JAVA=downloads
RU_OPEN_STT=downloads
RUSLAN=downloads
SIWIS=downloads
GIGASPEECH=/mnt/data/GigaSpeech/
GOOGLEI18N=downloads
NOISY_SPEECH=
NOISY_REVERBERANT_SPEECH=
LRS2=
LRS3=
SUNDA=downloads
CMU_ARCTIC=downloads
CMU_INDIC=downloads
INDIC_SPEECH=downloads
IWSLT22_DIALECT=
JKAC=
MUCS_SUBTASK1=downloads
MUCS_SUBTASK2=downloads
GAMAYUN=downloads
IWSLT21LR=downloads/iwslt21
JDCINAL=downloads
GRABO=downloads
WENETSPEECH=
SPEECHCOMMANDS=downloads
TOTONAC=downloads
PRIMEWORDS_CHINESE=downloads
SEAME=
BENGALI=downloads
IWSLT14=
# Declared (empty) so the variable is always defined, even on hosts where no
# site-specific section assigns it; otherwise reading it under `set -u` fails.
FISHER_CALLHOME_SPANISH=
# CMU TIR environment only: these assignments take effect when the host name
# reported by `hostname` begins with "tir"; on any other host nothing is changed.
case "$(hostname)" in
    tir*)
        # BABEL language packs
        BABEL_101=/projects/tir5/data/speech_corpora/babel/IARPA_BABEL_BP_101/
        BABEL_102=/projects/tir5/data/speech_corpora/babel/BABEL_OP1_102/
        BABEL_103=/projects/tir5/data/speech_corpora/babel/BABEL_OP1_103/
        BABEL_104=/projects/tir5/data/speech_corpora/babel/BABEL_BP_104/
        BABEL_105=/projects/tir5/data/speech_corpora/babel/IARPA-babel105b-v0.5-build/BABEL_BP_105/
        BABEL_106=/projects/tir5/data/speech_corpora/babel/BABEL_BP_106/
        BABEL_107=/projects/tir5/data/speech_corpora/babel/BABEL_BP_107/
        BABEL_201=/projects/tir5/data/speech_corpora/babel/IARPA-babel201b-v0.2b.build/BABEL_OP1_201/
        BABEL_202=/projects/tir5/data/speech_corpora/babel/IARPA-babel202b-v1.0d-build/BABEL_OP2_202/
        BABEL_203=/projects/tir5/data/speech_corpora/babel/IARPA-babel203b-v3.1a-build/
        BABEL_204=/projects/tir5/data/speech_corpora/babel/BABEL_OP1_204/
        BABEL_205=/projects/tir5/data/speech_corpora/babel/IARPA-babel205b-v1.0a-build/BABEL_OP2_205/
        BABEL_206=/projects/tir5/data/speech_corpora/babel/BABEL_OP1_206/
        BABEL_207=/projects/tir5/data/speech_corpora/babel/IARPA-babel207b-v1.0e-build/BABEL_OP2_207/
        BABEL_301=/projects/tir5/data/speech_corpora/babel/IARPA-babel301b-v2.0b-build/BABEL_OP2_301/
        BABEL_302=/projects/tir5/data/speech_corpora/babel/IARPA-babel302b-v1.0a-build/BABEL_OP2_302/
        BABEL_303=/projects/tir5/data/speech_corpora/babel/IARPA-babel303b-v1.0a/BABEL_OP2_303/
        BABEL_304=/projects/tir5/data/speech_corpora/babel/IARPA-babel304b-v1.0b/BABEL_OP2_304/
        BABEL_305=/projects/tir5/data/speech_corpora/babel/IARPA-babel305b-v1.0c-build/BABEL_OP3_305/
        BABEL_306=/projects/tir5/data/speech_corpora/babel/IARPA-babel306b-v2.0c-build/BABEL_OP3_306/
        BABEL_307=/projects/tir5/data/speech_corpora/babel/IARPA-babel307b-v1.0b-build/BABEL_OP3_307/
        BABEL_401=/projects/tir5/data/speech_corpora/babel/IARPA-babel401b-v2.0b-build/BABEL_OP3_401/
        BABEL_402=/projects/tir5/data/speech_corpora/babel/IARPA-babel402b-v1.0b-build/BABEL_OP3_402/
        BABEL_403=/projects/tir5/data/speech_corpora/babel/IARPA-babel403b-v1.0b-build/BABEL_OP3_403/
        BABEL_404=/projects/tir5/data/speech_corpora/babel/IARPA_BABEL_OP3_404/
        # Other corpora stored under /projects/tir5/data
        GRABO=/projects/tir5/data/speech_corpora/Grabo
        IWSLT14=/projects/tir5/data/iwslt14
        IWSLT22_DIALECT=/projects/tir5/data/speech_corpora/LDC2022E01_IWSLT22_Tunisian_Arabic_Shared_Task_Training_Data/
        PRIMEWORDS_CHINESE=/projects/tir5/data/speech_corpora/Primewords_Chinese
        FISHER_CALLHOME_SPANISH=/projects/tir5/data/speech_corpora/fisher_callhome_spanish
        DSING=/projects/tir5/data/speech_corpora/sing_300x30x2
        ;;
esac
# JHU environment only: these assignments take effect when the DNS domain
# reported by `hostname -d` is exactly "clsp.jhu.edu".
# stderr of the probe is discarded: `hostname -d` prints a diagnostic when the
# domain cannot be resolved or the option is unsupported, and sourcing this
# file should stay silent on such hosts (the comparison then simply fails).
if [[ "$(hostname -d 2>/dev/null)" == clsp.jhu.edu ]]; then
    AIDATATANG_200ZH=downloads
    AISHELL=
    AISHELL3=downloads
    ALFFA=downloads
    AN4=
    DIRHA_ENGLISH_PHDEV=
    DIRHA_WSJ=
    DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
    DNS=
    WSJ0=
    WSJ1=
    WSJCAM0=/export/corpora3/LDC/LDC95S24/wsjcam0
    REVERB=/export/corpora5/REVERB_2014/REVERB
    REVERB_OUT="${PWD}/REVERB" # Output file path
    CHIME3=
    CHIME4=
    CSJDATATOP=/export/corpora5/CSJ/USB
    CSJVER=usb ## Set your CSJ format (dvd or usb).
    ## Usage :
    ## Case DVD : We assume the CSJ DVDs are copied into this directory with the names dvd1, dvd2,...,dvd17.
    ## The required directories are dvd3 - dvd17.
    ## e.g. $ ls $CSJDATATOP(DVD) => 00README.txt dvd1 dvd2 ... dvd17
    ##
    ## Case USB : The required directories are MORPH/SDB and WAV.
    ## e.g. $ ls $CSJDATATOP(USB) => 00README.txt DOC MORPH ... WAV fileList.csv
    ## Case merl : MERL setup. The required directories are WAV and sdb.
    CSMSC=downloads
    CSS10=
    HKUST1=
    HKUST2=
    HUI_ACG=downloads
    HUB4_SPANISH=
    LABOROTV=
    TEDXJP=
    LIBRISPEECH=
    FSC=
    SNIPS= # smart-light-en-closed-field data path
    SLURP=
    MINI_LIBRISPEECH=downloads
    LIBRITTS=
    LJSPEECH=downloads
    JMD=downloads
    JSSS=downloads
    JSUT=downloads
    JVS=downloads
    KSS=
    THCHS30=downloads
    TIMIT=
    TSUKUYOMI=downloads
    VOXFORGE=
    AMI=/export/corpora4/ami/amicorpus
    COMMONVOICE=downloads
    BABEL_101=/export/babel/data/101-cantonese
    BABEL_102=/export/babel/data/102-assamese
    BABEL_103=/export/babel/data/103-bengali
    BABEL_104=/export/babel/data/104-pashto
    BABEL_105=/export/babel/data/105-turkish
    BABEL_106=/export/babel/data/106-tagalog
    BABEL_107=/export/babel/data/107-vietnamese
    BABEL_201=/export/babel/data/201-haitian
    BABEL_202=/export/babel/data/202-swahili/IARPA-babel202b-v1.0d-build/BABEL_OP2_202
    BABEL_203=/export/babel/data/203-lao
    BABEL_204=/export/babel/data/204-tamil
    BABEL_205=/export/babel/data/205-kurmanji/IARPA-babel205b-v1.0a-build/BABEL_OP2_205
    BABEL_206=/export/babel/data/206-zulu
    BABEL_207=/export/babel/data/207-tokpisin/IARPA-babel207b-v1.0e-build/BABEL_OP2_207
    BABEL_301=/export/babel/data/301-cebuano/IARPA-babel301b-v2.0b-build/BABEL_OP2_301
    BABEL_302=/export/babel/data/302-kazakh/IARPA-babel302b-v1.0a-build/BABEL_OP2_302
    BABEL_303=/export/babel/data/303-telugu/IARPA-babel303b-v1.0a-build/BABEL_OP2_303
    BABEL_304=/export/babel/data/304-lithuanian/IARPA-babel304b-v1.0b-build/BABEL_OP2_304
    BABEL_305=/export/babel/data/305-guarani/IARPA-babel305b-v1.0b-build/BABEL_OP3_305
    BABEL_306=/export/babel/data/306-igbo/IARPA-babel306b-v2.0c-build/BABEL_OP3_306
    BABEL_307=/export/babel/data/307-amharic/IARPA-babel307b-v1.0b-build/BABEL_OP3_307
    BABEL_401=/export/babel/data/401-mongolian/IARPA-babel401b-v2.0b-build/BABEL_OP3_401
    BABEL_402=/export/babel/data/402-javanese/IARPA-babel402b-v1.0b-build/BABEL_OP3_402
    BABEL_403=/export/babel/data/403-dholuo/IARPA-babel403b-v1.0b-build/BABEL_OP3_403
    BABEL_404=/export/corpora/LDC/LDC2016S12/IARPA_BABEL_OP3_404
    PUEBLA_NAHUATL=
    TEDLIUM2=downloads
    TEDLIUM3=downloads
    VCTK=downloads
    VIVOS=
    YESNO=
    YOLOXOCHITL_MIXTEC=downloads
    HOW2_TEXT=
    HOW2_FEATS=
    ZEROTH_KOREAN=downloads
    LRS2=
    JAVA=
    BENGALI=
    RU_OPEN_STT=downloads
    RUSLAN=downloads
    SIWIS=downloads
    SUNDA=
    CMU_INDIC=
    INDIC_SPEECH=
    JKAC=
    MUCS_SUBTASK1=downloads
    MUCS_SUBTASK2=downloads
    GAMAYUN=downloads
    IWSLT21LR=downloads/iwslt21
    TOTONAC=downloads
    GOOGLEI18N=downloads
fi