Commit 77722cb

fix(bin.synthesize): correctly handle boolean arguments
Previously, e.g. `--use_cuda false` would actually set use_cuda=True: coqui-ai#3762
1 parent a682fa8 commit 77722cb

16 files changed: +57 −80 lines changed
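Note (not part of the diff): `argparse` applies `type=bool` to the raw command-line string, and every non-empty string is truthy, so `--use_cuda false` parsed as `True`. A minimal reproduction of the old behavior:

```python
import argparse

# Old pattern used throughout the CLI scripts: argparse calls bool() on the
# raw string, and bool("false") is True, so the flag could never be disabled
# from the command line by passing a value.
parser = argparse.ArgumentParser()
parser.add_argument("--use_cuda", type=bool, default=False)

print(parser.parse_args(["--use_cuda", "false"]).use_cuda)  # True  (the bug)
print(parser.parse_args([]).use_cuda)                       # False (default)
```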

TTS/bin/compute_attention_masks.py

+2 −2

@@ -35,7 +35,7 @@
  --data_path /root/LJSpeech-1.1/
  --batch_size 32
  --dataset ljspeech
- --use_cuda True
+ --use_cuda
  """,
  formatter_class=RawTextHelpFormatter,
  )
@@ -62,7 +62,7 @@
  help="Dataset metafile inclusing file paths with transcripts.",
  )
  parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
- parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")

  parser.add_argument(
  "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."

TTS/bin/compute_embeddings.py

+1 −1

@@ -150,7 +150,7 @@ def compute_embeddings(
  default=False,
  action="store_true",
  )
- parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+ parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
  parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
  parser.add_argument(
  "--formatter_name",

TTS/bin/eval_encoder.py

+2 −2

@@ -75,8 +75,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):
  type=str,
  help="Path to dataset config file.",
  )
- parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)

  args = parser.parse_args()

TTS/bin/extract_tts_spectrograms.py

+1 −1

@@ -282,7 +282,7 @@ def main(args): # pylint: disable=redefined-outer-name
  parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
  parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
  parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
  args = parser.parse_args()

  c = load_config(args.config_path)

TTS/bin/remove_silence_using_vad.py

+5 −5

@@ -80,7 +80,7 @@ def preprocess_audios():
  setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())

  parser = argparse.ArgumentParser(
- description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
  )
  parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
  parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
@@ -95,20 +95,20 @@ def preprocess_audios():
  parser.add_argument(
  "-t",
  "--trim_just_beginning_and_end",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=True,
- help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
+ help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
  )
  parser.add_argument(
  "-c",
  "--use_cuda",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=False,
  help="If True use cuda",
  )
  parser.add_argument(
  "--use_onnx",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=False,
  help="If True use onnx",
  )
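For default-`True` options like `--trim_just_beginning_and_end`, the practical change is on the command line: previously `--trim_just_beginning_and_end False` was silently treated as `True`; now the auto-generated negative form is the way to disable trimming. A sketch using the script's own flag names (standard `argparse` behavior; the real parser is defined above):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-t",
    "--trim_just_beginning_and_end",
    action=argparse.BooleanOptionalAction,
    default=True,
)

print(parser.parse_args([]).trim_just_beginning_and_end)      # True (default)
print(parser.parse_args(["-t"]).trim_just_beginning_and_end)  # True
# Only the long option gains a --no- counterpart for switching it off:
print(parser.parse_args(["--no-trim_just_beginning_and_end"]).trim_just_beginning_and_end)  # False
```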

TTS/bin/synthesize.py

+19 −38

@@ -1,5 +1,6 @@
  #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
+
+ """Command line interface."""

  import argparse
  import contextlib
@@ -136,30 +137,16 @@
  """


- def str2bool(v):
- if isinstance(v, bool):
- return v
- if v.lower() in ("yes", "true", "t", "y", "1"):
- return True
- if v.lower() in ("no", "false", "f", "n", "0"):
- return False
- raise argparse.ArgumentTypeError("Boolean value expected.")
-
-
- def main():
- setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
-
+ def parse_args() -> argparse.Namespace:
+ """Parse arguments."""
  parser = argparse.ArgumentParser(
  description=description.replace(" ```\n", ""),
  formatter_class=RawTextHelpFormatter,
  )

  parser.add_argument(
  "--list_models",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  help="list available pre-trained TTS and vocoder models.",
  )

@@ -207,7 +194,7 @@ def main():
  default="tts_output.wav",
  help="Output wav file path.",
  )
- parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
+ parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
  parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
  parser.add_argument(
  "--vocoder_path",
@@ -226,10 +213,7 @@ def main():
  parser.add_argument(
  "--pipe_out",
  help="stdout the generated TTS wav file for shell pipe.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )

  # args for multi-speaker synthesis
@@ -261,25 +245,18 @@ def main():
  parser.add_argument(
  "--list_speaker_idxs",
  help="List available speaker ids for the defined multi-speaker model.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )
  parser.add_argument(
  "--list_language_idxs",
  help="List available language ids for the defined multi-lingual model.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )
  # aux args
  parser.add_argument(
  "--save_spectogram",
- type=bool,
- help="If true save raw spectogram for further (vocoder) processing in out_path.",
- default=False,
+ action="store_true",
+ help="Save raw spectogram for further (vocoder) processing in out_path.",
  )
  parser.add_argument(
  "--reference_wav",
@@ -295,8 +272,8 @@ def main():
  )
  parser.add_argument(
  "--progress_bar",
- type=str2bool,
- help="If true shows a progress bar for the model download. Defaults to True",
+ action=argparse.BooleanOptionalAction,
+ help="Show a progress bar for the model download.",
  default=True,
  )

@@ -337,19 +314,23 @@ def main():
  ]
  if not any(check_args):
  parser.parse_args(["-h"])
+ return args
+
+
+ def main():
+ setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+ args = parse_args()

  pipe_out = sys.stdout if args.pipe_out else None

  with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
  # Late-import to make things load faster
- from TTS.api import TTS
  from TTS.utils.manage import ModelManager
  from TTS.utils.synthesizer import Synthesizer

  # load model manager
  path = Path(__file__).parent / "../.models.json"
  manager = ModelManager(path, progress_bar=args.progress_bar)
- api = TTS()

  tts_path = None
  tts_config_path = None
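`synthesize.py` previously shipped its own `str2bool` helper combined with `nargs="?"`/`const=True`, so flags like `--list_models` could be given either bare or with a value. With `action="store_true"` they are bare switches only, while `--progress_bar` keeps its `True` default via `BooleanOptionalAction`, so `--no-progress_bar` turns the download progress bar off. A sketch mirroring the new definitions (standard `argparse`, not the full CLI):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--list_models", action="store_true")
parser.add_argument("--progress_bar", action=argparse.BooleanOptionalAction, default=True)

args = parser.parse_args(["--list_models", "--no-progress_bar"])
print(args.list_models, args.progress_bar)  # True False

# Old-style invocations such as `tts --list_models true` are no longer
# accepted; use the bare flag instead.
```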

TTS/encoder/README.md

+1 −1

@@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.

  - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
  - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
  - Watch training on Tensorboard as in TTS

TTS/server/README.md

+1 −1

@@ -15,7 +15,7 @@ Run the server with the official models.
  ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```

  Run the server with the official models on a GPU.
- ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
+ ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda```

  Run the server with a custom models.
  ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```

docs/source/docker_images.md

+2 −2

@@ -32,7 +32,7 @@ For the GPU version, you need to have the latest NVIDIA drivers installed.
  With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8

  ```bash
- docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
+ docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda
  ```

  ## Start a server
@@ -50,7 +50,7 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
  ```bash
  docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
  python3 TTS/server/server.py --list_models #To get the list of available models
- python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
+ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda
  ```

  Click [there](http://[::1]:5002/) and have fun with the server!

docs/source/models/bark.md

+2 −4

@@ -69,14 +69,12 @@ tts --model_name tts_models/multilingual/multi-dataset/bark \
  --text "This is an example." \
  --out_path "output.wav" \
  --voice_dir bark_voices/ \
- --speaker_idx "ljspeech" \
- --progress_bar True
+ --speaker_idx "ljspeech"

  # Random voice generation
  tts --model_name tts_models/multilingual/multi-dataset/bark \
  --text "This is an example." \
- --out_path "output.wav" \
- --progress_bar True
+ --out_path "output.wav"
  ```

docs/source/models/tortoise.md

+2 −4

@@ -57,14 +57,12 @@ tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
  --text "This is an example." \
  --out_path "output.wav" \
  --voice_dir path/to/tortoise/voices/dir/ \
- --speaker_idx "lj" \
- --progress_bar True
+ --speaker_idx "lj"

  # Random voice generation
  tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
  --text "This is an example." \
- --out_path "output.wav" \
- --progress_bar True
+ --out_path "output.wav"
  ```

docs/source/models/xtts.md

+4 −4

@@ -72,7 +72,7 @@ You can do inference using one of the available speakers using the following com
  --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." \
  --speaker_idx "Ana Florence" \
  --language_idx en \
- --use_cuda true
+ --use_cuda
  ```

  ##### Clone a voice
@@ -85,7 +85,7 @@ You can clone a speaker voice using a single or multiple references:
  --text "Bugün okula gitmek istemiyorum." \
  --speaker_wav /path/to/target/speaker.wav \
  --language_idx tr \
- --use_cuda true
+ --use_cuda
  ```

  ###### Multiple references
@@ -94,7 +94,7 @@ You can clone a speaker voice using a single or multiple references:
  --text "Bugün okula gitmek istemiyorum." \
  --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
  --language_idx tr \
- --use_cuda true
+ --use_cuda
  ```
  or for all wav files in a directory you can use:

@@ -103,7 +103,7 @@ or for all wav files in a directory you can use:
  --text "Bugün okula gitmek istemiyorum." \
  --speaker_wav /path/to/target/*.wav \
  --language_idx tr \
- --use_cuda true
+ --use_cuda
  ```

  #### 🐸TTS API

recipes/ljspeech/fast_pitch/train_fast_pitch.py

+1 −1

@@ -65,7 +65,7 @@
  model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
  # TODO: make compute_attention python callable
  os.system(
- f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
  )

  # INITIALIZE THE AUDIO PROCESSOR

recipes/ljspeech/fast_speech/train_fast_speech.py

+1 −1

@@ -64,7 +64,7 @@
  model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
  # TODO: make compute_attention python callable
  os.system(
- f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
  )

  # INITIALIZE THE AUDIO PROCESSOR

recipes/ljspeech/fastspeech2/train_fastspeech2.py

+1 −1

@@ -67,7 +67,7 @@
  model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
  # TODO: make compute_attention python callable
  os.system(
- f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
  )

  # INITIALIZE THE AUDIO PROCESSOR
