Commit 77722cb

fix(bin.synthesize): correctly handle boolean arguments
Previously, e.g. `--use_cuda false` would actually set use_cuda=True: coqui-ai#3762
1 parent a682fa8 commit 77722cb

16 files changed: +57 −80 lines changed
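Note (not part of the diff): `argparse` applies `type=bool` to the raw command-line string, and every non-empty string is truthy, so `--use_cuda false` parsed as `True`. A minimal reproduction of the old behavior:

```python
import argparse

# Old pattern used throughout the CLI scripts: argparse calls bool() on the
# raw string, and bool("false") is True, so the flag could never be disabled
# from the command line by passing a value.
parser = argparse.ArgumentParser()
parser.add_argument("--use_cuda", type=bool, default=False)

print(parser.parse_args(["--use_cuda", "false"]).use_cuda)  # True  (the bug)
print(parser.parse_args([]).use_cuda)                       # False (default)
```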

TTS/bin/compute_attention_masks.py

+2 −2

@@ -35,7 +35,7 @@
  --data_path /root/LJSpeech-1.1/
  --batch_size 32
  --dataset ljspeech
- --use_cuda True
+ --use_cuda
  """,
  formatter_class=RawTextHelpFormatter,
  )
@@ -62,7 +62,7 @@
  help="Dataset metafile inclusing file paths with transcripts.",
  )
  parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
- parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")

  parser.add_argument(
  "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."

TTS/bin/compute_embeddings.py

+1 −1

@@ -150,7 +150,7 @@ def compute_embeddings(
  default=False,
  action="store_true",
  )
- parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+ parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
  parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
  parser.add_argument(
  "--formatter_name",

TTS/bin/eval_encoder.py

+2 −2

@@ -75,8 +75,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):
  type=str,
  help="Path to dataset config file.",
  )
- parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)

  args = parser.parse_args()

TTS/bin/extract_tts_spectrograms.py

+1 −1

@@ -282,7 +282,7 @@ def main(args): # pylint: disable=redefined-outer-name
  parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
  parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
  parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
  args = parser.parse_args()

  c = load_config(args.config_path)

TTS/bin/remove_silence_using_vad.py

+5 −5

@@ -80,7 +80,7 @@ def preprocess_audios():
  setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())

  parser = argparse.ArgumentParser(
- description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
  )
  parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
  parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
@@ -95,20 +95,20 @@ def preprocess_audios():
  parser.add_argument(
  "-t",
  "--trim_just_beginning_and_end",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=True,
- help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
+ help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
  )
  parser.add_argument(
  "-c",
  "--use_cuda",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=False,
  help="If True use cuda",
  )
  parser.add_argument(
  "--use_onnx",
- type=bool,
+ action=argparse.BooleanOptionalAction,
  default=False,
  help="If True use onnx",
  )
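For default-`True` options like `--trim_just_beginning_and_end`, the practical change is on the command line: previously `--trim_just_beginning_and_end False` was silently treated as `True`; now the auto-generated negative form is the way to disable trimming. A sketch using the script's own flag names (standard `argparse` behavior; the real parser is defined above):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-t",
    "--trim_just_beginning_and_end",
    action=argparse.BooleanOptionalAction,
    default=True,
)

print(parser.parse_args([]).trim_just_beginning_and_end)      # True (default)
print(parser.parse_args(["-t"]).trim_just_beginning_and_end)  # True
# Only the long option gains a --no- counterpart for switching it off:
print(parser.parse_args(["--no-trim_just_beginning_and_end"]).trim_just_beginning_and_end)  # False
```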

TTS/bin/synthesize.py

+19 −38

@@ -1,5 +1,6 @@
  #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
+
+ """Command line interface."""

  import argparse
  import contextlib
@@ -136,30 +137,16 @@
  """


- def str2bool(v):
- if isinstance(v, bool):
- return v
- if v.lower() in ("yes", "true", "t", "y", "1"):
- return True
- if v.lower() in ("no", "false", "f", "n", "0"):
- return False
- raise argparse.ArgumentTypeError("Boolean value expected.")
-
-
- def main():
- setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
-
+ def parse_args() -> argparse.Namespace:
+ """Parse arguments."""
  parser = argparse.ArgumentParser(
  description=description.replace(" ```\n", ""),
  formatter_class=RawTextHelpFormatter,
  )

  parser.add_argument(
  "--list_models",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  help="list available pre-trained TTS and vocoder models.",
  )

@@ -207,7 +194,7 @@ def main():
  default="tts_output.wav",
  help="Output wav file path.",
  )
- parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
+ parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
  parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
  parser.add_argument(
  "--vocoder_path",
@@ -226,10 +213,7 @@ def main():
  parser.add_argument(
  "--pipe_out",
  help="stdout the generated TTS wav file for shell pipe.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )

  # args for multi-speaker synthesis
@@ -261,25 +245,18 @@ def main():
  parser.add_argument(
  "--list_speaker_idxs",
  help="List available speaker ids for the defined multi-speaker model.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )
  parser.add_argument(
  "--list_language_idxs",
  help="List available language ids for the defined multi-lingual model.",
- type=str2bool,
- nargs="?",
- const=True,
- default=False,
+ action="store_true",
  )
  # aux args
  parser.add_argument(
  "--save_spectogram",
- type=bool,
- help="If true save raw spectogram for further (vocoder) processing in out_path.",
- default=False,
+ action="store_true",
+ help="Save raw spectogram for further (vocoder) processing in out_path.",
  )
  parser.add_argument(
  "--reference_wav",
@@ -295,8 +272,8 @@ def main():
  )
  parser.add_argument(
  "--progress_bar",
- type=str2bool,
- help="If true shows a progress bar for the model download. Defaults to True",
+ action=argparse.BooleanOptionalAction,
+ help="Show a progress bar for the model download.",
  default=True,
  )

@@ -337,19 +314,23 @@ def main():
  ]
  if not any(check_args):
  parser.parse_args(["-h"])
+ return args
+
+
+ def main():
+ setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+ args = parse_args()

  pipe_out = sys.stdout if args.pipe_out else None

  with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
  # Late-import to make things load faster
- from TTS.api import TTS
  from TTS.utils.manage import ModelManager
  from TTS.utils.synthesizer import Synthesizer

  # load model manager
  path = Path(__file__).parent / "../.models.json"
  manager = ModelManager(path, progress_bar=args.progress_bar)
- api = TTS()

  tts_path = None
  tts_config_path = None
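`synthesize.py` previously shipped its own `str2bool` helper combined with `nargs="?"`/`const=True`, so flags like `--list_models` could be given either bare or with a value. With `action="store_true"` they are bare switches only, while `--progress_bar` keeps its `True` default via `BooleanOptionalAction`, so `--no-progress_bar` turns the download progress bar off. A sketch mirroring the new definitions (standard `argparse`, not the full CLI):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--list_models", action="store_true")
parser.add_argument("--progress_bar", action=argparse.BooleanOptionalAction, default=True)

args = parser.parse_args(["--list_models", "--no-progress_bar"])
print(args.list_models, args.progress_bar)  # True False

# Old-style invocations such as `tts --list_models true` are no longer
# accepted; use the bare flag instead.
```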

TTS/encoder/README.md

+1 −1

@@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.

  - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
  - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
  - Watch training on Tensorboard as in TTS

TTS/server/README.md

+1 −1

@@ -15,7 +15,7 @@ Run the server with the official models.
  ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```

  Run the server with the official models on a GPU.
- ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
+ ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda```

  Run the server with a custom models.
  ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```

docs/source/docker_images.md

+2 −2

@@ -32,7 +32,7 @@ For the GPU version, you need to have the latest NVIDIA drivers installed.
  With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8

  ```bash
- docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
+ docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda
  ```

  ## Start a server
@@ -50,7 +50,7 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
  ```bash
  docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
  python3 TTS/server/server.py --list_models #To get the list of available models
- python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
+ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda
  ```

  Click [there](http://[::1]:5002/) and have fun with the server!

docs/source/models/bark.md

+2 −4

@@ -69,14 +69,12 @@ tts --model_name tts_models/multilingual/multi-dataset/bark \
  --text "This is an example." \
  --out_path "output.wav" \
  --voice_dir bark_voices/ \
- --speaker_idx "ljspeech" \
- --progress_bar True
+ --speaker_idx "ljspeech"

  # Random voice generation
  tts --model_name tts_models/multilingual/multi-dataset/bark \
  --text "This is an example." \
- --out_path "output.wav" \
- --progress_bar True
+ --out_path "output.wav"
  ```

docs/source/models/tortoise.md

+2 −4

@@ -57,14 +57,12 @@ tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
  --text "This is an example." \
  --out_path "output.wav" \
  --voice_dir path/to/tortoise/voices/dir/ \
- --speaker_idx "lj" \
- --progress_bar True
+ --speaker_idx "lj"

  # Random voice generation
  tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
  --text "This is an example." \
- --out_path "output.wav" \
- --progress_bar True
+ --out_path "output.wav"
  ```

docs/source/models/xtts.md

+4 −4

@@ -72,7 +72,7 @@ You can do inference using one of the available speakers using the following com
  --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." \
  --speaker_idx "Ana Florence" \
  --language_idx en \
- --use_cuda true
+ --use_cuda
  ```

  ##### Clone a voice
@@ -85,7 +85,7 @@ You can clone a speaker voice using a single or multiple references:
  --text "Bugün okula gitmek istemiyorum." \
  --speaker_wav /path/to/target/speaker.wav \
  --language_idx tr \
- --use_cuda true
+ --use_cuda
  ```

  ###### Multiple references
@@ -94,7 +94,7 @@ You can clone a speaker voice using a single or multiple references:
  --text "Bugün okula gitmek istemiyorum." \
  --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
  --language_idx tr \
- --use_cuda true
+ --use_cuda
  ```
  or for all wav files in a directory you can use:

@@ -103,7 +103,7 @@ or for all wav files in a directory you can use:
  --text "Bugün okula gitmek istemiyorum." \
  --speaker_wav /path/to/target/*.wav \
  --language_idx tr \
- --use_cuda true
+ --use_cuda
  ```

  #### 🐸TTS API

recipes/ljspeech/fast_pitch/train_fast_pitch.py

+1 −1

@@ -65,7 +65,7 @@
  model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
  # TODO: make compute_attention python callable
  os.system(
- f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
  )

  # INITIALIZE THE AUDIO PROCESSOR

recipes/ljspeech/fast_speech/train_fast_speech.py

+1 −1

@@ -64,7 +64,7 @@
  model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
  # TODO: make compute_attention python callable
  os.system(
- f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
  )

  # INITIALIZE THE AUDIO PROCESSOR

recipes/ljspeech/fastspeech2/train_fastspeech2.py

+1 −1

@@ -67,7 +67,7 @@
  model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
  # TODO: make compute_attention python callable
  os.system(
- f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
+ f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda"
  )

  # INITIALIZE THE AUDIO PROCESSOR
