From 8c381e3e48662097e661c0d45b61ad19de56cd30 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Tue, 3 Dec 2024 22:06:39 +0100 Subject: [PATCH 01/10] docs: use .to("cuda") instead of deprecated gpu=True --- TTS/api.py | 8 ++++---- docs/source/models/bark.md | 4 ++-- docs/source/models/xtts.md | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index ed82825007..86787e0364 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -35,21 +35,21 @@ def __init__( >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") Example with a single-speaker model: - >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) + >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") Example loading a model from a path: - >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) + >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") Example voice cloning with YourTTS in English, French and Portuguese: - >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) + >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda") >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav") >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav") >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav") Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html): - >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True) + >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False).to("cuda") >>> tts.tts_to_file("This is a test.", file_path="output.wav") Args: diff --git a/docs/source/models/bark.md b/docs/source/models/bark.md index a180afbb91..77f99c0d3a 100644 --- a/docs/source/models/bark.md +++ b/docs/source/models/bark.md @@ -37,7 +37,7 @@ from TTS.api import TTS # Load the model to GPU # Bark is really slow on CPU, so we recommend using GPU. 
-tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/bark").to("cuda") # Cloning a new speaker @@ -57,7 +57,7 @@ tts.tts_to_file(text="Hello, my name is Manmay , how are you?", # random speaker -tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/bark").to("cuda") tts.tts_to_file("hello world", file_path="out.wav") ``` diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index c07d879f7c..7c0f1c4a60 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -118,7 +118,7 @@ You can optionally disable sentence splitting for better coherence but more VRAM ```python from TTS.api import TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # generate speech by cloning a voice using default settings tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -137,15 +137,15 @@ You can pass multiple audio files to the `speaker_wav` argument for better voice from TTS.api import TTS # using the default version set in 🐸TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # using a specific version # 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main # ❗some versions might be incompatible with the API -tts = TTS("xtts_v2.0.2", gpu=True) +tts = TTS("xtts_v2.0.2").to("cuda") # getting the latest XTTS_v2 -tts = TTS("xtts", gpu=True) +tts = TTS("xtts").to("cuda") # generate speech by cloning a voice using default settings tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -160,7 +160,7 @@ You can do inference using one of the available speakers using the following cod ```python from TTS.api import TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # generate speech by cloning a voice using default settings tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", From 5cfb4ecccdec909fd4e92b8e8c833dd33870d38e Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Tue, 3 Dec 2024 22:09:03 +0100 Subject: [PATCH 02/10] refactor(api): require keyword arguments except for model_name --- TTS/api.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 86787e0364..62dab32922 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -2,6 +2,7 @@ import tempfile import warnings from pathlib import Path +from typing import Optional from torch import nn @@ -19,12 +20,13 @@ class TTS(nn.Module): def __init__( self, model_name: str = "", - model_path: str = None, - config_path: str = None, - vocoder_path: str = None, - vocoder_config_path: str = None, + *, + model_path: Optional[str] = None, + config_path: Optional[str] = None, + vocoder_path: Optional[str] = None, + vocoder_config_path: Optional[str] = None, progress_bar: bool = True, - gpu=False, + gpu: bool = False, ): """🐸TTS python interface that allows to load and use the released models. 
From 42ad9b00c684666080c406840a8ccab5316734ce Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Wed, 4 Dec 2024 10:44:49 +0100 Subject: [PATCH 03/10] feat(api): support specifying vocoders by name --- TTS/api.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 62dab32922..49b9a6b78f 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -23,6 +23,7 @@ def __init__( *, model_path: Optional[str] = None, config_path: Optional[str] = None, + vocoder_name: Optional[str] = None, vocoder_path: Optional[str] = None, vocoder_config_path: Optional[str] = None, progress_bar: bool = True, @@ -58,6 +59,7 @@ def __init__( model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. model_path (str, optional): Path to the model checkpoint. Defaults to None. config_path (str, optional): Path to the model config. Defaults to None. + vocoder_name (str, optional): Pre-trained vocoder to use. Defaults to None, i.e. using the default vocoder. vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. @@ -74,11 +76,12 @@ def __init__( if model_name is not None and len(model_name) > 0: if "tts_models" in model_name: - self.load_tts_model_by_name(model_name, gpu) + self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) elif "voice_conversion_models" in model_name: - self.load_vc_model_by_name(model_name, gpu) + self.load_vc_model_by_name(model_name, gpu=gpu) + # To allow just TTS("xtts") else: - self.load_model_by_name(model_name, gpu) + self.load_model_by_name(model_name, vocoder_name, gpu=gpu) if model_path: self.load_tts_model_by_path( @@ -129,7 +132,9 @@ def get_models_file_path(): def list_models(): return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() - def download_model_by_name(self, model_name: str): + def download_model_by_name( + self, model_name: str, vocoder_name: Optional[str] = None + ) -> tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files @@ -137,19 +142,21 @@ def download_model_by_name(self, model_name: str): return None, None, None, None, model_path if model_item.get("default_vocoder") is None: return model_path, config_path, None, None, None - vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"]) + if vocoder_name is None: + vocoder_name = model_item["default_vocoder"] + vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) return model_path, config_path, vocoder_path, vocoder_config_path, None - def load_model_by_name(self, model_name: str, gpu: bool = False): + def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False): """Load one of the 🐸TTS models by name. Args: model_name (str): Model name to load. You can list models by ```tts.models```. gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. 
""" - self.load_tts_model_by_name(model_name, gpu) + self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, gpu: bool = False): + def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False): """Load one of the voice conversion models by name. Args: @@ -162,7 +169,7 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False): vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu ) - def load_tts_model_by_name(self, model_name: str, gpu: bool = False): + def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False): """Load one of 🐸TTS models by name. Args: @@ -174,7 +181,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) # init synthesizer # None values are fetch from the model From 5daed879e05178836f6836e7923a6034a1c061e4 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Wed, 4 Dec 2024 14:47:46 +0100 Subject: [PATCH 04/10] chore(bin.synthesize): remove unused argument --- TTS/bin/synthesize.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 454f528ab4..59ceb1db4f 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -253,11 +253,6 @@ def parse_args() -> argparse.Namespace: action="store_true", ) # aux args - parser.add_argument( - "--save_spectogram", - action="store_true", - help="Save raw spectogram for further (vocoder) processing in out_path.", - ) parser.add_argument( "--reference_wav", type=str, From 1a4e58d0ce1caa15b1d476be4da6e379ef46f084 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Wed, 4 Dec 2024 15:39:15 +0100 Subject: [PATCH 05/10] feat(api): support passing a custom speaker encoder by path --- TTS/api.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 49b9a6b78f..7ca79405cd 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -26,6 +26,8 @@ def __init__( vocoder_name: Optional[str] = None, vocoder_path: Optional[str] = None, vocoder_config_path: Optional[str] = None, + encoder_path: Optional[str] = None, + encoder_config_path: Optional[str] = None, progress_bar: bool = True, gpu: bool = False, ): @@ -62,6 +64,8 @@ def __init__( vocoder_name (str, optional): Pre-trained vocoder to use. Defaults to None, i.e. using the default vocoder. vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. + encoder_path: Path to speaker encoder checkpoint. Default to None. + encoder_config_path: Path to speaker encoder config file. Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ @@ -71,6 +75,8 @@ def __init__( self.synthesizer = None self.voice_converter = None self.model_name = "" + self.encoder_path = encoder_path + self.encoder_config_path = encoder_config_path if gpu: warnings.warn("`gpu` will be deprecated. 
Please use `tts.to(device)` instead.") @@ -194,8 +200,8 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = tts_languages_file=None, vocoder_checkpoint=vocoder_path, vocoder_config=vocoder_config_path, - encoder_checkpoint=None, - encoder_config=None, + encoder_checkpoint=self.encoder_path, + encoder_config=self.encoder_config_path, model_dir=model_dir, use_cuda=gpu, ) @@ -220,8 +226,8 @@ def load_tts_model_by_path( tts_languages_file=None, vocoder_checkpoint=vocoder_path, vocoder_config=vocoder_config, - encoder_checkpoint=None, - encoder_config=None, + encoder_checkpoint=self.encoder_path, + encoder_config=self.encoder_config_path, use_cuda=gpu, ) From 85dbb3b8b3c9bfa9fb863996e2acf55a2d45c568 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Wed, 4 Dec 2024 16:02:07 +0100 Subject: [PATCH 06/10] feat(api): allow mixing TTS and vocoder model name and path --- TTS/api.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 7ca79405cd..86593f3fb6 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -75,8 +75,12 @@ def __init__( self.synthesizer = None self.voice_converter = None self.model_name = "" + + self.vocoder_path = vocoder_path + self.vocoder_config_path = vocoder_config_path self.encoder_path = encoder_path self.encoder_config_path = encoder_config_path + if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") @@ -90,9 +94,7 @@ def __init__( self.load_model_by_name(model_name, vocoder_name, gpu=gpu) if model_path: - self.load_tts_model_by_path( - model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu - ) + self.load_tts_model_by_path(model_path, config_path, gpu=gpu) @property def models(self): @@ -140,18 +142,22 @@ def list_models(): def download_model_by_name( self, model_name: str, vocoder_name: Optional[str] = None - ) -> tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: + ) -> tuple[Optional[str], Optional[str], Optional[str]]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files # we assume that the model knows how to load itself - return None, None, None, None, model_path + return None, None, model_path if model_item.get("default_vocoder") is None: - return model_path, config_path, None, None, None + return model_path, config_path, None if vocoder_name is None: vocoder_name = model_item["default_vocoder"] vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) - return model_path, config_path, vocoder_path, vocoder_config_path, None + # A local vocoder model will take precedence if specified via vocoder_path + if self.vocoder_path is None or self.vocoder_config_path is None: + self.vocoder_path = vocoder_path + self.vocoder_config_path = vocoder_config_path + return model_path, config_path, None def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False): """Load one of the 🐸TTS models by name. @@ -170,7 +176,7 @@ def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False): gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. 
""" self.model_name = model_name - model_path, config_path, _, _, model_dir = self.download_model_by_name(model_name) + model_path, config_path, model_dir = self.download_model_by_name(model_name) self.voice_converter = Synthesizer( vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu ) @@ -187,9 +193,7 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( - model_name, vocoder_name - ) + model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name) # init synthesizer # None values are fetch from the model @@ -198,17 +202,15 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = tts_config_path=config_path, tts_speakers_file=None, tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, + vocoder_checkpoint=self.vocoder_path, + vocoder_config=self.vocoder_config_path, encoder_checkpoint=self.encoder_path, encoder_config=self.encoder_config_path, model_dir=model_dir, use_cuda=gpu, ) - def load_tts_model_by_path( - self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False - ): + def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool = False) -> None: """Load a model from a path. Args: @@ -224,8 +226,8 @@ def load_tts_model_by_path( tts_config_path=config_path, tts_speakers_file=None, tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config, + vocoder_checkpoint=self.vocoder_path, + vocoder_config=self.vocoder_config_path, encoder_checkpoint=self.encoder_path, encoder_config=self.encoder_config_path, use_cuda=gpu, From a05177ce713a73112b40cb283ad9d3328a08f7cc Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Wed, 4 Dec 2024 16:11:43 +0100 Subject: [PATCH 07/10] chore(api): add type hints --- TTS/api.py | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 86593f3fb6..d16012f849 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -1,3 +1,5 @@ +"""Coqui TTS Python API.""" + import logging import tempfile import warnings @@ -30,7 +32,7 @@ def __init__( encoder_config_path: Optional[str] = None, progress_bar: bool = True, gpu: bool = False, - ): + ) -> None: """🐸TTS python interface that allows to load and use the released models. Example with a multi-speaker model: @@ -97,17 +99,17 @@ def __init__( self.load_tts_model_by_path(model_path, config_path, gpu=gpu) @property - def models(self): + def models(self) -> list[str]: return self.manager.list_tts_models() @property - def is_multi_speaker(self): + def is_multi_speaker(self) -> bool: if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager: return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 return False @property - def is_multi_lingual(self): + def is_multi_lingual(self) -> bool: # Not sure what sets this to None, but applied a fix to prevent crashing. 
if ( isinstance(self.model_name, str) @@ -121,23 +123,23 @@ def is_multi_lingual(self): return False @property - def speakers(self): + def speakers(self) -> list[str]: if not self.is_multi_speaker: return None return self.synthesizer.tts_model.speaker_manager.speaker_names @property - def languages(self): + def languages(self) -> list[str]: if not self.is_multi_lingual: return None return self.synthesizer.tts_model.language_manager.language_names @staticmethod - def get_models_file_path(): + def get_models_file_path() -> Path: return Path(__file__).parent / ".models.json" @staticmethod - def list_models(): + def list_models() -> list[str]: return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() def download_model_by_name( @@ -159,7 +161,7 @@ def download_model_by_name( self.vocoder_config_path = vocoder_config_path return model_path, config_path, None - def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False): + def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: """Load one of the 🐸TTS models by name. Args: @@ -168,7 +170,7 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None """ self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False): + def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None: """Load one of the voice conversion models by name. Args: @@ -181,7 +183,7 @@ def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False): vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu ) - def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False): + def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None: """Load one of 🐸TTS models by name. Args: @@ -235,11 +237,11 @@ def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool def _check_arguments( self, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: Optional[str] = None, + language: Optional[str] = None, + speaker_wav: Optional[str] = None, + emotion: Optional[str] = None, + speed: Optional[float] = None, **kwargs, ) -> None: """Check if the arguments are valid for the model.""" @@ -320,7 +322,7 @@ def tts_to_file( file_path: str = "output.wav", split_sentences: bool = True, **kwargs, - ): + ) -> str: """Convert text to speech. Args: @@ -451,7 +453,7 @@ def tts_with_vc_to_file( file_path: str = "output.wav", speaker: str = None, split_sentences: bool = True, - ): + ) -> str: """Convert text to speech with voice conversion and save to file. Check `tts_with_vc` for more details. 
@@ -479,3 +481,4 @@ def tts_with_vc_to_file( text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences ) save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + return file_path From 89abd9862009f642385af96e7c0e2fdd0b50a5f6 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Thu, 5 Dec 2024 21:34:04 +0100 Subject: [PATCH 08/10] feat(api): support passing speaker/language id file paths --- TTS/api.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index d16012f849..90f167dc52 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -30,6 +30,8 @@ def __init__( vocoder_config_path: Optional[str] = None, encoder_path: Optional[str] = None, encoder_config_path: Optional[str] = None, + speakers_file_path: Optional[str] = None, + language_ids_file_path: Optional[str] = None, progress_bar: bool = True, gpu: bool = False, ) -> None: @@ -68,8 +70,10 @@ def __init__( vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. encoder_path: Path to speaker encoder checkpoint. Default to None. encoder_config_path: Path to speaker encoder config file. Defaults to None. - progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. - gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + speakers_file_path: JSON file for multi-speaker model. Defaults to None. + language_ids_file_path: JSON file for multilingual model. Defaults to None + progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True. + gpu (bool, optional): Enable/disable GPU. Defaults to False. DEPRECATED, use TTS(...).to("cuda") """ super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) @@ -82,6 +86,8 @@ def __init__( self.vocoder_config_path = vocoder_config_path self.encoder_path = encoder_path self.encoder_config_path = encoder_config_path + self.speakers_file_path = speakers_file_path + self.language_ids_file_path = language_ids_file_path if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") @@ -226,8 +232,8 @@ def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool self.synthesizer = Synthesizer( tts_checkpoint=model_path, tts_config_path=config_path, - tts_speakers_file=None, - tts_languages_file=None, + tts_speakers_file=self.speakers_file_path, + tts_languages_file=self.language_ids_file_path, vocoder_checkpoint=self.vocoder_path, vocoder_config=self.vocoder_config_path, encoder_checkpoint=self.encoder_path, From 806af96e4c993eae45d2129773784a5624fb4bb5 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Fri, 6 Dec 2024 11:56:54 +0100 Subject: [PATCH 09/10] refactor(api): use save_wav() from Synthesizer instance --- TTS/api.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 90f167dc52..be6141d312 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -9,7 +9,6 @@ from torch import nn from TTS.config import load_config -from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer @@ -394,6 +393,7 @@ def voice_conversion_to_file( source_wav: str, target_wav: str, file_path: str = "output.wav", + pipe_out=None, ) -> str: """Voice conversion with FreeVC. Convert source wav to target speaker. 
@@ -404,9 +404,11 @@ def voice_conversion_to_file( Path to the target wav file. file_path (str, optional): Output file path. Defaults to "output.wav". + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. """ wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav) - save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) return file_path def tts_with_vc( @@ -459,6 +461,7 @@ def tts_with_vc_to_file( file_path: str = "output.wav", speaker: str = None, split_sentences: bool = True, + pipe_out=None, ) -> str: """Convert text to speech with voice conversion and save to file. @@ -482,9 +485,11 @@ def tts_with_vc_to_file( Split text into sentences, synthesize them separately and concatenate the file audio. Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only applicable to the 🐸TTS models. Defaults to True. + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. """ wav = self.tts_with_vc( text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences ) - save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) return file_path From e0f621180f328eac461e9fee978073db3c7b8421 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Mon, 2 Dec 2024 22:34:19 +0100 Subject: [PATCH 10/10] refactor(bin.synthesize): use Python API for CLI --- TTS/api.py | 16 +++-- TTS/bin/synthesize.py | 125 +++++++++++---------------------- tests/zoo_tests/test_models.py | 33 ++++----- 3 files changed, 65 insertions(+), 109 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index be6141d312..83189482cb 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -109,7 +109,11 @@ def models(self) -> list[str]: @property def is_multi_speaker(self) -> bool: - if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager: + if ( + self.synthesizer is not None + and hasattr(self.synthesizer.tts_model, "speaker_manager") + and self.synthesizer.tts_model.speaker_manager + ): return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 return False @@ -123,7 +127,11 @@ def is_multi_lingual(self) -> bool: and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1) ): return True - if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: + if ( + self.synthesizer is not None + and hasattr(self.synthesizer.tts_model, "language_manager") + and self.synthesizer.tts_model.language_manager + ): return self.synthesizer.tts_model.language_manager.num_languages > 1 return False @@ -306,10 +314,6 @@ def tts( speaker_name=speaker, language_name=language, speaker_wav=speaker_wav, - reference_wav=None, - style_wav=None, - style_text=None, - reference_speaker_name=None, split_sentences=split_sentences, **kwargs, ) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 59ceb1db4f..885f6d6f0c 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -9,8 +9,6 @@ from argparse import RawTextHelpFormatter # pylint: disable=redefined-outer-name, unused-argument -from pathlib import Path - from TTS.utils.generic_utils import ConsoleFormatter, setup_logger logger = logging.getLogger(__name__) @@ -312,7 
+310,8 @@ def parse_args() -> argparse.Namespace: return args -def main(): +def main() -> None: + """Entry point for `tts` command line interface.""" setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) args = parse_args() @@ -320,12 +319,11 @@ def main(): with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): # Late-import to make things load faster + from TTS.api import TTS from TTS.utils.manage import ModelManager - from TTS.utils.synthesizer import Synthesizer # load model manager - path = Path(__file__).parent / "../.models.json" - manager = ModelManager(path, progress_bar=args.progress_bar) + manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=args.progress_bar) tts_path = None tts_config_path = None @@ -339,12 +337,12 @@ def main(): vc_config_path = None model_dir = None - # CASE1 #list : list pre-trained TTS models + # 1) List pre-trained TTS models if args.list_models: manager.list_models() sys.exit() - # CASE2 #info : model info for pre-trained TTS models + # 2) Info about pre-trained TTS models (without loading a model) if args.model_info_by_idx: model_query = args.model_info_by_idx manager.model_info_by_idx(model_query) @@ -355,91 +353,50 @@ def main(): manager.model_info_by_full_name(model_query_full_name) sys.exit() - # CASE3: load pre-trained model paths - if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - # tts model - if model_item["model_type"] == "tts_models": - tts_path = model_path - tts_config_path = config_path - if args.vocoder_name is None and "default_vocoder" in model_item: - args.vocoder_name = model_item["default_vocoder"] - - # voice conversion model - if model_item["model_type"] == "voice_conversion_models": - vc_path = model_path - vc_config_path = config_path - - # tts model with multiple files to be loaded from the directory path - if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): - model_dir = model_path - tts_path = None - tts_config_path = None - args.vocoder_name = None - - # load vocoder - if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - - # CASE4: set custom model paths - if args.model_path is not None: - tts_path = args.model_path - tts_config_path = args.config_path - speakers_file_path = args.speakers_file_path - language_ids_file_path = args.language_ids_file_path - - if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - - if args.encoder_path is not None: - encoder_path = args.encoder_path - encoder_config_path = args.encoder_config_path - + # 3) Load a model for further info or TTS/VC device = args.device if args.use_cuda: device = "cuda" - - # load models - synthesizer = Synthesizer( - tts_checkpoint=tts_path, - tts_config_path=tts_config_path, - tts_speakers_file=speakers_file_path, - tts_languages_file=language_ids_file_path, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint=encoder_path, - encoder_config=encoder_config_path, - vc_checkpoint=vc_path, - vc_config=vc_config_path, - model_dir=model_dir, - voice_dir=args.voice_dir, + # A local model will take precedence if specified via modeL_path + model_name = args.model_name if args.model_path is None else None + api = TTS( + model_name=model_name, + model_path=args.model_path, + 
config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + encoder_path=args.encoder_path, + encoder_config_path=args.encoder_config_path, + speakers_file_path=args.speakers_file_path, + language_ids_file_path=args.language_ids_file_path, + progress_bar=args.progress_bar, ).to(device) # query speaker ids of a multi-speaker model. if args.list_speaker_idxs: - if synthesizer.tts_model.speaker_manager is None: + if not api.is_multi_speaker: logger.info("Model only has a single speaker.") return logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - logger.info(list(synthesizer.tts_model.speaker_manager.name_to_id.keys())) + logger.info(api.speakers) return # query langauge ids of a multi-lingual model. if args.list_language_idxs: - if synthesizer.tts_model.language_manager is None: + if not api.is_multi_lingual: logger.info("Monolingual model.") return logger.info( "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) - logger.info(synthesizer.tts_model.language_manager.name_to_id) + logger.info(api.languages) return # check the arguments against a multi-speaker model. - if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav): logger.error( "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." @@ -450,31 +407,29 @@ def main(): if args.text: logger.info("Text: %s", args.text) - # kick it - if tts_path is not None: - wav = synthesizer.tts( - args.text, - speaker_name=args.speaker_idx, - language_name=args.language_idx, + if args.text is not None: + api.tts_to_file( + text=args.text, + speaker=args.speaker_idx, + language=args.language_idx, speaker_wav=args.speaker_wav, + pipe_out=pipe_out, + file_path=args.out_path, reference_wav=args.reference_wav, style_wav=args.capacitron_style_wav, style_text=args.capacitron_style_text, reference_speaker_name=args.reference_speaker_idx, + voice_dir=args.voice_dir, ) - elif vc_path is not None: - wav = synthesizer.voice_conversion( + logger.info("Saved TTS output to %s", args.out_path) + elif args.source_wav is not None and args.target_wav is not None: + api.voice_conversion_to_file( source_wav=args.source_wav, target_wav=args.target_wav, + file_path=args.out_path, + pipe_out=pipe_out, ) - elif model_dir is not None: - wav = synthesizer.tts( - args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav - ) - - # save the results - synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out) - logger.info("Saved output to %s", args.out_path) + logger.info("Saved VC output to %s", args.out_path) if __name__ == "__main__": diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index b944423988..f38880b51f 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -34,30 +34,27 @@ def run_models(offset=0, step=1): # download and run the model speaker_files = glob.glob(local_download_dir + "/speaker*") language_files = glob.glob(local_download_dir + "/language*") - language_id = "" + speaker_arg = "" + language_arg = "" if len(speaker_files) > 0: # multi-speaker model if "speaker_ids" in speaker_files[0]: speaker_manager = 
SpeakerManager(speaker_id_file_path=speaker_files[0]) elif "speakers" in speaker_files[0]: speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) - - # multi-lingual model - Assuming multi-lingual models are also multi-speaker - if len(language_files) > 0 and "language_ids" in language_files[0]: - language_manager = LanguageManager(language_ids_file_path=language_files[0]) - language_id = language_manager.language_names[0] - - speaker_id = list(speaker_manager.name_to_id.keys())[0] - run_cli( - f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" --no-progress_bar' - ) - else: - # single-speaker model - run_cli( - f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' - ) + speakers = list(speaker_manager.name_to_id.keys()) + if len(speakers) > 1: + speaker_arg = f'--speaker_idx "{speakers[0]}"' + if len(language_files) > 0 and "language_ids" in language_files[0]: + # multi-lingual model + language_manager = LanguageManager(language_ids_file_path=language_files[0]) + languages = language_manager.language_names + if len(languages) > 1: + language_arg = f'--language_idx "{languages[0]}"' + run_cli( + f'tts --model_name {model_name} --text "This is an example." ' + f'--out_path "{output_path}" {speaker_arg} {language_arg} --no-progress_bar' + ) # remove downloaded models shutil.rmtree(local_download_dir) shutil.rmtree(get_user_data_dir("tts"))
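
Taken together, the series makes the `TTS` constructor keyword-only after `model_name`, adds `vocoder_name`, encoder and speaker/language file-path arguments, deprecates `gpu=True` in favour of `.to(device)`, and gives `voice_conversion_to_file` a `pipe_out` parameter. A minimal usage sketch of the API as it stands after patch 10 — the Thorsten and FreeVC model names, the `hifigan_v1` vocoder name, and all local file paths are illustrative assumptions, not taken from the diffs:

```python
from TTS.api import TTS

# Pre-trained model with an explicitly chosen vocoder (vocoder name assumed),
# moved to GPU via .to("cuda") instead of the deprecated gpu=True flag.
tts = TTS(
    "tts_models/de/thorsten/tacotron2-DDC",
    vocoder_name="vocoder_models/de/thorsten/hifigan_v1",
).to("cuda")
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

# Local checkpoint with a custom speaker encoder and speakers/language files
# (all paths are placeholders for illustration only).
tts_local = TTS(
    model_path="/path/to/checkpoint_100000.pth",
    config_path="/path/to/config.json",
    encoder_path="/path/to/encoder.pth",
    encoder_config_path="/path/to/encoder_config.json",
    speakers_file_path="/path/to/speakers.json",
    progress_bar=False,
)
tts_local.tts_to_file(text="Hello world!", file_path="local_output.wav")

# Voice conversion; pipe_out (added in patch 09) can be set to a binary stream
# such as sys.stdout.buffer to support shell pipes, or left as None.
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda")
vc.voice_conversion_to_file(
    source_wav="my/source.wav",
    target_wav="my/target.wav",
    file_path="vc_output.wav",
    pipe_out=None,
)
```

This mirrors what the refactored `TTS/bin/synthesize.py` now does internally: the CLI builds a single `TTS` instance and routes `--vocoder_name`, `--encoder_path`, `--speakers_file_path` and friends straight into the constructor instead of assembling a `Synthesizer` by hand.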