feat: support vocoders for voice conversion

eginhard · eginhard · commit 58a11ab1251c · 2025-01-14T15:07:28.000+01:00
The voice conversion models supported so far, FreeVC and OpenVoice, are both
VITS-based and don't have a separate vocoder. kNN-VC needs to be combined with
a HiFi-GAN vocoder.
diff --git a/TTS/api.py b/TTS/api.py
@@ -95,7 +95,7 @@ def __init__(
             if "tts_models" in model_name:
                 self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
             elif "voice_conversion_models" in model_name:
-                self.load_vc_model_by_name(model_name, gpu=gpu)
+                self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu)
             # To allow just TTS("xtts")
             else:
                 self.load_model_by_name(model_name, vocoder_name, gpu=gpu)
@@ -157,22 +157,24 @@ def list_models() -> list[str]:
 
     def download_model_by_name(
         self, model_name: str, vocoder_name: Optional[str] = None
-    ) -> tuple[Optional[Path], Optional[Path], Optional[Path]]:
+    ) -> tuple[Optional[Path], Optional[Path], Optional[Path], Optional[Path], Optional[Path]]:
         model_path, config_path, model_item = self.manager.download_model(model_name)
         if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
             # return model directory if there are multiple files
             # we assume that the model knows how to load itself
-            return None, None, model_path
+            return None, None, None, None, model_path
         if model_item.get("default_vocoder") is None:
-            return model_path, config_path, None
+            return model_path, config_path, None, None, None
         if vocoder_name is None:
             vocoder_name = model_item["default_vocoder"]
-        vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
-        # A local vocoder model will take precedence if specified via vocoder_path
-        if self.vocoder_path is None or self.vocoder_config_path is None:
-            self.vocoder_path = vocoder_path
-            self.vocoder_config_path = vocoder_config_path
-        return model_path, config_path, None
+        vocoder_path, vocoder_config_path = None, None
+        # A local vocoder model will take precedence if already specified in __init__
+        if model_item["model_type"] == "tts_models":
+            vocoder_path = self.vocoder_path
+            vocoder_config_path = self.vocoder_config_path
+        if vocoder_path is None or vocoder_config_path is None:
+            vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
+        return model_path, config_path, vocoder_path, vocoder_config_path, None
 
     def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
         """Load one of the 🐸TTS models by name.
@@ -183,17 +185,24 @@ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None
         """
         self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
 
-    def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None:
+    def load_vc_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
         """Load one of the voice conversion models by name.
 
         Args:
             model_name (str): Model name to load. You can list models by ```tts.models```.
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         self.model_name = model_name
-        model_path, config_path, model_dir = self.download_model_by_name(model_name)
+        model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+            model_name, vocoder_name
+        )
         self.voice_converter = Synthesizer(
-            vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu
+            vc_checkpoint=model_path,
+            vc_config=config_path,
+            vocoder_checkpoint=vocoder_path,
+            vocoder_config=vocoder_config_path,
+            model_dir=model_dir,
+            use_cuda=gpu,
         )
 
     def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
@@ -208,7 +217,9 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] =
         self.synthesizer = None
         self.model_name = model_name
 
-        model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name)
+        model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+            model_name, vocoder_name
+        )
 
         # init synthesizer
         # None values are fetch from the model
@@ -217,8 +228,8 @@ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] =
             tts_config_path=config_path,
             tts_speakers_file=None,
             tts_languages_file=None,
-            vocoder_checkpoint=self.vocoder_path,
-            vocoder_config=self.vocoder_config_path,
+            vocoder_checkpoint=vocoder_path,
+            vocoder_config=vocoder_config_path,
             encoder_checkpoint=self.encoder_path,
             encoder_config=self.encoder_config_path,
             model_dir=model_dir,
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
@@ -98,12 +98,12 @@ def __init__(
         if tts_checkpoint:
             self._load_tts(self.tts_checkpoint, self.tts_config_path, use_cuda)
 
-        if vocoder_checkpoint:
-            self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda)
-
         if vc_checkpoint and model_dir == "":
             self._load_vc(self.vc_checkpoint, self.vc_config, use_cuda)
 
+        if vocoder_checkpoint:
+            self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda)
+
         if model_dir:
             if "fairseq" in model_dir:
                 self._load_fairseq_from_dir(model_dir, use_cuda)
@@ -273,8 +273,10 @@ def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None:
         save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out)
 
     def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]:
-        output_wav = self.vc_model.voice_conversion(source_wav, target_wav)
-        return output_wav
+        output = self.vc_model.voice_conversion(source_wav, target_wav)
+        if self.vocoder_model is not None:
+            output = self.vocoder_model.inference(output)
+        return output.squeeze()
 
     def tts(
         self,