diff --git a/README.md b/README.md
index 711a6ac3..769cfc62 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,7 @@ pip install so-vits-svc-fork
 ## Features not available in the original repo
 
 - **Realtime voice conversion**
+- GUI available
 - Unified command-line interface (no need to run Python scripts)
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained base model and HuBERT model
@@ -51,7 +52,25 @@ pip install so-vits-svc-fork
 
 ## Usage
 
-### Realtime Voice conversion
+### Inference
+
+#### GUI
+
+![GUI](https://raw.githubusercontent.com/34j/so-vits-svc-fork/main/docs/_static/gui.png)
+
+```shell
+svcg
+```
+
+#### CLI
+
+- Realtime (from microphone)
 
 ```shell
 svc vc --model-path <model-path>
 ```
+
+- File
+
+```shell
+svc --model-path <model-path> source.wav
+```
@@ -70,11 +89,7 @@ svc pre-hubert
 svc train
 ```
 
-### Inference
-
-```shell
-svc --model-path <model-path> source.wav
-```
+### Further help
 
 For more details, run `svc -h` or `svc <subcommand> -h`.
diff --git a/docs/_static/gui.png b/docs/_static/gui.png
new file mode 100644
index 00000000..dbc93c0b
Binary files /dev/null and b/docs/_static/gui.png differ
diff --git a/poetry.lock b/poetry.lock
index c2ace889..df7f724a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2578,6 +2578,18 @@ pytz = ">=2020.1"
 [package.extras]
 test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
 
+[[package]]
+name = "pebble"
+version = "5.0.3"
+description = "Threading and multiprocessing eye-candy."
+category = "main"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "Pebble-5.0.3-py3-none-any.whl", hash = "sha256:8274aa0959f387b368ede47666129cbe5d123f276a1bd9cafe77e020194b2141"},
+    {file = "Pebble-5.0.3.tar.gz", hash = "sha256:bdcfd9ea7e0aedb895b204177c19e6d6543d9962f4e3402ebab2175004863da8"},
+]
+
 [[package]]
 name = "pillow"
 version = "9.4.0"
@@ -3118,6 +3130,18 @@ files = [
     {file = "pyrsistent-0.19.3.tar.gz", hash = "sha256:1a2994773706bbb4995c31a97bc94f1418314923bd1048c6d964837040376440"},
 ]
 
+[[package]]
+name = "pysimplegui"
+version = "4.60.4"
+description = "Python GUIs for Humans. Launched in 2018. It's 2022 & PySimpleGUI is an ACTIVE & supported project. Super-simple to create custom GUI's. 325+ Demo programs & Cookbook for rapid start. Extensive documentation. Main docs at www.PySimpleGUI.org. Fun & your success are the focus. Examples using Machine Learning (GUI, OpenCV Integration), Rainmeter Style Desktop Widgets, Matplotlib + Pyplot, PIL support, add GUI to command line scripts, PDF & Image Viewers. Great for beginners & advanced GUI programmers."
+category = "main" +optional = false +python-versions = "*" +files = [ + {file = "PySimpleGUI-4.60.4-py3-none-any.whl", hash = "sha256:e133fbd21779f0f125cebbc2a4e1f5a931a383738661013ff33ad525d5611eda"}, + {file = "PySimpleGUI-4.60.4.tar.gz", hash = "sha256:f88c82c301a51aea35be605dc060bcceb0dcb6682e16280544884701ab4b23ba"}, +] + [[package]] name = "pysimplevalidate" version = "0.2.12" @@ -4670,4 +4694,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "bd0a5148f6634dc9b2df2d30a8752d0de8dc72d509827ea6b4245e12bfb34060" +content-hash = "c484b8f4456aa9c2c6964b1173b94cfed86b84643fe10adf55fd49714bfc8a16" diff --git a/pyproject.toml b/pyproject.toml index 560ab76a..ecf11490 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,10 @@ packages = [ [tool.poetry.scripts] so-vits-svc-fork = "so_vits_svc_fork.__main__:cli" svc = "so_vits_svc_fork.__main__:cli" +svcf = "so_vits_svc_fork.__main__:cli" +svcg = "so_vits_svc_fork.gui:main" +svc-gui = "so_vits_svc_fork.gui:main" +svcf-gui = "so_vits_svc_fork.gui:main" [tool.poetry.urls] "Bug Tracker" = "https://github.com/34j/so-vits-svc-fork/issues" @@ -55,6 +59,8 @@ tqdm-joblib = "*" tensorboardx = "*" pyinputplus = "*" cm-time = "^0.1.2" +pysimplegui = ">=4.6" +pebble = "^5.0.3" [tool.poetry.group.dev.dependencies] pre-commit = ">=3" diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py index 4f4b57ed..a07c7532 100644 --- a/src/so_vits_svc_fork/__main__.py +++ b/src/so_vits_svc_fork/__main__.py @@ -17,18 +17,22 @@ import torch from rich.logging import RichHandler -IN_COLAB = os.getenv("COLAB_RELEASE_TAG") - -basicConfig( - level=INFO, - format="%(asctime)s %(message)s", - datefmt="[%X]", - handlers=[ - RichHandler() if not IN_COLAB else StreamHandler(), - FileHandler(f"{__name__.split('.')[0]}.log"), - ], -) -captureWarnings(True) + +def init_logger() -> None: + IN_COLAB = os.getenv("COLAB_RELEASE_TAG") + + basicConfig( + level=INFO, + format="%(asctime)s %(message)s", + datefmt="[%X]", + handlers=[ + RichHandler() if not IN_COLAB else StreamHandler(), + FileHandler(f"{__name__.split('.')[0]}.log"), + ], + ) + captureWarnings(True) + + LOG = getLogger(__name__) @@ -48,6 +52,7 @@ def cli(): To train a model, run pre-resample, pre-config, pre-hubert, train. To infer a model, run infer. """ + init_logger() @click.help_option("--help", "-h") diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py new file mode 100644 index 00000000..d09ca2b8 --- /dev/null +++ b/src/so_vits_svc_fork/gui.py @@ -0,0 +1,234 @@ +from logging import getLogger +from pathlib import Path + +import PySimpleGUI as sg +import sounddevice as sd +import soundfile as sf +from pebble import ProcessPool + +from .__main__ import init_logger + +LOG = getLogger(__name__) + +init_logger() + + +def play_audio(path: Path | str): + if isinstance(path, Path): + path = path.as_posix() + data, sr = sf.read(path) + sd.play(data, sr) + + +def main(): + sg.theme("Dark") + model_candidates = list(sorted(Path("./logs/44k/").glob("G_*.pth"))) + layout = [ + [ + sg.Text("Model path: "), + sg.InputText( + key="model_path", + default_text=model_candidates[-1].as_posix() + if model_candidates + else "", + ), + sg.FileBrowse( + initial_folder="./logs/44k/" if Path("./logs/44k/").exists() else "." 
+ ), + ], + [ + sg.Text("Config path: "), + sg.InputText( + key="config_path", + default_text="./configs/44k/config.json", + enable_events=True, + ), + sg.FileBrowse( + initial_folder="./configs/44k/" + if Path("./configs/44k/").exists() + else "." + ), + ], + [sg.Text("Speaker"), sg.Combo(values=[], key="speaker", size=(20, 1))], + [ + sg.Text("Input audio path:"), + sg.InputText(key="input_path"), + sg.FileBrowse(initial_folder="."), + sg.Button("Play", key="play_input"), + ], + [ + sg.Text("Silence threshold: "), + sg.Slider( + range=(-60.0, 0), + orientation="h", + key="silence_threshold", + default_value=-20, + resolution=0.1, + ), + ], + [ + sg.Checkbox( + key="auto_predict_f0", + default=True, + text="Auto predict F0 (Pitch may become unstable when turned on in real-time inference.)", + ) + ], + [ + sg.Text("Pitch: "), + sg.Slider( + range=(-20, 20), orientation="h", key="transpose", default_value=0 + ), + ], + [ + sg.Text("Cluster infer ratio: "), + sg.Slider( + range=(0, 1.0), + orientation="h", + key="cluster_infer_ratio", + default_value=0, + resolution=0.01, + ), + ], + [ + sg.Text("Cluster model path: "), + sg.InputText(key="cluster_model_path"), + sg.FileBrowse(), + ], + [ + sg.Text("Noise scale: "), + sg.Slider( + range=(0.0, 1.0), + orientation="h", + key="noise_scale", + default_value=0.4, + resolution=0.01, + ), + ], + [ + sg.Text("Pad seconds"), + sg.Slider( + range=(0.0, 1.0), + orientation="h", + key="pad_seconds", + default_value=0.1, + resolution=0.01, + ), + ], + [ + sg.Text("Crossfade seconds"), + sg.Slider( + range=(0, 0.6), + orientation="h", + key="crossfade_seconds", + default_value=0.1, + resolution=0.001, + ), + ], + [ + sg.Text("Block seconds"), + sg.Slider( + range=(0, 3.0), + orientation="h", + key="block_seconds", + default_value=1, + resolution=0.01, + ), + ], + [sg.Checkbox(key="use_gpu", default=True, text="Use GPU")], + [sg.Checkbox(key="auto_play", default=True, text="Auto play")], + [ + sg.Button("Infer", key="infer"), + sg.Button("(Re)Start Voice Changer", key="start_vc"), + sg.Button("Stop Voice Changer", key="stop_vc"), + ], + ] + + window = sg.Window( + f"{__name__.split('.')[0]}", layout + ) # , use_custom_titlebar=True) + with ProcessPool(max_workers=1) as pool: + future = None + while True: + event, values = window.read(100) + if event == sg.WIN_CLOSED: + break + + def update_combo() -> None: + from . 
+                from . import utils
+
+                if Path(values["config_path"]).exists():
+                    hp = utils.get_hparams_from_file(values["config_path"])
+                    LOG.info(f"Loaded config from {values['config_path']}")
+                    window["speaker"].update(
+                        values=list(hp.__dict__["spk"].keys()), set_to_index=0
+                    )
+
+            if event != sg.EVENT_TIMEOUT:
+                LOG.info(f"Event: {event}, values: {values}")
+            if values["speaker"] == "":
+                update_combo()
+
+            if event == "config_path":
+                update_combo()
+            elif event == "infer":
+                from .inference_main import infer
+
+                input_path = Path(values["input_path"])
+                output_path = (
+                    input_path.parent / f"{input_path.stem}.out{input_path.suffix}"
+                )
+                infer(
+                    model_path=Path(values["model_path"]),
+                    config_path=Path(values["config_path"]),
+                    input_path=input_path,
+                    output_path=output_path,
+                    speaker=values["speaker"],
+                    cluster_model_path=Path(values["cluster_model_path"])
+                    if values["cluster_model_path"]
+                    else None,
+                    transpose=values["transpose"],
+                    auto_predict_f0=values["auto_predict_f0"],
+                    cluster_infer_ratio=values["cluster_infer_ratio"],
+                    noise_scale=values["noise_scale"],
+                    db_thresh=values["silence_threshold"],
+                    pad_seconds=values["pad_seconds"],
+                    device="cuda" if values["use_gpu"] else "cpu",
+                )
+                if values["auto_play"]:
+                    pool.schedule(play_audio, args=[output_path])
+            elif event == "play_input":
+                if Path(values["input_path"]).exists():
+                    pool.schedule(play_audio, args=[Path(values["input_path"])])
+            elif event == "start_vc":
+                from .inference_main import realtime
+
+                if future:
+                    LOG.info("Canceling previous task")
+                    future.cancel()
+                future = pool.schedule(
+                    realtime,
+                    kwargs=dict(
+                        model_path=Path(values["model_path"]),
+                        config_path=Path(values["config_path"]),
+                        speaker=values["speaker"],
+                        cluster_model_path=Path(values["cluster_model_path"])
+                        if values["cluster_model_path"]
+                        else None,
+                        transpose=values["transpose"],
+                        auto_predict_f0=values["auto_predict_f0"],
+                        cluster_infer_ratio=values["cluster_infer_ratio"],
+                        noise_scale=values["noise_scale"],
+                        crossfade_seconds=values["crossfade_seconds"],
+                        db_thresh=values["silence_threshold"],
+                        pad_seconds=values["pad_seconds"],
+                        device="cuda" if values["use_gpu"] else "cpu",
+                        block_seconds=values["block_seconds"],
+                    ),
+                )
+            elif event == "stop_vc":
+                if future:
+                    future.cancel()
+                    future = None
+        if future:
+            future.cancel()
+    window.close()
diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
index 0458af2e..1217f00a 100644
--- a/src/so_vits_svc_fork/inference/infer_tool.py
+++ b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -119,13 +119,19 @@ def infer(
     ):
         audio = audio.astype(np.float32)
         # get speaker id
-        speaker_id = self.spk2id.__dict__.get(speaker)
-        if not speaker_id and isinstance(speaker, int):
+        if isinstance(speaker, int):
             if speaker < len(self.spk2id.__dict__):
                 speaker_id = speaker
+            else:
+                raise ValueError(
+                    f"Speaker id {speaker} >= number of speakers {len(self.spk2id.__dict__)}"
+                )
         else:
-            LOG.warning(f"Speaker {speaker} is not found. Use speaker 0 instead.")
-            speaker_id = 0
+            if speaker in self.spk2id.__dict__:
+                speaker_id = self.spk2id.__dict__[speaker]
+            else:
+                LOG.warning(f"Speaker {speaker} is not found. Use speaker 0 instead.")
+                speaker_id = 0
         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
 
         # get unit f0
@@ -167,7 +173,7 @@ def infer_silence(
         # slice config
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
-        fade_seconds: float = 0.04,
+        # fade_seconds: float = 0.0,
     ) -> np.ndarray[Any, np.dtype[np.float32]]:
         chunks = slicer.cut(audio, self.target_sample, db_thresh=db_thresh)
         LOG.info(f"Cut audio into chunks {chunks}")
@@ -197,9 +203,9 @@ def infer_silence(
             _audio = _audio[pad_len:-pad_len]
 
             # add fade
-            fade_len = int(self.target_sample * fade_seconds)
-            _audio[:fade_len] = _audio[:fade_len] * np.linspace(0, 1, fade_len)
-            _audio[-fade_len:] = _audio[-fade_len:] * np.linspace(1, 0, fade_len)
+            # fade_len = int(self.target_sample * fade_seconds)
+            # _audio[:fade_len] = _audio[:fade_len] * np.linspace(0, 1, fade_len)
+            # _audio[-fade_len:] = _audio[-fade_len:] * np.linspace(1, 0, fade_len)
 
             result_audio = np.concatenate([result_audio, pad_array(_audio, length)])
         result_audio = result_audio[: audio.shape[0]]
         return result_audio
@@ -238,6 +244,15 @@ def process(
         db_thresh: int = -40,
         pad_seconds: float = 0.5,
     ):
+        """
+        chunks        : ■■■■■■□□□□□□
+        add last input:□■■■■■■
+                        ■□□□□□□
+        infer         :□■■■■■■
+                        ■□□□□□□
+        crossfade     :▲■■■■■
+                        ▲□□□□□
+        """
         if input_audio.ndim != 1:
             raise ValueError("Input audio must be 1-dimensional.")
         if input_audio.shape[0] < self.crossfade_len:
@@ -286,15 +301,14 @@ def process(
                 noise_scale=noise_scale,
             )
             infered_audio_c = infered_audio_c.cpu().numpy()
-            infered_audio_c = infered_audio_c
             LOG.info(f"Concentrated Inferred shape: {infered_audio_c.shape}")
             assert infered_audio_c.shape[0] == input_audio_c.shape[0]
 
             # crossfade
             result = maad.util.crossfade(
                 self.last_infered, infered_audio_c, 1, self.crossfade_len
-            )[: input_audio.shape[0]]
+            )[-(input_audio.shape[0] + self.crossfade_len) : -self.crossfade_len]
             LOG.info(f"Result shape: {result.shape}")
             assert result.shape[0] == input_audio.shape[0]
-            self.last_infered = infered_audio_c
+            self.last_infered = infered_audio_c[-self.crossfade_len - 1 :].copy()
             return result
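
Review note: the subtle part of this diff is the new tail-anchored slice in `process()` (`[-(input_audio.shape[0] + self.crossfade_len) : -self.crossfade_len]`) combined with keeping only the last `crossfade_len + 1` inferred samples for the next call. Below is a minimal numpy sketch of that bookkeeping as diagrammed in the new docstring; `crossfade_chunks`, `prev_tail`, `new_chunk`, `block_len`, and the linear fade are illustrative assumptions, not the `maad.util.crossfade` API the code actually calls.

```python
import numpy as np


def crossfade_chunks(
    prev_tail: np.ndarray, new_chunk: np.ndarray, fade_len: int
) -> np.ndarray:
    # Linearly blend the tail of the previously inferred chunk into the
    # head of the freshly inferred one over fade_len samples.
    fade_in = np.linspace(0.0, 1.0, fade_len)
    out = new_chunk.copy()
    out[:fade_len] = (
        prev_tail[-fade_len:] * (1.0 - fade_in) + new_chunk[:fade_len] * fade_in
    )
    return out


# Per realtime block, mirroring the new process() logic:
#   inferred     = infer(concat(last_input, new_block))      # "add last input" row
#   blended      = crossfade_chunks(last_tail, inferred, fade_len)
#   emitted      = blended[-(block_len + fade_len):-fade_len]  # the new slice
#   last_tail    = inferred[-fade_len - 1:]                  # saved for the next blend
```

The emitted window deliberately stops `crossfade_len` samples short of the end: that tail is only played on the next call, after being blended with the head of the next inferred chunk, so consecutive blocks join smoothly instead of being cut at a raw chunk boundary as with the old `[: input_audio.shape[0]]` slice.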