feat: add gui (#3)

voicepaw · Mar 17, 2023 · 34aec2b · 34aec2b
1 parent 7b74606
commit 34aec2b
Show file tree

Hide file tree

Showing 7 changed files with 328 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -44,14 +44,33 @@ pip install so-vits-svc-fork
 ## Features not available in the original repo
 
 - **Realtime voice conversion**
+- GUI available
 - Unified command-line interface (no need to run Python scripts)
 - Ready to use just by installing with `pip`.
 - Automatically download pretrained base model and HuBERT model
 - Code completely formatted with black, isort, autoflake etc.
 
 ## Usage
 
-### Realtime Voice conversion
+### Inference
+
+#### GUI
+
+![GUI](https://raw.githubusercontent.com/34j/so-vits-svc-fork/main/docs/_static/gui.png)
+
+```shell
+svcg
+```
+
+#### CLI
+
+- File
+
+```shell
+svc --model-path <model-path> source.wav
+```
+
+- Realtime (from microphone)
 
 ```shell
 svc vc --model-path <model-path>
@@ -70,11 +89,7 @@ svc pre-hubert
 svc train
 ```
 
-### Inference
-
-```shell
-svc --model-path <model-path> source.wav
-```
+### Further help
 
 For more details, run `svc -h` or `svc <subcommand> -h`.
 

diff --git a/docs/_static/gui.png b/docs/_static/gui.png
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,10 @@ packages = [
 [tool.poetry.scripts]
 so-vits-svc-fork = "so_vits_svc_fork.__main__:cli"
 svc = "so_vits_svc_fork.__main__:cli"
+svcf = "so_vits_svc_fork.__main__:cli"
+svcg = "so_vits_svc_fork.gui:main"
+svc-gui = "so_vits_svc_fork.gui:main"
+svcf-gui = "so_vits_svc_fork.gui:main"
 
 [tool.poetry.urls]
 "Bug Tracker" = "https://github.com/34j/so-vits-svc-fork/issues"
@@ -55,6 +59,8 @@ tqdm-joblib = "*"
 tensorboardx = "*"
 pyinputplus = "*"
 cm-time = "^0.1.2"
+pysimplegui = ">=4.6"
+pebble = "^5.0.3"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = ">=3"

diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -17,18 +17,22 @@
 import torch
 from rich.logging import RichHandler
 
-IN_COLAB = os.getenv("COLAB_RELEASE_TAG")
-
-basicConfig(
-    level=INFO,
-    format="%(asctime)s %(message)s",
-    datefmt="[%X]",
-    handlers=[
-        RichHandler() if not IN_COLAB else StreamHandler(),
-        FileHandler(f"{__name__.split('.')[0]}.log"),
-    ],
-)
-captureWarnings(True)
+
+def init_logger() -> None:
+    """Configure logging for the application.
+
+    Calls ``basicConfig`` at INFO level with two handlers: a console handler
+    (``StreamHandler`` when the ``COLAB_RELEASE_TAG`` environment variable is
+    set, ``RichHandler`` otherwise) and a ``FileHandler`` writing to
+    ``<first dotted component of this module's name>.log``. Warnings are then
+    routed into logging via ``captureWarnings(True)``.
+    """
+    running_in_colab = os.getenv("COLAB_RELEASE_TAG")
+    if running_in_colab:
+        console_handler = StreamHandler()
+    else:
+        console_handler = RichHandler()
+    log_file_handler = FileHandler(f"{__name__.split('.')[0]}.log")
+
+    basicConfig(
+        level=INFO,
+        format="%(asctime)s %(message)s",
+        datefmt="[%X]",
+        handlers=[console_handler, log_file_handler],
+    )
+    captureWarnings(True)
+
+
 LOG = getLogger(__name__)
 
 
@@ -48,6 +52,7 @@ def cli():
     To train a model, run pre-resample, pre-config, pre-hubert, train.
     To infer a model, run infer.
     """
+    init_logger()
 
 
 @click.help_option("--help", "-h")

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
@@ -0,0 +1,234 @@
+from logging import getLogger
+from pathlib import Path
+
+import PySimpleGUI as sg
+import sounddevice as sd
+import soundfile as sf
+from pebble import ProcessPool
+
+from .__main__ import init_logger
+
+LOG = getLogger(__name__)
+
+init_logger()
+
+
+def play_audio(path: "Path | str"):
+    """Read an audio file and hand it to ``sounddevice`` for playback.
+
+    The annotation is a string on purpose: ``Path | str`` would be evaluated
+    when the function is defined and raises ``TypeError`` on Python versions
+    before 3.10, which would make the whole module fail to import.
+
+    Args:
+        path: Location of the audio file. A ``Path`` is converted to its
+            POSIX string form before being passed to ``soundfile.read``.
+    """
+    if isinstance(path, Path):
+        path = path.as_posix()
+    data, sr = sf.read(path)
+    sd.play(data, sr)
+
+
+def _common_inference_kwargs(values: dict) -> dict:
+    """Build the keyword arguments shared by file and realtime inference.
+
+    Args:
+        values: The values dictionary returned by ``window.read``.
+
+    Returns:
+        Keyword arguments accepted by both ``infer`` and ``realtime``.
+    """
+    cluster_model_path = values["cluster_model_path"]
+    return dict(
+        model_path=Path(values["model_path"]),
+        config_path=Path(values["config_path"]),
+        speaker=values["speaker"],
+        # An empty text field means "no cluster model".
+        cluster_model_path=Path(cluster_model_path) if cluster_model_path else None,
+        transpose=values["transpose"],
+        auto_predict_f0=values["auto_predict_f0"],
+        cluster_infer_ratio=values["cluster_infer_ratio"],
+        noise_scale=values["noise_scale"],
+        db_thresh=values["silence_threshold"],
+        pad_seconds=values["pad_seconds"],
+        device="cuda" if values["use_gpu"] else "cpu",
+    )
+
+
+def _refresh_speakers(window, config_path: str) -> None:
+    """Fill the speaker combo box from the config file, if it is a file.
+
+    Args:
+        window: The GUI window that owns the ``"speaker"`` element.
+        config_path: Text currently entered in the config path field.
+    """
+    from . import utils
+
+    # ``is_file`` rather than ``exists``: an empty field becomes ``Path(".")``,
+    # which exists but is a directory and cannot be loaded as a config.
+    if not Path(config_path).is_file():
+        return
+    hp = utils.get_hparams_from_file(config_path)
+    LOG.info(f"Loaded config from {config_path}")
+    window["speaker"].update(values=list(hp.__dict__["spk"].keys()), set_to_index=0)
+
+
+def main():
+    """Run the GUI: build the window and process events until it is closed.
+
+    File inference (``infer``) runs synchronously inside the event loop.
+    Audio playback and the realtime voice changer are scheduled on a
+    single-worker ``ProcessPool``; the handle of the realtime task is kept so
+    that it can be cancelled on restart, on stop, and when the window closes.
+    """
+    sg.theme("Dark")
+    model_candidates = sorted(Path("./logs/44k/").glob("G_*.pth"))
+    layout = [
+        [
+            sg.Text("Model path: "),
+            sg.InputText(
+                key="model_path",
+                # Default to the last checkpoint in sorted order, if any.
+                default_text=model_candidates[-1].as_posix()
+                if model_candidates
+                else "",
+            ),
+            sg.FileBrowse(
+                initial_folder="./logs/44k/" if Path("./logs/44k/").exists() else "."
+            ),
+        ],
+        [
+            sg.Text("Config path: "),
+            sg.InputText(
+                key="config_path",
+                default_text="./configs/44k/config.json",
+                # Emit an event on edit so the speaker list can be reloaded.
+                enable_events=True,
+            ),
+            sg.FileBrowse(
+                initial_folder="./configs/44k/"
+                if Path("./configs/44k/").exists()
+                else "."
+            ),
+        ],
+        [sg.Text("Speaker"), sg.Combo(values=[], key="speaker", size=(20, 1))],
+        [
+            sg.Text("Input audio path:"),
+            sg.InputText(key="input_path"),
+            sg.FileBrowse(initial_folder="."),
+            sg.Button("Play", key="play_input"),
+        ],
+        [
+            sg.Text("Silence threshold: "),
+            sg.Slider(
+                range=(-60.0, 0),
+                orientation="h",
+                key="silence_threshold",
+                default_value=-20,
+                resolution=0.1,
+            ),
+        ],
+        [
+            sg.Checkbox(
+                key="auto_predict_f0",
+                default=True,
+                text="Auto predict F0 (Pitch may become unstable when turned on in real-time inference.)",
+            )
+        ],
+        [
+            sg.Text("Pitch: "),
+            sg.Slider(
+                range=(-20, 20), orientation="h", key="transpose", default_value=0
+            ),
+        ],
+        [
+            sg.Text("Cluster infer ratio: "),
+            sg.Slider(
+                range=(0, 1.0),
+                orientation="h",
+                key="cluster_infer_ratio",
+                default_value=0,
+                resolution=0.01,
+            ),
+        ],
+        [
+            sg.Text("Cluster model path: "),
+            sg.InputText(key="cluster_model_path"),
+            sg.FileBrowse(),
+        ],
+        [
+            sg.Text("Noise scale: "),
+            sg.Slider(
+                range=(0.0, 1.0),
+                orientation="h",
+                key="noise_scale",
+                default_value=0.4,
+                resolution=0.01,
+            ),
+        ],
+        [
+            sg.Text("Pad seconds"),
+            sg.Slider(
+                range=(0.0, 1.0),
+                orientation="h",
+                key="pad_seconds",
+                default_value=0.1,
+                resolution=0.01,
+            ),
+        ],
+        [
+            sg.Text("Crossfade seconds"),
+            sg.Slider(
+                range=(0, 0.6),
+                orientation="h",
+                key="crossfade_seconds",
+                default_value=0.1,
+                resolution=0.001,
+            ),
+        ],
+        [
+            sg.Text("Block seconds"),
+            sg.Slider(
+                range=(0, 3.0),
+                orientation="h",
+                key="block_seconds",
+                default_value=1,
+                resolution=0.01,
+            ),
+        ],
+        [sg.Checkbox(key="use_gpu", default=True, text="Use GPU")],
+        [sg.Checkbox(key="auto_play", default=True, text="Auto play")],
+        [
+            sg.Button("Infer", key="infer"),
+            sg.Button("(Re)Start Voice Changer", key="start_vc"),
+            sg.Button("Stop Voice Changer", key="stop_vc"),
+        ],
+    ]
+
+    window = sg.Window(__name__.split(".")[0], layout)
+    try:
+        with ProcessPool(max_workers=1) as pool:
+            # Handle of the scheduled realtime voice changer task, if any.
+            future = None
+            while True:
+                # Wake up every 100 ms even when no GUI event has occurred.
+                event, values = window.read(100)
+                if event == sg.WIN_CLOSED:
+                    break
+
+                if event != sg.EVENT_TIMEOUT:
+                    LOG.info(f"Event: {event}, values: {values}")
+
+                # Reload the speaker list when the config path was edited or
+                # while no speaker is selected yet (at most once per tick).
+                if event == "config_path" or values["speaker"] == "":
+                    _refresh_speakers(window, values["config_path"])
+
+                if event == "infer":
+                    from .inference_main import infer
+
+                    input_path = Path(values["input_path"])
+                    if not input_path.is_file():
+                        # Keep the GUI alive instead of failing inside infer().
+                        LOG.warning(f"Input audio file not found: {input_path}")
+                        continue
+                    output_path = (
+                        input_path.parent / f"{input_path.stem}.out{input_path.suffix}"
+                    )
+                    infer(
+                        input_path=input_path,
+                        output_path=output_path,
+                        **_common_inference_kwargs(values),
+                    )
+                    if values["auto_play"]:
+                        pool.schedule(play_audio, args=[output_path])
+                elif event == "play_input":
+                    input_path = Path(values["input_path"])
+                    # ``is_file`` so that an empty field (``Path(".")``) is ignored.
+                    if input_path.is_file():
+                        pool.schedule(play_audio, args=[input_path])
+                elif event == "start_vc":
+                    from .inference_main import realtime
+
+                    if future:
+                        LOG.info("Canceling previous task")
+                        future.cancel()
+                    future = pool.schedule(
+                        realtime,
+                        kwargs=dict(
+                            crossfade_seconds=values["crossfade_seconds"],
+                            block_seconds=values["block_seconds"],
+                            **_common_inference_kwargs(values),
+                        ),
+                    )
+                elif event == "stop_vc":
+                    if future:
+                        future.cancel()
+                        future = None
+            # Window closed: cancel the voice changer before the pool exits.
+            if future:
+                future.cancel()
+    finally:
+        # Close the window even when the event loop raised.
+        window.close()