diff --git a/.github/actions/setup-uv/action.yml b/.github/actions/setup-uv/action.yml index 619b138fb2..88a73e8481 100644 --- a/.github/actions/setup-uv/action.yml +++ b/.github/actions/setup-uv/action.yml @@ -4,8 +4,9 @@ runs: using: 'composite' steps: - name: Install uv - uses: astral-sh/setup-uv@v3 + uses: astral-sh/setup-uv@v5 with: - version: "0.5.1" + version: "0.5.17" enable-cache: true cache-dependency-glob: "**/pyproject.toml" + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml deleted file mode 100644 index 4dc8c76c1a..0000000000 --- a/.github/workflows/integration-tests.yml +++ /dev/null @@ -1,82 +0,0 @@ -name: integration - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] - workflow_dispatch: - inputs: - trainer_branch: - description: "Branch of Trainer to test" - required: false - default: "main" - coqpit_branch: - description: "Branch of Coqpit to test" - required: false - default: "main" -jobs: - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["3.9", "3.12"] - subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] - steps: - - uses: actions/checkout@v4 - - name: Setup uv - uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - - name: Install Espeak - if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) - run: | - sudo apt-get update - sudo apt-get install espeak espeak-ng - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - make system-deps - - name: Install custom Trainer and/or Coqpit if requested - run: | - if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then - uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} - fi - if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then - uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} - fi - - name: Integration tests - run: | - resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then - resolution=lowest-direct - fi - uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} - - name: Upload coverage data - uses: actions/upload-artifact@v4 - with: - include-hidden-files: true - name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} - path: .coverage.* - if-no-files-found: ignore - coverage: - if: always() - needs: test - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Setup uv - uses: ./.github/actions/setup-uv - - uses: actions/download-artifact@v4 - with: - pattern: coverage-data-* - merge-multiple: true - - name: Combine coverage - run: | - uv python install - uvx coverage combine - uvx coverage html --skip-covered --skip-empty - uvx coverage report --format=markdown >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 1b7f44654c..ef74c60da6 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -46,8 +46,8 @@ jobs: steps: - uses: actions/download-artifact@v4 with: - path: dist - pattern: build + path: "dist/" + name: build - run: | ls -lh dist/ - name: Publish package distributions to 
PyPI diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index d1060f6be2..03426808cc 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -9,15 +9,9 @@ on: jobs: lint: runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Lint check run: make lint diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 576de150fd..fdacf0acc9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: unit +name: test on: push: @@ -17,19 +17,17 @@ on: required: false default: "main" jobs: - test: + unit: runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: [3.9, "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] subset: ["data_tests", "inference_tests", "test_aux", "test_text"] steps: - uses: actions/checkout@v4 - name: Setup uv uses: ./.github/actions/setup-uv - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - name: Install Espeak if: contains(fromJSON('["inference_tests", "test_text"]'), matrix.subset) run: | @@ -37,7 +35,6 @@ jobs: sudo apt-get install espeak espeak-ng - name: Install dependencies run: | - sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - name: Install custom Trainer and/or Coqpit if requested @@ -51,7 +48,7 @@ jobs: - name: Unit tests run: | resolution=highest - if [ "${{ matrix.python-version }}" == "3.9" ]; then + if [ "${{ matrix.python-version }}" == "3.10" ]; then resolution=lowest-direct fi uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} @@ -61,10 +58,90 @@ jobs: include-hidden-files: true name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} path: .coverage.* - if-no-files-found: ignore + integration: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.12"] + shard: [0, 1, 2, 3, 4] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Integration tests for shard ${{ matrix.shard }} + run: | + uv run pytest tests/integration --collect-only --quiet | grep "::" > integration_tests.txt + total_shards=5 + shard_tests=$(awk "NR % $total_shards == ${{ matrix.shard }}" integration_tests.txt) + resolution=highest + if [ "${{ matrix.python-version }}" == "3.10" ]; then + resolution=lowest-direct + fi + uv run --resolution=$resolution --extra languages coverage run -m pytest -x -v --durations=0 $shard_tests + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: 
coverage-data-integration-${{ matrix.shard }}-${{ matrix.python-version }} + path: .coverage.* + zoo: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + partition: ["0", "1", "2"] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Install Espeak + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Zoo tests + run: uv run --extra server --extra languages make test_zoo + env: + NUM_PARTITIONS: 3 + TEST_PARTITION: ${{ matrix.partition }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: true + name: coverage-data-zoo-${{ matrix.partition }} + path: .coverage.* coverage: if: always() - needs: test + needs: [unit, integration, zoo] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92f6f3ab3c..2f070ad085 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,16 +2,14 @@ repos: - repo: "https://github.com/pre-commit/pre-commit-hooks" rev: v5.0.0 hooks: + - id: check-json + files: "TTS/.models.json" - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - - repo: "https://github.com/psf/black" - rev: 24.2.0 - hooks: - - id: black - language_version: python3 - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.0 + rev: v0.9.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d4a8cf0090..5fe9421442 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,30 +11,25 @@ You can contribute not only with code but with bug reports, comments, questions, If you like to contribute code, squash a bug but if you don't know where to start, here are some pointers. -- [Development Road Map](https://github.com/coqui-ai/TTS/issues/378) - - You can pick something out of our road map. We keep the progess of the project in this simple issue thread. It has new model proposals or developmental updates etc. - - [Github Issues Tracker](https://github.com/idiap/coqui-ai-TTS/issues) This is a place to find feature requests, bugs. - Issues with the ```good first issue``` tag are good place for beginners to take on. - -- ✨**PR**✨ [pages](https://github.com/idiap/coqui-ai-TTS/pulls) with the ```🚀new version``` tag. - - We list all the target improvements for the next version. You can pick one of them and start contributing. + Issues with the ```good first issue``` tag are good place for beginners to + take on. Issues tagged with `help wanted` are suited for more experienced + outside contributors. - Also feel free to suggest new features, ideas and models. We're always open for new things. -## Call for sharing language models +## Call for sharing pretrained models If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). 
We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified. This model can be shared in two ways: 1. Share the model files with us and we serve them with the next 🐸 TTS release. 2. Upload your models on GDrive and share the link. -Models are served under `.models.json` file and any model is available under TTS CLI or Server end points. +Models are served under `.models.json` file and any model is available under TTS +CLI and Python API end points. Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930). @@ -93,7 +88,7 @@ curl -LsSf https://astral.sh/uv/install.sh | sh uv run make test_all # run all the tests, report all the errors ``` -9. Format your code. We use ```black``` for code formatting. +9. Format your code. We use ```ruff``` for code formatting. ```bash make style @@ -135,7 +130,8 @@ curl -LsSf https://astral.sh/uv/install.sh | sh 13. Let's discuss until it is perfect. 💪 - We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/idiap/coqui-ai-TTS/pulls]. + We might ask you for certain changes that would appear in the + [Github ✨**PR**✨'s page](https://github.com/idiap/coqui-ai-TTS/pulls). 14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version. @@ -143,9 +139,9 @@ curl -LsSf https://astral.sh/uv/install.sh | sh If you prefer working within a Docker container as your development environment, you can do the following: -1. Fork 🐸TTS[https://github.com/idiap/coqui-ai-TTS] by clicking the fork button at the top right corner of the project page. +1. Fork the 🐸TTS [Github repository](https://github.com/idiap/coqui-ai-TTS) by clicking the fork button at the top right corner of the page. -2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. +2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```. ```bash git clone git@github.com:/coqui-ai-TTS.git diff --git a/Makefile b/Makefile index 1d6867f5e8..da714e7b34 100644 --- a/Makefile +++ b/Makefile @@ -6,62 +6,46 @@ help: target_dirs := tests TTS notebooks recipes -test_all: ## run tests and don't stop on an error. - nose2 --with-coverage --coverage TTS tests - ./run_bash_tests.sh - test: ## run tests. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --durations=0 tests test_vocoder: ## run vocoder tests. - coverage run -m nose2 -F -v -B tests.vocoder_tests + coverage run -m pytest -x -v --durations=0 tests/vocoder_tests test_tts: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests - -test_tts2: ## run tts tests. - coverage run -m nose2 -F -v -B tests.tts_tests2 - -test_xtts: - coverage run -m nose2 -F -v -B tests.xtts_tests + coverage run -m pytest -x -v --durations=0 tests/tts_tests test_aux: ## run aux tests. - coverage run -m nose2 -F -v -B tests.aux_tests - ./run_bash_tests.sh + coverage run -m pytest -x -v --durations=0 tests/aux_tests -test_zoo0: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ - tests.zoo_tests.test_models.test_voice_conversion -test_zoo1: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 -test_zoo2: ## run zoo tests. - coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 +test_zoo: ## run zoo tests. 
+ coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_models.py + +test_zoo_big: ## run tests for models that are too big for CI. + coverage run -m pytest -x -v --durations=0 tests/zoo_tests/test_big_models.py inference_tests: ## run inference tests. - coverage run -m nose2 -F -v -B tests.inference_tests + coverage run -m pytest -x -v --durations=0 tests/inference_tests data_tests: ## run data tests. - coverage run -m nose2 -F -v -B tests.data_tests + coverage run -m pytest -x -v --durations=0 tests/data_tests test_text: ## run text tests. - coverage run -m nose2 -F -v -B tests.text_tests + coverage run -m pytest -x -v --durations=0 tests/text_tests test_failed: ## only run tests failed the last time. - coverage run -m nose2 -F -v -B tests + coverage run -m pytest -x -v --last-failed tests style: ## update code style. - uv run --only-dev black ${target_dirs} + uv run --only-dev ruff format ${target_dirs} lint: ## run linters. uv run --only-dev ruff check ${target_dirs} - uv run --only-dev black ${target_dirs} --check + uv run --only-dev ruff format ${target_dirs} --check system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev -build-docs: ## build the docs - cd docs && make clean && make build - install: ## install 🐸 TTS uv sync --all-extras @@ -70,4 +54,4 @@ install_dev: ## install 🐸 TTS for development. uv run pre-commit install docs: ## build the docs - $(MAKE) -C docs clean && $(MAKE) -C docs html + uv run --group docs $(MAKE) -C docs clean && uv run --group docs $(MAKE) -C docs html diff --git a/README.md b/README.md index 5ca825b6ba..db8868b26d 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,34 @@ +# -## 🐸Coqui TTS News -- 📣 Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts) -- 📣 Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. -- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board. -- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). -- 📣 ⓍTTS can now stream with <200ms latency. -- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html) -- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html) -- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. -## - - -**🐸TTS is a library for advanced Text-to-Speech generation.** +**🐸 Coqui TTS is a library for advanced Text-to-Speech generation.** 🚀 Pretrained models in +1100 languages. 🛠️ Tools for training new models and fine-tuning existing models in any language. 📚 Utilities for dataset analysis and curation. 
-______________________________________________________________________ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/coqui-tts)](https://pypi.org/project/coqui-tts/) [![License]()](https://opensource.org/licenses/MPL-2.0) -[![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://badge.fury.io/py/coqui-tts) +[![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://pypi.org/project/coqui-tts/) [![Downloads](https://pepy.tech/badge/coqui-tts)](https://pepy.tech/project/coqui-tts) [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440) - -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml/badge.svg) -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml/badge.svg) -![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml/badge.svg) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml) +[![GithubActions](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml/badge.svg)](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml) [![Docs]()](https://coqui-tts.readthedocs.io/en/latest/) -______________________________________________________________________ +## 📣 News +- **Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts)** +- 0.25.0: [OpenVoice](https://github.com/myshell-ai/OpenVoice) models now available for voice conversion. +- 0.24.2: Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. +- 0.20.0: XTTSv2 is here with 17 languages and better performance across the board. XTTS can stream with <200ms latency. +- 0.19.0: XTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). +- 0.14.1: You can use [Fairseq models in ~1100 languages](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. ## 💬 Where to ask questions Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it. @@ -64,70 +58,68 @@ repository are also still a useful source of information. | 🚀 **Released Models** | [Standard models](https://github.com/idiap/coqui-ai-TTS/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 languages](https://github.com/idiap/coqui-ai-TTS#example-text-to-speech-using-fairseq-models-in-1100-languages-)| ## Features -- High-performance Deep Learning models for Text2Speech tasks. See lists of models below. -- Fast and efficient model training. -- Detailed training logs on the terminal and Tensorboard. -- Support for Multi-speaker TTS. -- Efficient, flexible, lightweight but feature complete `Trainer API`. +- High-performance text-to-speech and voice conversion models, see list below. +- Fast and efficient model training with detailed training logs on the terminal and Tensorboard. +- Support for multi-speaker and multilingual TTS. 
- Released and ready-to-use models. -- Tools to curate Text2Speech datasets under```dataset_analysis```. -- Utilities to use and test your models. +- Tools to curate TTS datasets under ```dataset_analysis/```. +- Command line and Python APIs to use and test your models. - Modular (but not too much) code base enabling easy implementation of new ideas. ## Model Implementations ### Spectrogram models -- Tacotron: [paper](https://arxiv.org/abs/1703.10135) -- Tacotron2: [paper](https://arxiv.org/abs/1712.05884) -- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129) -- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802) -- Align-TTS: [paper](https://arxiv.org/abs/2003.01950) -- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf) -- FastSpeech: [paper](https://arxiv.org/abs/1905.09263) -- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558) -- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557) -- Capacitron: [paper](https://arxiv.org/abs/1906.03402) -- OverFlow: [paper](https://arxiv.org/abs/2211.06892) -- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320) -- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612) +- [Tacotron](https://arxiv.org/abs/1703.10135), [Tacotron2](https://arxiv.org/abs/1712.05884) +- [Glow-TTS](https://arxiv.org/abs/2005.11129), [SC-GlowTTS](https://arxiv.org/abs/2104.05557) +- [Speedy-Speech](https://arxiv.org/abs/2008.03802) +- [Align-TTS](https://arxiv.org/abs/2003.01950) +- [FastPitch](https://arxiv.org/pdf/2006.06873.pdf) +- [FastSpeech](https://arxiv.org/abs/1905.09263), [FastSpeech2](https://arxiv.org/abs/2006.04558) +- [Capacitron](https://arxiv.org/abs/1906.03402) +- [OverFlow](https://arxiv.org/abs/2211.06892) +- [Neural HMM TTS](https://arxiv.org/abs/2108.13320) +- [Delightful TTS](https://arxiv.org/abs/2110.12612) ### End-to-End Models -- ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts) -- VITS: [paper](https://arxiv.org/pdf/2106.06103) -- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418) -- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts) -- 🐶 Bark: [orig. 
repo](https://github.com/suno-ai/bark) - -### Attention Methods -- Guided Attention: [paper](https://arxiv.org/abs/1710.08969) -- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006) -- Graves Attention: [paper](https://arxiv.org/abs/1910.10288) -- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) -- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf) -- Alignment Network: [paper](https://arxiv.org/abs/2108.10447) - -### Speaker Encoder -- GE2E: [paper](https://arxiv.org/abs/1710.10467) -- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf) +- [XTTS](https://arxiv.org/abs/2406.04904) +- [VITS](https://arxiv.org/pdf/2106.06103) +- 🐸[YourTTS](https://arxiv.org/abs/2112.02418) +- 🐢[Tortoise](https://github.com/neonbjb/tortoise-tts) +- 🐶[Bark](https://github.com/suno-ai/bark) ### Vocoders -- MelGAN: [paper](https://arxiv.org/abs/1910.06711) -- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106) -- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480) -- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646) -- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/) -- WaveGrad: [paper](https://arxiv.org/abs/2009.00713) -- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646) -- UnivNet: [paper](https://arxiv.org/abs/2106.07889) +- [MelGAN](https://arxiv.org/abs/1910.06711) +- [MultiBandMelGAN](https://arxiv.org/abs/2005.05106) +- [ParallelWaveGAN](https://arxiv.org/abs/1910.11480) +- [GAN-TTS discriminators](https://arxiv.org/abs/1909.11646) +- [WaveRNN](https://github.com/fatchord/WaveRNN/) +- [WaveGrad](https://arxiv.org/abs/2009.00713) +- [HiFiGAN](https://arxiv.org/abs/2010.05646) +- [UnivNet](https://arxiv.org/abs/2106.07889) ### Voice Conversion -- FreeVC: [paper](https://arxiv.org/abs/2210.15418) +- [FreeVC](https://arxiv.org/abs/2210.15418) +- [kNN-VC](https://doi.org/10.21437/Interspeech.2023-419) +- [OpenVoice](https://arxiv.org/abs/2312.01479) + +### Others +- Attention methods: [Guided Attention](https://arxiv.org/abs/1710.08969), + [Forward Backward Decoding](https://arxiv.org/abs/1907.09006), + [Graves Attention](https://arxiv.org/abs/1910.10288), + [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/), + [Dynamic Convolutional Attention](https://arxiv.org/pdf/1910.10288.pdf), + [Alignment Network](https://arxiv.org/abs/2108.10447) +- Speaker encoders: [GE2E](https://arxiv.org/abs/1710.10467), + [Angular Loss](https://arxiv.org/pdf/2003.11982.pdf) You can also help us implement more models. + ## Installation -🐸TTS is tested on Ubuntu 22.04 with **python >= 3.9, < 3.13.**. -If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. +🐸TTS is tested on Ubuntu 24.04 with **python >= 3.10, < 3.13**, but should also +work on Mac and Windows. + +If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option. ```bash pip install coqui-tts @@ -165,24 +157,21 @@ pip install -e .[server,ja] ### Platforms -If you are on Ubuntu (Debian), you can also run following commands for installation. +If you are on Ubuntu (Debian), you can also run the following commands for installation. 
```bash -make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS. +make system-deps make install ``` -If you are on Windows, 👑@GuyPaddock wrote installation instructions -[here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system) -(note that these are out of date, e.g. you need to have at least Python 3.9). - + ## Docker Image -You can also try TTS without install with the docker image. -Simply run the following command and you will be able to run TTS without installing it. +You can also try out Coqui TTS without installation with the docker image. +Simply run the following command and you will be able to run TTS: ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server ``` @@ -193,10 +182,10 @@ More details about the docker images (like GPU support) can be found ## Synthesizing speech by 🐸TTS - + ### 🐍 Python API -#### Running a multi-speaker and multi-lingual model +#### Multi-speaker and multi-lingual model ```python import torch @@ -208,44 +197,67 @@ device = "cuda" if torch.cuda.is_available() else "cpu" # List available 🐸TTS models print(TTS().list_models()) -# Init TTS +# Initialize TTS tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) +# List speakers +print(tts.speakers) + # Run TTS -# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language -# Text to speech list of amplitude values as output -wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") -# Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") +# ❗ XTTS supports both, but many models allow only one of the `speaker` and +# `speaker_wav` arguments + +# TTS with list of amplitude values as output, clone the voice from `speaker_wav` +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) + +# TTS to a file, use a preset speaker +tts.tts_to_file( + text="Hello world!", + speaker="Craig Gutsy", + language="en", + file_path="output.wav" +) ``` -#### Running a single speaker model +#### Single speaker model ```python -# Init TTS with the target model name -tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device) +# Initialize TTS with the target model name +tts = TTS("tts_models/de/thorsten/tacotron2-DDC").to(device) # Run TTS tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) - -# Example voice cloning with YourTTS in English, French and Portuguese -tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device) -tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav") -tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav") ``` -#### Example voice conversion +#### Voice conversion (VC) -Converting the voice in `source_wav` to the voice of `target_wav` +Converting the voice in 
`source_wav` to the voice of `target_wav`: ```python -tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda") -tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) ``` -#### Example voice cloning together with the voice conversion model. -This way, you can clone voices by using any model in 🐸TTS. +Other available voice conversion models: +- `voice_conversion_models/multilingual/multi-dataset/knnvc` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +For more details, see the +[documentation](https://coqui-tts.readthedocs.io/en/latest/vc.html). + +#### Voice cloning by combining single speaker TTS model with the default VC model + +This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is +used for voice conversion after synthesizing speech. ```python @@ -257,7 +269,7 @@ tts.tts_with_vc_to_file( ) ``` -#### Example text to speech using **Fairseq models in ~1100 languages** 🤯. +#### TTS using Fairseq models in ~1100 languages 🤯 For Fairseq models, use the following name format: `tts_models//fairseq/vits`. You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms). @@ -271,145 +283,126 @@ api.tts_to_file( ) ``` -### Command-line `tts` +### Command-line interface `tts` -Synthesize speech on command line. +Synthesize speech on the command line. You can either use your trained model or choose a model from the provided list. -If you don't specify any models, then it uses LJSpeech based English model. - -#### Single Speaker Models - - List provided models: + ```sh + tts --list_models ``` - $ tts --list_models - ``` - -- Get model info (for both tts_models and vocoder_models): - - - Query by type/name: - The model_info_by_name uses the name as it from the --list_models. - ``` - $ tts --model_info_by_name "///" - ``` - For example: - ``` - $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts - $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 - ``` - - Query by type/idx: - The model_query_idx uses the corresponding idx from --list_models. - ``` - $ tts --model_info_by_idx "/" - ``` - - For example: - - ``` - $ tts --model_info_by_idx tts_models/3 - ``` +- Get model information. Use the names obtained from `--list_models`. 
+ ```sh + tts --model_info_by_name "///" + ``` + For example: + ```sh + tts --model_info_by_name tts_models/tr/common-voice/glow-tts + tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` - - Query info for model info by full name: - ``` - $ tts --model_info_by_name "///" - ``` +#### Single speaker models -- Run TTS with default models: +- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`): - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav ``` - Run TTS and pipe out the generated TTS wav file data: - ``` - $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ```sh + tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` - Run a TTS model with its default vocoder model: - ``` - $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "///" \ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "tts_models/en/ljspeech/glow-tts" \ + --out_path output/path/speech.wav ``` -- Run with specific TTS and vocoder models from the list: +- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model. - ``` - $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "///" \ + --vocoder_name "///" \ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_name "tts_models/en/ljspeech/glow-tts" \ + --vocoder_name "vocoder_models/en/ljspeech/univnet" \ + --out_path output/path/speech.wav ``` -- Run your own TTS model (Using Griffin-Lim Vocoder): +- Run your own TTS model (using Griffin-Lim Vocoder): - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \ + --model_path path/to/model.pth \ + --config_path path/to/config.json \ + --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json + ```sh + tts --text "Text for TTS" \ + --model_path path/to/model.pth \ + --config_path path/to/config.json \ + --out_path output/path/speech.wav \ + --vocoder_path path/to/vocoder.pth \ + --vocoder_config_path path/to/vocoder_config.json ``` -#### Multi-speaker Models +#### Multi-speaker models -- List the available speakers and choose a among them: +- List the available speakers and choose a `` among them: - ``` - $ tts --model_name "//" --list_speaker_idxs + ```sh + tts --model_name "//" --list_speaker_idxs ``` - Run the multi-speaker TTS model with the target speaker ID: - ``` - $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx + ```sh + tts --text "Text for TTS." 
--out_path output/path/speech.wav \ + --model_name "//" --speaker_idx ``` - Run your own multi-speaker TTS model: - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav \ + --model_path path/to/model.pth --config_path path/to/config.json \ + --speakers_file_path path/to/speaker.json --speaker_idx ``` -### Voice Conversion Models +#### Voice conversion models -``` -$ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav +```sh +tts --out_path output/path/speech.wav --model_name "//" \ + --source_wav --target_wav ``` - -## Directory Structure -``` -|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.) -|- utils/ (common utilities.) -|- TTS - |- bin/ (folder for all the executables.) - |- train*.py (train your target model.) - |- ... - |- tts/ (text to speech models) - |- layers/ (model layer definitions) - |- models/ (model definitions) - |- utils/ (model specific utilities.) - |- speaker_encoder/ (Speaker Encoder models.) - |- (same) - |- vocoder/ (Vocoder models.) - |- (same) -``` diff --git a/TTS/.models.json b/TTS/.models.json index 1a12e8c8a3..4cc3344167 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -803,6 +803,22 @@ "license": "apache 2.0" } }, + "librispeech100": { + "wavlm-hifigan": { + "description": "HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + }, + "wavlm-hifigan_prematched": { + "description": "Prematched HiFiGAN vocoder for WavLM features from kNN-VC", + "github_rls_url": "https://github.com/idiap/coqui-ai-TTS/releases/download/v0.25.2_models/vocoder_models--en--librispeech100--wavlm-hifigan_prematched.zip", + "commit": "cfba7e0", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT" + } + }, "ljspeech": { "multiband-melgan": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", @@ -943,10 +959,42 @@ "freevc24": { "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC", + "default_vocoder": null, "author": "Jing-Yi Li @OlaWod", "license": "MIT", "commit": null } + }, + "multi-dataset": { + "knnvc": { + "description": "kNN-VC model from https://github.com/bshall/knn-vc", + "default_vocoder": "vocoder_models/en/librispeech100/wavlm-hifigan_prematched", + "author": "Benjamin van Niekerk @bshall, Matthew Baas @RF5", + "license": "MIT", + "commit": null + }, + "openvoice_v1": { + "hf_url": [ + "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json", + "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth" + ], + "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, + "author": "MyShell.ai", + "license": "MIT", + "commit": null + }, + "openvoice_v2": { + "hf_url": [ + 
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/config.json", + "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth" + ], + "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "default_vocoder": null, + "author": "MyShell.ai", + "license": "MIT", + "commit": null + } } } } diff --git a/TTS/__init__.py b/TTS/__init__.py index 8e93c9b5db..d270e09e22 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -4,6 +4,15 @@ __version__ = importlib.metadata.version("coqui-tts") +if "coqpit" in importlib.metadata.packages_distributions().get("coqpit", []): + msg = ( + "coqui-tts switched to a forked version of Coqpit, but you still have the original " + "package installed. Run the following to avoid conflicts:\n" + " pip uninstall coqpit\n" + " pip install coqpit-config" + ) + raise ImportError(msg) + if is_pytorch_at_least_2_4(): import _codecs diff --git a/TTS/api.py b/TTS/api.py index 250ed1a0d9..3db1e25b11 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -1,3 +1,5 @@ +"""Coqui TTS Python API.""" + import logging import tempfile import warnings @@ -6,7 +8,6 @@ from torch import nn from TTS.config import load_config -from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer @@ -19,13 +20,19 @@ class TTS(nn.Module): def __init__( self, model_name: str = "", - model_path: str = None, - config_path: str = None, - vocoder_path: str = None, - vocoder_config_path: str = None, + *, + model_path: str | None = None, + config_path: str | None = None, + vocoder_name: str | None = None, + vocoder_path: str | None = None, + vocoder_config_path: str | None = None, + encoder_path: str | None = None, + encoder_config_path: str | None = None, + speakers_file_path: str | None = None, + language_ids_file_path: str | None = None, progress_bar: bool = True, - gpu=False, - ): + gpu: bool = False, + ) -> None: """🐸TTS python interface that allows to load and use the released models. 
Example with a multi-speaker model: @@ -35,66 +42,82 @@ def __init__( >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") Example with a single-speaker model: - >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) + >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") Example loading a model from a path: - >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) + >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") Example voice cloning with YourTTS in English, French and Portuguese: - >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) + >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda") >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav") >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav") >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav") Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html): - >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True) + >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False).to("cuda") >>> tts.tts_to_file("This is a test.", file_path="output.wav") Args: model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. model_path (str, optional): Path to the model checkpoint. Defaults to None. config_path (str, optional): Path to the model config. Defaults to None. + vocoder_name (str, optional): Pre-trained vocoder to use. Defaults to None, i.e. using the default vocoder. vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. - progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. - gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + encoder_path: Path to speaker encoder checkpoint. Default to None. + encoder_config_path: Path to speaker encoder config file. Defaults to None. + speakers_file_path: JSON file for multi-speaker model. Defaults to None. + language_ids_file_path: JSON file for multilingual model. Defaults to None + progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True. + gpu (bool, optional): Enable/disable GPU. Defaults to False. 
DEPRECATED, use TTS(...).to("cuda") """ super().__init__() self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar) self.config = load_config(config_path) if config_path else None - self.synthesizer = None - self.voice_converter = None + self.synthesizer: Synthesizer | None = None + self.voice_converter: Synthesizer | None = None self.model_name = "" + + self.vocoder_path = vocoder_path + self.vocoder_config_path = vocoder_config_path + self.encoder_path = encoder_path + self.encoder_config_path = encoder_config_path + self.speakers_file_path = speakers_file_path + self.language_ids_file_path = language_ids_file_path + if gpu: warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.") if model_name is not None and len(model_name) > 0: if "tts_models" in model_name: - self.load_tts_model_by_name(model_name, gpu) + self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) elif "voice_conversion_models" in model_name: - self.load_vc_model_by_name(model_name, gpu) + self.load_vc_model_by_name(model_name, vocoder_name, gpu=gpu) + # To allow just TTS("xtts") else: - self.load_model_by_name(model_name, gpu) + self.load_model_by_name(model_name, vocoder_name, gpu=gpu) if model_path: - self.load_tts_model_by_path( - model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu - ) + self.load_tts_model_by_path(model_path, config_path, gpu=gpu) @property - def models(self): + def models(self) -> list[str]: return self.manager.list_tts_models() @property - def is_multi_speaker(self): - if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager: + def is_multi_speaker(self) -> bool: + if ( + self.synthesizer is not None + and hasattr(self.synthesizer.tts_model, "speaker_manager") + and self.synthesizer.tts_model.speaker_manager + ): return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 return False @property - def is_multi_lingual(self): + def is_multi_lingual(self) -> bool: # Not sure what sets this to None, but applied a fix to prevent crashing. 
if ( isinstance(self.model_name, str) @@ -103,31 +126,37 @@ def is_multi_lingual(self): and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1) ): return True - if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: + if ( + self.synthesizer is not None + and hasattr(self.synthesizer.tts_model, "language_manager") + and self.synthesizer.tts_model.language_manager + ): return self.synthesizer.tts_model.language_manager.num_languages > 1 return False @property - def speakers(self): + def speakers(self) -> list[str]: if not self.is_multi_speaker: return None return self.synthesizer.tts_model.speaker_manager.speaker_names @property - def languages(self): + def languages(self) -> list[str]: if not self.is_multi_lingual: return None return self.synthesizer.tts_model.language_manager.language_names @staticmethod - def get_models_file_path(): + def get_models_file_path() -> Path: return Path(__file__).parent / ".models.json" @staticmethod - def list_models(): + def list_models() -> list[str]: return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models() - def download_model_by_name(self, model_name: str): + def download_model_by_name( + self, model_name: str, vocoder_name: str | None = None + ) -> tuple[Path | None, Path | None, Path | None, Path | None, Path | None]: model_path, config_path, model_item = self.manager.download_model(model_name) if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)): # return model directory if there are multiple files @@ -135,19 +164,27 @@ def download_model_by_name(self, model_name: str): return None, None, None, None, model_path if model_item.get("default_vocoder") is None: return model_path, config_path, None, None, None - vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"]) + if vocoder_name is None: + vocoder_name = model_item["default_vocoder"] + vocoder_path, vocoder_config_path = None, None + # A local vocoder model will take precedence if already specified in __init__ + if model_item["model_type"] == "tts_models": + vocoder_path = self.vocoder_path + vocoder_config_path = self.vocoder_config_path + if vocoder_path is None or vocoder_config_path is None: + vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name) return model_path, config_path, vocoder_path, vocoder_config_path, None - def load_model_by_name(self, model_name: str, gpu: bool = False): + def load_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the 🐸TTS models by name. Args: model_name (str): Model name to load. You can list models by ```tts.models```. gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ - self.load_tts_model_by_name(model_name, gpu) + self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu) - def load_vc_model_by_name(self, model_name: str, gpu: bool = False): + def load_vc_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of the voice conversion models by name. Args: @@ -155,10 +192,19 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False): gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. 
""" self.model_name = model_name - model_path, config_path, _, _, _ = self.download_model_by_name(model_name) - self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) + self.voice_converter = Synthesizer( + vc_checkpoint=model_path, + vc_config=config_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + model_dir=model_dir, + use_cuda=gpu, + ) - def load_tts_model_by_name(self, model_name: str, gpu: bool = False): + def load_tts_model_by_name(self, model_name: str, vocoder_name: str | None = None, *, gpu: bool = False) -> None: """Load one of 🐸TTS models by name. Args: @@ -170,7 +216,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name, vocoder_name + ) # init synthesizer # None values are fetch from the model @@ -181,15 +229,13 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): tts_languages_file=None, vocoder_checkpoint=vocoder_path, vocoder_config=vocoder_config_path, - encoder_checkpoint=None, - encoder_config=None, + encoder_checkpoint=self.encoder_path, + encoder_config=self.encoder_config_path, model_dir=model_dir, use_cuda=gpu, ) - def load_tts_model_by_path( - self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False - ): + def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool = False) -> None: """Load a model from a path. Args: @@ -203,22 +249,21 @@ def load_tts_model_by_path( self.synthesizer = Synthesizer( tts_checkpoint=model_path, tts_config_path=config_path, - tts_speakers_file=None, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config, - encoder_checkpoint=None, - encoder_config=None, + tts_speakers_file=self.speakers_file_path, + tts_languages_file=self.language_ids_file_path, + vocoder_checkpoint=self.vocoder_path, + vocoder_config=self.vocoder_config_path, + encoder_checkpoint=self.encoder_path, + encoder_config=self.encoder_config_path, use_cuda=gpu, ) def _check_arguments( self, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, **kwargs, ) -> None: """Check if the arguments are valid for the model.""" @@ -231,17 +276,16 @@ def _check_arguments( raise ValueError("Model is not multi-speaker but `speaker` is provided.") if not self.is_multi_lingual and language is not None: raise ValueError("Model is not multi-lingual but `language` is provided.") - if emotion is not None and speed is not None: - raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.") + if emotion is not None: + raise ValueError("Emotion can only be used with Coqui Studio models. 
Which is discontinued.") def tts( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = None, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, split_sentences: bool = True, **kwargs, ): @@ -260,9 +304,6 @@ def tts( Defaults to None. emotion (str, optional): Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None. - speed (float, optional): - Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0. - Defaults to None. split_sentences (bool, optional): Split text into sentences, synthesize them separately and concatenate the file audio. Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only @@ -270,18 +311,12 @@ def tts( kwargs (dict, optional): Additional arguments for the model. """ - self._check_arguments( - speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs - ) + self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, **kwargs) wav = self.synthesizer.tts( text=text, speaker_name=speaker, language_name=language, speaker_wav=speaker_wav, - reference_wav=None, - style_wav=None, - style_text=None, - reference_speaker_name=None, split_sentences=split_sentences, **kwargs, ) @@ -290,16 +325,15 @@ def tts( def tts_to_file( self, text: str, - speaker: str = None, - language: str = None, - speaker_wav: str = None, - emotion: str = None, - speed: float = 1.0, + speaker: str | None = None, + language: str | None = None, + speaker_wav: str | None = None, + emotion: str | None = None, pipe_out=None, file_path: str = "output.wav", split_sentences: bool = True, **kwargs, - ): + ) -> str: """Convert text to speech. Args: @@ -316,8 +350,6 @@ def tts_to_file( Defaults to None. emotion (str, optional): Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral". - speed (float, optional): - Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None. pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. file_path (str, optional): @@ -345,7 +377,7 @@ def tts_to_file( def voice_conversion( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], ): """Voice conversion with FreeVC. Convert source wav to target speaker. @@ -355,15 +387,18 @@ def voice_conversion( target_wav (str):` Path to the target wav file. """ - wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) - return wav + if self.voice_converter is None: + msg = "The selected model does not support voice conversion." + raise RuntimeError(msg) + return self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) def voice_conversion_to_file( self, source_wav: str, - target_wav: str, + target_wav: str | list[str], file_path: str = "output.wav", - ): + pipe_out=None, + ) -> str: """Voice conversion with FreeVC. Convert source wav to target speaker. Args: @@ -373,17 +408,20 @@ def voice_conversion_to_file( Path to the target wav file. file_path (str, optional): Output file path. Defaults to "output.wav". + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. 
""" wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav) - save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) return file_path def tts_with_vc( self, text: str, - language: str = None, - speaker_wav: str = None, - speaker: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], + speaker: str | None = None, split_sentences: bool = True, ): """Convert text to speech with voice conversion. @@ -423,12 +461,14 @@ def tts_with_vc( def tts_with_vc_to_file( self, text: str, - language: str = None, - speaker_wav: str = None, + *, + language: str | None = None, + speaker_wav: str | list[str], file_path: str = "output.wav", - speaker: str = None, + speaker: str | None = None, split_sentences: bool = True, - ): + pipe_out=None, + ) -> str: """Convert text to speech with voice conversion and save to file. Check `tts_with_vc` for more details. @@ -451,8 +491,11 @@ def tts_with_vc_to_file( Split text into sentences, synthesize them separately and concatenate the file audio. Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only applicable to the 🐸TTS models. Defaults to True. + pipe_out (BytesIO, optional): + Flag to stdout the generated TTS wav file for shell pipe. """ wav = self.tts_with_vc( text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences ) - save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) + self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) + return file_path diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 127199186b..8d7a2633a0 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -2,6 +2,7 @@ import importlib import logging import os +import sys from argparse import RawTextHelpFormatter import numpy as np @@ -18,7 +19,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # pylint: disable=bad-option-value parser = argparse.ArgumentParser( @@ -80,7 +81,7 @@ num_chars = len(phonemes) if C.use_phonemes else len(symbols) # TODO: handle multi-speaker model = setup_model(C) - model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True) + model, _ = load_checkpoint(model, args.model_path, use_cuda=args.use_cuda, eval=True) # data loader preprocessor = importlib.import_module("TTS.tts.datasets.formatters") @@ -112,7 +113,7 @@ # compute attentions file_paths = [] - with torch.no_grad(): + with torch.inference_mode(): for data in tqdm(loader): # setup input data text_input = data[0] diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 1bdb8d733c..d450e26fba 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -1,6 +1,7 @@ import argparse import logging import os +import sys from argparse import RawTextHelpFormatter import torch @@ -14,6 +15,88 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each audio file in a dataset and store them 
keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" + """ + Example runs: + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json + + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument( + "--model_path", + type=str, + help="Path to model checkpoint file. It defaults to the released speaker encoder.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to model config file. It defaults to the released speaker encoder config.", + default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", + ) + parser.add_argument( + "--config_dataset_path", + type=str, + help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", + default=None, + ) + parser.add_argument( + "--output_path", + type=str, + help="Path for output `pth` or `json` file.", + default="speakers.pth", + ) + parser.add_argument( + "--old_file", + type=str, + help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", + default=None, + ) + parser.add_argument( + "--old_append", + help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", + default=False, + action="store_true", + ) + parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) + parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") + parser.add_argument( + "--formatter_name", + type=str, + help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_name", + type=str, + help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--dataset_path", + type=str, + help="Path to the dataset. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_train", + type=str, + help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_val", + type=str, + help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. 
You either need to provide this or `config_dataset_path`", + default=None, + ) + return parser.parse_args() + + def compute_embeddings( model_path, config_path, @@ -101,88 +184,9 @@ def compute_embeddings( print("Speaker embeddings saved at:", mapping_file_path) -if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser( - description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n""" - """ - Example runs: - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json - - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv - """, - formatter_class=RawTextHelpFormatter, - ) - parser.add_argument( - "--model_path", - type=str, - help="Path to model checkpoint file. It defaults to the released speaker encoder.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to model config file. It defaults to the released speaker encoder config.", - default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json", - ) - parser.add_argument( - "--config_dataset_path", - type=str, - help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.", - default=None, - ) - parser.add_argument( - "--output_path", - type=str, - help="Path for output `pth` or `json` file.", - default="speakers.pth", - ) - parser.add_argument( - "--old_file", - type=str, - help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.", - default=None, - ) - parser.add_argument( - "--old_append", - help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False", - default=False, - action="store_true", - ) - parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False) - parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true") - parser.add_argument( - "--formatter_name", - type=str, - help="Name of the formatter to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_name", - type=str, - help="Name of the dataset to use. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--dataset_path", - type=str, - help="Path to the dataset. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_train", - type=str, - help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", - default=None, - ) - parser.add_argument( - "--meta_file_val", - type=str, - help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. 
You either need to provide this or `config_dataset_path`", - default=None, - ) - args = parser.parse_args() +def main(arg_list: list[str] | None = None): + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) compute_embeddings( args.model_path, @@ -199,3 +203,7 @@ def compute_embeddings( disable_cuda=args.disable_cuda, no_eval=args.no_eval, ) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index dc5423a691..1da7a092fb 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- import argparse import glob import logging import os +import sys import numpy as np from tqdm import tqdm @@ -16,10 +16,7 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def main(): - """Run preprocessing process.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - +def parse_args(arg_list: list[str] | None) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") parser.add_argument("out_path", type=str, help="save path (directory and filename).") @@ -29,7 +26,13 @@ def main(): required=False, help="folder including the target set of wavs overriding dataset config.", ) - args, overrides = parser.parse_known_args() + return parser.parse_known_args(arg_list) + + +def main(arg_list: list[str] | None = None): + """Run preprocessing process.""" + setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter()) + args, overrides = parse_args(arg_list) CONFIG = load_config(args.config_path) CONFIG.parse_known_args(overrides, relaxed_parser=True) @@ -94,6 +97,7 @@ def main(): stats["audio_config"] = CONFIG.audio.to_dict() np.save(output_file_path, stats, allow_pickle=True) print(f" > stats saved to {output_file_path}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index 711c8221db..701c7d8e82 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -1,5 +1,6 @@ import argparse import logging +import sys from argparse import RawTextHelpFormatter import torch @@ -53,7 +54,7 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser( description="""Compute the accuracy of the encoder.\n\n""" diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 86a4dce177..be9387f015 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -3,7 +3,8 @@ import argparse import logging -import os +import sys +from pathlib import Path import numpy as np import torch @@ -12,8 +13,10 @@ from trainer.generic_utils import count_parameters from TTS.config import load_config +from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from 
TTS.utils.audio import AudioProcessor @@ -23,56 +26,66 @@ use_cuda = torch.cuda.is_available() -def setup_loader(ap, r): - tokenizer, _ = TTSTokenizer.init_from_config(c) +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) + parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) + parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True) + parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") + parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") + parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") + parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) + return parser.parse_args(arg_list) + + +def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager: SpeakerManager, samples) -> DataLoader: + tokenizer, _ = TTSTokenizer.init_from_config(config) dataset = TTSDataset( outputs_per_step=r, compute_linear_spec=False, - samples=meta_data, + samples=samples, tokenizer=tokenizer, ap=ap, batch_group_size=0, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - phoneme_cache_path=c.phoneme_cache_path, + min_text_len=config.min_text_len, + max_text_len=config.max_text_len, + min_audio_len=config.min_audio_len, + max_audio_len=config.max_audio_len, + phoneme_cache_path=config.phoneme_cache_path, precompute_num_workers=0, use_noise_augment=False, - speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.name_to_id if config.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if config.use_d_vector_file else None, ) - if c.use_phonemes and c.compute_input_seq_cache: + if config.use_phonemes and config.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. 
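With `parse_args()` split out and `main()` taking an explicit argument list, the spectrogram-extraction script can be driven programmatically instead of via `sys.argv`. A minimal sketch, assuming the module imports as `TTS.bin.extract_tts_spectrograms`; all paths are placeholders:

```python
# Minimal sketch with placeholder paths. main() parses the list itself and
# finishes with sys.exit(0), so catch SystemExit when embedding it in a test.
from TTS.bin.extract_tts_spectrograms import main

try:
    main([
        "--config_path", "path/to/config.json",
        "--checkpoint_path", "path/to/checkpoint.pth",
        "--output_path", "output/specs",
        "--no-eval",  # BooleanOptionalAction counterpart of --eval
    ])
except SystemExit as exc:
    assert exc.code == 0
```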
- dataset.compute_input_seq(c.num_loader_workers) + dataset.compute_input_seq(config.num_loader_workers) dataset.preprocess_samples() - loader = DataLoader( + return DataLoader( dataset, - batch_size=c.batch_size, + batch_size=config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=None, - num_workers=c.num_loader_workers, + num_workers=config.num_loader_workers, pin_memory=False, ) - return loader -def set_filename(wav_path, out_path): - wav_file = os.path.basename(wav_path) - file_name = wav_file.split(".")[0] - os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) - os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True) - os.makedirs(os.path.join(out_path, "wav"), exist_ok=True) - wavq_path = os.path.join(out_path, "quant", file_name) - mel_path = os.path.join(out_path, "mel", file_name) - wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav") - wav_path = os.path.join(out_path, "wav", file_name + ".wav") - return file_name, wavq_path, mel_path, wav_gl_path, wav_path +def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]: + wav_name = Path(wav_path).stem + (out_path / "quant").mkdir(exist_ok=True, parents=True) + (out_path / "mel").mkdir(exist_ok=True, parents=True) + (out_path / "wav_gl").mkdir(exist_ok=True, parents=True) + (out_path / "wav").mkdir(exist_ok=True, parents=True) + wavq_path = out_path / "quant" / wav_name + mel_path = out_path / "mel" / wav_name + wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav" + out_wav_path = out_path / "wav" / f"{wav_name}.wav" + return wavq_path, mel_path, wav_gl_path, out_wav_path def format_data(data): @@ -114,18 +127,18 @@ def format_data(data): ) -@torch.no_grad() +@torch.inference_mode() def inference( - model_name, - model, - ap, + model_name: str, + model: BaseTTS, + ap: AudioProcessor, text_input, text_lengths, mel_input, mel_lengths, speaker_ids=None, d_vectors=None, -): +) -> np.ndarray: if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: @@ -140,9 +153,9 @@ def inference( aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}, ) model_output = outputs["model_outputs"] - model_output = model_output.detach().cpu().numpy() + return model_output.detach().cpu().numpy() - elif "tacotron" in model_name: + if "tacotron" in model_name: aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] @@ -153,16 +166,24 @@ def inference( for b in range(postnet_outputs.shape[0]): postnet_output = postnet_outputs[b] mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T)) - model_output = torch.stack(mel_specs).cpu().numpy() - - elif model_name == "tacotron2": - model_output = postnet_outputs.detach().cpu().numpy() - return model_output + return torch.stack(mel_specs).cpu().numpy() + if model_name == "tacotron2": + return postnet_outputs.detach().cpu().numpy() + msg = f"Model not supported: {model_name}" + raise ValueError(msg) def extract_spectrograms( - data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt" -): + model_name: str, + data_loader: DataLoader, + model: BaseTTS, + ap: AudioProcessor, + output_path: Path, + quantize_bits: int = 0, + save_audio: bool = False, + debug: bool = False, + metadata_name: str = "metadata.txt", +) -> None: model.eval() export_metadata = [] for 
_, data in tqdm(enumerate(data_loader), total=len(data_loader)): @@ -181,7 +202,7 @@ def extract_spectrograms( ) = format_data(data) model_output = inference( - c.model.lower(), + model_name, model, ap, text_input, @@ -195,7 +216,7 @@ def extract_spectrograms( for idx in range(text_input.shape[0]): wav_file_path = item_idx[idx] wav = ap.load_wav(wav_file_path) - _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) + wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path) # quantize and save wav if quantize_bits > 0: @@ -217,74 +238,67 @@ def extract_spectrograms( wav = ap.inv_melspectrogram(mel) ap.save_wav(wav, wav_gl_path) - with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f: + with (output_path / metadata_name).open("w") as f: for data in export_metadata: - f.write(f"{data[0]}|{data[1]+'.npy'}\n") + f.write(f"{data[0] / data[1]}.npy\n") -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data, speaker_manager +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + args = parse_args(arg_list) + config = load_config(args.config_path) + config.audio.trim_silence = False # Audio processor - ap = AudioProcessor(**c.audio) + ap = AudioProcessor(**config.audio) # load data instances meta_data_train, meta_data_eval = load_tts_samples( - c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=args.eval, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) # use eval and training partitions meta_data = meta_data_train + meta_data_eval # init speaker manager - if c.use_speaker_embedding: + if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif c.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) else: speaker_manager = None # setup model - model = setup_model(c) + model = setup_model(config) # restore model - model.load_checkpoint(c, args.checkpoint_path, eval=True) + model.load_checkpoint(config, args.checkpoint_path, eval=True) if use_cuda: model.cuda() num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) # set r - r = 1 if c.model.lower() == "glow_tts" else model.decoder.r - own_loader = setup_loader(ap, r) + r = 1 if config.model.lower() == "glow_tts" else model.decoder.r + own_loader = setup_loader(config, ap, r, speaker_manager, meta_data) extract_spectrograms( + config.model.lower(), own_loader, model, ap, - args.output_path, + Path(args.output_path), quantize_bits=args.quantize_bits, save_audio=args.save_audio, debug=args.debug, - metada_name="metada.txt", + metadata_name="metadata.txt", ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - parser = argparse.ArgumentParser() - parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True) - parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True) - parser.add_argument("--output_path", type=str, help="Path to 
save mel specs", required=True) - parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug") - parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files") - parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero") - parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True) - args = parser.parse_args() - - c = load_config(args.config_path) - c.audio.trim_silence = False - main(args) + main() diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 0519d43769..7a7fdf5dd4 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -2,6 +2,7 @@ import argparse import logging +import sys from argparse import RawTextHelpFormatter from TTS.config import load_config @@ -10,7 +11,7 @@ def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # pylint: disable=bad-option-value parser = argparse.ArgumentParser( diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index d99acb9893..40afa1456c 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,8 +1,9 @@ -"""Find all the unique characters in a dataset""" +"""Find all the unique characters in a dataset.""" import argparse import logging import multiprocessing +import sys from argparse import RawTextHelpFormatter from tqdm.contrib.concurrent import process_map @@ -13,18 +14,13 @@ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger -def compute_phonemes(item): +def compute_phonemes(item: dict) -> set[str]: text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") return set(ph) -def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - # pylint: disable=W0601 - global c, phonemizer - # pylint: disable=bad-option-value +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" """ @@ -35,13 +31,21 @@ def main(): formatter_class=RawTextHelpFormatter, ) parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) - args = parser.parse_args() + return parser.parse_args(arg_list) + - c = load_config(args.config_path) +def main(arg_list: list[str] | None = None) -> None: + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + global phonemizer + args = parse_args(arg_list) + config = load_config(args.config_path) # load all datasets train_items, eval_items = load_tts_samples( - c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size + config.datasets, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, ) items = train_items + eval_items print("Num items:", len(items)) @@ -49,13 +53,16 @@ def main(): language_list = [item["language"] for item in items] is_lang_def = all(language_list) - if not c.phoneme_language or not is_lang_def: - raise ValueError("Phoneme language must be defined in config.") + if not config.phoneme_language or not is_lang_def: + msg = "Phoneme language must be defined in config." 
+ raise ValueError(msg) - if not language_list.count(language_list[0]) == len(language_list): - raise ValueError( - "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" + if language_list.count(language_list[0]) != len(language_list): + msg = ( + "Currently, just one phoneme language per config file is supported !! " + "Please split the dataset config into different configs and run it individually for each language !!" ) + raise ValueError(msg) phonemizer = Gruut(language=language_list[0], keep_puncs=True) @@ -73,6 +80,7 @@ def main(): print(f" > Unique phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index edab882db8..f9121d7f77 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -4,6 +4,7 @@ import multiprocessing import os import pathlib +import sys import torch from tqdm import tqdm @@ -77,7 +78,7 @@ def preprocess_audios(): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser( description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end" diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 20e429df04..00d7530427 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -9,135 +9,132 @@ from argparse import RawTextHelpFormatter # pylint: disable=redefined-outer-name, unused-argument -from pathlib import Path - from TTS.utils.generic_utils import ConsoleFormatter, setup_logger logger = logging.getLogger(__name__) description = """ -Synthesize speech on command line. +Synthesize speech on the command line. You can either use your trained model or choose a model from the provided list. -If you don't specify any models, then it uses LJSpeech based English model. - -#### Single Speaker Models - - List provided models: + ```sh + tts --list_models ``` - $ tts --list_models - ``` - -- Get model info (for both tts_models and vocoder_models): - - - Query by type/name: - The model_info_by_name uses the name as it from the --list_models. - ``` - $ tts --model_info_by_name "///" - ``` - For example: - ``` - $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts - $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 - ``` - - Query by type/idx: - The model_query_idx uses the corresponding idx from --list_models. - - ``` - $ tts --model_info_by_idx "/" - ``` - - For example: - ``` - $ tts --model_info_by_idx tts_models/3 - ``` +- Get model information. Use the names obtained from `--list_models`. 
+ ```sh + tts --model_info_by_name "///" + ``` + For example: + ```sh + tts --model_info_by_name tts_models/tr/common-voice/glow-tts + tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` - - Query info for model info by full name: - ``` - $ tts --model_info_by_name "///" - ``` +#### Single speaker models -- Run TTS with default models: +- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`): - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav ``` - Run TTS and pipe out the generated TTS wav file data: - ``` - $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay + ```sh + tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay ``` - Run a TTS model with its default vocoder model: - ``` - $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "///" \\ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "tts_models/en/ljspeech/glow-tts" \\ + --out_path output/path/speech.wav ``` -- Run with specific TTS and vocoder models from the list: +- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model. - ``` - $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "///" \\ + --vocoder_name "///" \\ + --out_path output/path/speech.wav ``` For example: - ``` - $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_name "tts_models/en/ljspeech/glow-tts" \\ + --vocoder_name "vocoder_models/en/ljspeech/univnet" \\ + --out_path output/path/speech.wav ``` -- Run your own TTS model (Using Griffin-Lim Vocoder): +- Run your own TTS model (using Griffin-Lim Vocoder): - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav + ```sh + tts --text "Text for TTS" \\ + --model_path path/to/model.pth \\ + --config_path path/to/config.json \\ + --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: - ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json + ```sh + tts --text "Text for TTS" \\ + --model_path path/to/model.pth \\ + --config_path path/to/config.json \\ + --out_path output/path/speech.wav \\ + --vocoder_path path/to/vocoder.pth \\ + --vocoder_config_path path/to/vocoder_config.json ``` -#### Multi-speaker Models +#### Multi-speaker models -- List the available speakers and choose a among them: +- List the available speakers and choose a `` among them: - ``` - $ tts --model_name "//" --list_speaker_idxs + ```sh + tts --model_name "//" --list_speaker_idxs ``` - Run the multi-speaker TTS model with the target speaker ID: - ``` - $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx + ```sh + tts --text "Text for TTS." 
--out_path output/path/speech.wav \\ + --model_name "//" --speaker_idx ``` - Run your own multi-speaker TTS model: - ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx + ```sh + tts --text "Text for TTS" --out_path output/path/speech.wav \\ + --model_path path/to/model.pth --config_path path/to/config.json \\ + --speakers_file_path path/to/speaker.json --speaker_idx ``` -### Voice Conversion Models +#### Voice conversion models -``` -$ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav +```sh +tts --out_path output/path/speech.wav --model_name "//" \\ + --source_wav --target_wav ``` """ -def parse_args() -> argparse.Namespace: +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: """Parse arguments.""" parser = argparse.ArgumentParser( description=description.replace(" ```\n", ""), @@ -253,11 +250,6 @@ def parse_args() -> argparse.Namespace: action="store_true", ) # aux args - parser.add_argument( - "--save_spectogram", - action="store_true", - help="Save raw spectogram for further (vocoder) processing in out_path.", - ) parser.add_argument( "--reference_wav", type=str, @@ -282,13 +274,14 @@ def parse_args() -> argparse.Namespace: "--source_wav", type=str, default=None, - help="Original audio file to convert in the voice of the target_wav", + help="Original audio file to convert into the voice of the target_wav", ) parser.add_argument( "--target_wav", type=str, + nargs="*", default=None, - help="Target audio file to convert in the voice of the source_wav", + help="Audio file(s) of the target voice into which to convert the source_wav", ) parser.add_argument( @@ -298,7 +291,7 @@ def parse_args() -> argparse.Namespace: help="Voice dir for tortoise model", ) - args = parser.parse_args() + args = parser.parse_args(arg_list) # print the description if either text or list_models is not set check_args = [ @@ -317,20 +310,21 @@ def parse_args() -> argparse.Namespace: return args -def main(): - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - args = parse_args() +def main(arg_list: list[str] | None = None) -> None: + """Entry point for `tts` command line interface.""" + args = parse_args(arg_list) + stream = sys.stderr if args.pipe_out else sys.stdout + setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter()) pipe_out = sys.stdout if args.pipe_out else None with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): # Late-import to make things load faster + from TTS.api import TTS from TTS.utils.manage import ModelManager - from TTS.utils.synthesizer import Synthesizer # load model manager - path = Path(__file__).parent / "../.models.json" - manager = ModelManager(path, progress_bar=args.progress_bar) + manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=args.progress_bar) tts_path = None tts_config_path = None @@ -344,142 +338,100 @@ def main(): vc_config_path = None model_dir = None - # CASE1 #list : list pre-trained TTS models + # 1) List pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() + sys.exit(0) - # CASE2 #info : model info for pre-trained TTS models + # 2) Info about pre-trained TTS models (without loading a model) if args.model_info_by_idx: model_query = args.model_info_by_idx manager.model_info_by_idx(model_query) - sys.exit() + sys.exit(0) if args.model_info_by_name: 
model_query_full_name = args.model_info_by_name manager.model_info_by_full_name(model_query_full_name) - sys.exit() - - # CASE3: load pre-trained model paths - if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - # tts model - if model_item["model_type"] == "tts_models": - tts_path = model_path - tts_config_path = config_path - if args.vocoder_name is None and "default_vocoder" in model_item: - args.vocoder_name = model_item["default_vocoder"] - - # voice conversion model - if model_item["model_type"] == "voice_conversion_models": - vc_path = model_path - vc_config_path = config_path - - # tts model with multiple files to be loaded from the directory path - if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): - model_dir = model_path - tts_path = None - tts_config_path = None - args.vocoder_name = None - - # load vocoder - if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - - # CASE4: set custom model paths - if args.model_path is not None: - tts_path = args.model_path - tts_config_path = args.config_path - speakers_file_path = args.speakers_file_path - language_ids_file_path = args.language_ids_file_path - - if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - - if args.encoder_path is not None: - encoder_path = args.encoder_path - encoder_config_path = args.encoder_config_path + sys.exit(0) + # 3) Load a model for further info or TTS/VC device = args.device if args.use_cuda: device = "cuda" - - # load models - synthesizer = Synthesizer( - tts_path, - tts_config_path, - speakers_file_path, - language_ids_file_path, - vocoder_path, - vocoder_config_path, - encoder_path, - encoder_config_path, - vc_path, - vc_config_path, - model_dir, - args.voice_dir, + # A local model will take precedence if specified via modeL_path + model_name = args.model_name if args.model_path is None else None + api = TTS( + model_name=model_name, + model_path=args.model_path, + config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + encoder_path=args.encoder_path, + encoder_config_path=args.encoder_config_path, + speakers_file_path=args.speakers_file_path, + language_ids_file_path=args.language_ids_file_path, + progress_bar=args.progress_bar, ).to(device) # query speaker ids of a multi-speaker model. if args.list_speaker_idxs: - if synthesizer.tts_model.speaker_manager is None: + if not api.is_multi_speaker: logger.info("Model only has a single speaker.") - return + sys.exit(0) logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - logger.info(list(synthesizer.tts_model.speaker_manager.name_to_id.keys())) - return + logger.info(api.speakers) + sys.exit(0) # query langauge ids of a multi-lingual model. if args.list_language_idxs: - if synthesizer.tts_model.language_manager is None: + if not api.is_multi_lingual: logger.info("Monolingual model.") - return + sys.exit(0) logger.info( "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) - logger.info(synthesizer.tts_model.language_manager.name_to_id) - return + logger.info(api.languages) + sys.exit(0) # check the arguments against a multi-speaker model. 
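For reference, the code path the CLI now delegates to is the high-level `TTS` API instantiated above; the model name below is only an example of a multi-speaker model and is downloaded on first use:

```python
# Hedged sketch of the API the refactored CLI wraps; the model name is an example.
from TTS.api import TTS

api = TTS(model_name="tts_models/en/vctk/vits").to("cpu")
if api.is_multi_speaker:
    print(api.speakers)  # the same list the CLI prints for --list_speaker_idxs
    api.tts_to_file(
        text="Text for TTS",
        speaker=api.speakers[0],
        file_path="output/path/speech.wav",
    )
```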
- if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav): logger.error( "Looks like you use a multi-speaker model. Define `--speaker_idx` to " "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." ) - return + sys.exit(1) # RUN THE SYNTHESIS if args.text: logger.info("Text: %s", args.text) - # kick it - if tts_path is not None: - wav = synthesizer.tts( - args.text, - speaker_name=args.speaker_idx, - language_name=args.language_idx, + if args.text is not None: + api.tts_to_file( + text=args.text, + speaker=args.speaker_idx, + language=args.language_idx, speaker_wav=args.speaker_wav, + pipe_out=pipe_out, + file_path=args.out_path, reference_wav=args.reference_wav, style_wav=args.capacitron_style_wav, style_text=args.capacitron_style_text, reference_speaker_name=args.reference_speaker_idx, + voice_dir=args.voice_dir, ) - elif vc_path is not None: - wav = synthesizer.voice_conversion( + logger.info("Saved TTS output to %s", args.out_path) + elif args.source_wav is not None and args.target_wav is not None: + api.voice_conversion_to_file( source_wav=args.source_wav, target_wav=args.target_wav, + file_path=args.out_path, + pipe_out=pipe_out, ) - elif model_dir is not None: - wav = synthesizer.tts( - args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav - ) - - # save the results - synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out) - logger.info("Saved output to %s", args.out_path) + logger.info("Saved VC output to %s", args.out_path) + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index ba03c42b6d..06189a44c3 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -1,25 +1,30 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- + +# TODO: use Trainer import logging import os import sys import time -import traceback import warnings +from dataclasses import dataclass, field import torch from torch.utils.data import DataLoader -from trainer.generic_utils import count_parameters, remove_experiment_folder -from trainer.io import copy_model_files, save_best_model, save_checkpoint +from trainer import TrainerArgs, TrainerConfig +from trainer.generic_utils import count_parameters, get_experiment_folder_path, get_git_branch +from trainer.io import copy_model_files, get_last_checkpoint, save_best_model, save_checkpoint +from trainer.logging import BaseDashboardLogger, ConsoleLogger, logger_factory from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer +from TTS.config import load_config, register_config +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig from TTS.encoder.dataset import EncoderDataset from TTS.encoder.utils.generic_utils import setup_encoder_model -from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.text.characters import parse_symbols from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.samplers import PerfectBatchSampler @@ -34,7 +39,77 @@ print(" > Number of GPUs: ", num_gpus) -def setup_loader(ap: AudioProcessor, is_val: bool = False): +@dataclass +class TrainArgs(TrainerArgs): + config_path: str | None = field(default=None, metadata={"help": "Path to the config 
file."}) + + +def process_args( + args, config: BaseEncoderConfig | None = None +) -> tuple[BaseEncoderConfig, str, str, ConsoleLogger, BaseDashboardLogger | None]: + """Process parsed comand line arguments and initialize the config if not provided. + Args: + args (argparse.Namespace or dict like): Parsed input arguments. + config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. + Returns: + c (Coqpit): Config paramaters. + out_path (str): Path to save models and logging. + audio_path (str): Path to save generated test audios. + c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does + logging to the console. + dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging + TODO: + - Interactive config definition. + """ + coqpit_overrides = None + if isinstance(args, tuple): + args, coqpit_overrides = args + if args.continue_path: + # continue a previous training from its output folder + experiment_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + args.restore_path, best_model = get_last_checkpoint(args.continue_path) + if not args.best_path: + args.best_path = best_model + # init config if not already defined + if config is None: + if args.config_path: + # init from a file + config = load_config(args.config_path) + else: + # init from console args + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel + + config_base = BaseTrainingConfig() + config_base.parse_known_args(coqpit_overrides) + config = register_config(config_base.model)() + # override values from command-line args + config.parse_known_args(coqpit_overrides, relaxed_parser=True) + experiment_path = args.continue_path + if not experiment_path: + experiment_path = get_experiment_folder_path(config.output_path, config.run_name) + audio_path = os.path.join(experiment_path, "test_audios") + config.output_log_path = experiment_path + # setup rank 0 process in distributed training + dashboard_logger = None + if args.rank == 0: + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + # if model characters are not set in the config file + # save the default set to the config file for future + # compatibility. 
+ if config.has("characters") and config.characters is None: + used_characters = parse_symbols() + new_fields["characters"] = used_characters + copy_model_files(config, experiment_path, new_fields) + dashboard_logger = logger_factory(config, experiment_path) + c_logger = ConsoleLogger() + return config, experiment_path, audio_path, c_logger, dashboard_logger + + +def setup_loader(c: TrainerConfig, ap: AudioProcessor, is_val: bool = False): num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch @@ -84,10 +159,10 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False): return loader, classes, dataset.get_map_classid_to_classname() -def evaluation(model, criterion, data_loader, global_step): +def evaluation(c: BaseEncoderConfig, model, criterion, data_loader, global_step, dashboard_logger: BaseDashboardLogger): eval_loss = 0 for _, data in enumerate(data_loader): - with torch.no_grad(): + with torch.inference_mode(): # setup input data inputs, labels = data @@ -128,7 +203,17 @@ def evaluation(model, criterion, data_loader, global_step): return eval_avg_loss -def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): +def train( + c: BaseEncoderConfig, + model, + optimizer, + scheduler, + criterion, + data_loader, + eval_data_loader, + global_step, + dashboard_logger: BaseDashboardLogger, +): model.train() best_loss = {"train_loss": None, "eval_loss": float("inf")} avg_loader_time = 0 @@ -219,37 +304,33 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, if global_step % c.print_step == 0: print( - " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), + f" | > Step:{global_step} Loss:{loss.item():.5f} GradNorm:{grad_norm:.5f} " + f"StepTime:{step_time:.2f} LoaderTime:{loader_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} LR:{current_lr:.6f}", flush=True, ) if global_step % c.save_step == 0: # save model save_checkpoint( - c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict() + c, model, optimizer, None, global_step, epoch, c.output_log_path, criterion=criterion.state_dict() ) end_time = time.time() print("") print( - ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " - "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time - ), + f">>> Epoch:{epoch} AvgLoss: {tot_loss / len(data_loader):.5f} GradNorm:{grad_norm:.5f} " + f"EpochTime:{epoch_time:.2f} AvGLoaderTime:{avg_loader_time:.2f} ", flush=True, ) # evaluation if c.run_eval: model.eval() - eval_loss = evaluation(model, criterion, eval_data_loader, global_step) + eval_loss = evaluation(c, model, criterion, eval_data_loader, global_step, dashboard_logger) print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + f" | > Epoch:{epoch} AvgLoss: {eval_loss:.5f} ", flush=True, ) # save the best checkpoint @@ -262,7 +343,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, None, global_step, epoch, - OUT_PATH, + c.output_log_path, criterion=criterion.state_dict(), ) model.train() @@ -270,7 +351,13 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, return best_loss, 
global_step -def main(args): # pylint: disable=redefined-outer-name +def main(arg_list: list[str] | None = None): + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) + + train_config = TrainArgs() + parser = train_config.init_argparse(arg_prefix="") + args, overrides = parser.parse_known_args(arg_list) + c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args((args, overrides)) # pylint: disable=global-variable-undefined global meta_data_train global meta_data_eval @@ -284,9 +371,9 @@ def main(args): # pylint: disable=redefined-outer-name # pylint: disable=redefined-outer-name meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True) - train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False) + train_data_loader, train_classes, map_classid_to_classname = setup_loader(c, ap, is_val=False) if c.run_eval: - eval_data_loader, _, _ = setup_loader(ap, is_val=True) + eval_data_loader, _, _ = setup_loader(c, ap, is_val=True) else: eval_data_loader = None @@ -301,7 +388,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion, args.restore_step = model.load_checkpoint( c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion ) - print(" > Model restored from step %d" % args.restore_step, flush=True) + print(f" > Model restored from step {args.restore_step}", flush=True) else: args.restore_step = 0 @@ -311,30 +398,30 @@ def main(args): # pylint: disable=redefined-outer-name scheduler = None num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) + print(f"\n > Model has {num_params} parameters", flush=True) if use_cuda: model = model.cuda() criterion.cuda() global_step = args.restore_step - _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step) + _, global_step = train( + c, model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step, dashboard_logger + ) + sys.exit(0) if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) - - args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training() - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) + main() + # try: + # main() + # except KeyboardInterrupt: + # remove_experiment_folder(OUT_PATH) + # try: + # sys.exit(0) + # except SystemExit: + # os._exit(0) # pylint: disable=protected-access + # except Exception: # pylint: disable=broad-except + # remove_experiment_folder(OUT_PATH) + # traceback.print_exc() + # sys.exit(1) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 6d6342a762..deaa350878 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,5 +1,6 @@ import logging import os +import sys from dataclasses import dataclass, field from trainer import Trainer, TrainerArgs @@ -15,16 +16,16 @@ class TrainTTSArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: list[str] | None = None): """Run `tts` model training directly by a `config.json` file.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", 
level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # init trainer args train_args = TrainTTSArgs() parser = train_args.init_argparse(arg_prefix="") - # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + # override trainer args from command-line args + args, config_overrides = parser.parse_known_args(arg_list) train_args.parse_args(args) # load config.json and register @@ -69,6 +70,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 221ff4cff0..58122b9005 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -1,5 +1,6 @@ import logging import os +import sys from dataclasses import dataclass, field from trainer import Trainer, TrainerArgs @@ -16,16 +17,16 @@ class TrainVocoderArgs(TrainerArgs): config_path: str = field(default=None, metadata={"help": "Path to the config file."}) -def main(): +def main(arg_list: list[str] | None = None): """Run `tts` model training directly by a `config.json` file.""" - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) # init trainer args train_args = TrainVocoderArgs() parser = train_args.init_argparse(arg_prefix="") # override trainer args from comman-line args - args, config_overrides = parser.parse_known_args() + args, config_overrides = parser.parse_known_args(arg_list) train_args.parse_args(args) # load config.json and register @@ -75,6 +76,7 @@ def main(): parse_command_line_args=False, ) trainer.fit() + sys.exit(0) if __name__ == "__main__": diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index df2923952d..d05ae14b7f 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -2,6 +2,7 @@ import argparse import logging +import sys from itertools import product as cartesian_product import numpy as np @@ -17,7 +18,7 @@ from TTS.vocoder.models import setup_model if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to model checkpoint.") diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 5103f200b0..401003504e 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,7 +1,7 @@ import json import os import re -from typing import Dict +from typing import Any, Union import fsspec import yaml @@ -54,11 +54,11 @@ def register_config(model_name: str) -> Coqpit: return config_class -def _process_model_name(config_dict: Dict) -> str: +def _process_model_name(config_dict: dict) -> str: """Format the model name as expected. It is a band-aid for the old `vocoder` model names. Args: - config_dict (Dict): A dictionary including the config fields. + config_dict (dict): A dictionary including the config fields. Returns: str: Formatted modelname. @@ -68,7 +68,7 @@ def _process_model_name(config_dict: Dict) -> str: return model_name -def load_config(config_path: str) -> Coqpit: +def load_config(config_path: str | os.PathLike[Any]) -> Coqpit: """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name to find the corresponding Config class. Then initialize the Config. 
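Because `load_config` now accepts `str | os.PathLike[Any]` and converts the argument with `str()` in the next hunk, path-like objects can be passed directly. A minimal sketch with a placeholder path:

```python
# Minimal sketch, placeholder path; works for both str and pathlib.Path inputs.
from pathlib import Path

from TTS.config import load_config

config = load_config(Path("path/to/config.json"))
print(type(config).__name__)  # the config class registered for the model in the file
```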
@@ -81,6 +81,7 @@ def load_config(config_path: str) -> Coqpit: Returns: Coqpit: TTS config object. """ + config_path = str(config_path) config_dict = {} ext = os.path.splitext(config_path)[1] if ext in (".yml", ".yaml"): diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 7fae77d613..a0a013b0de 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass -from typing import List from coqpit import Coqpit, check_argument from trainer import TrainerConfig @@ -227,7 +226,7 @@ class BaseDatasetConfig(Coqpit): dataset_name: str = "" path: str = "" meta_file_train: str = "" - ignored_speakers: List[str] = None + ignored_speakers: list[str] = None language: str = "" phonemizer: str = "" meta_file_val: str = "" diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py index f838297af3..411a9b0dbe 100644 --- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py +++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py @@ -5,7 +5,8 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig from TTS.utils.manage import ModelManager diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index 7ac38ed6ee..dac5f0870a 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -104,7 +104,7 @@ def isatty(self): def read_logs(): sys.stdout.flush() - with open(sys.stdout.log_file, "r") as f: + with open(sys.stdout.log_file) as f: return f.read() diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index ebbaa0457b..d2d0ef580d 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import MISSING @@ -12,9 +11,9 @@ class BaseEncoderConfig(BaseTrainingConfig): model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params - model_params: Dict = field( + model_params: dict = field( default_factory=lambda: { "model_name": "lstm", "input_dim": 80, @@ -25,7 +24,7 @@ class BaseEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation: Dict = field(default_factory=lambda: {}) + audio_augmentation: dict = field(default_factory=lambda: {}) # training params epochs: int = 10000 @@ -33,7 +32,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 @@ -56,6 +55,6 @@ class BaseEncoderConfig(BaseTrainingConfig): def check_values(self): super().check_values() c = asdict(self) - assert ( - c["model_params"]["input_dim"] == self.audio.num_mels - ), " [!] model input dimendion must be equal to melspectrogram dimension." 
+ assert c["model_params"]["input_dim"] == self.audio.num_mels, ( + " [!] model input dimendion must be equal to melspectrogram dimension." + ) diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index f7137c2186..c6680c3a25 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -5,10 +5,10 @@ import torchaudio from coqpit import Coqpit from torch import nn +from trainer.generic_utils import set_partial_state_dict from trainer.io import load_fsspec from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.utils.generic_utils import set_init_dict logger = logging.getLogger(__name__) @@ -34,7 +34,7 @@ class BaseEncoder(nn.Module): # pylint: disable=W0102 def __init__(self): - super(BaseEncoder, self).__init__() + super().__init__() def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( @@ -64,11 +64,11 @@ def get_torch_mel_spectrogram_class(self, audio_config): ), ) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, l2_norm=True): return self.forward(x, l2_norm) - @torch.no_grad() + @torch.inference_mode() def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): """ Generate embeddings for a batch of utterances @@ -107,7 +107,7 @@ def get_criterion(self, c: Coqpit, num_classes=None): elif c.loss == "softmaxproto": criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) else: - raise Exception("The %s not is a loss supported" % c.loss) + raise Exception(f"The {c.loss} not is a loss supported") return criterion def load_checkpoint( @@ -130,7 +130,7 @@ def load_checkpoint( logger.info("Partial model initialization.") model_dict = self.state_dict() - model_dict = set_init_dict(model_dict, state["model"], c) + model_dict = set_partial_state_dict(model_dict, state["model"], config) self.load_state_dict(model_dict) del model_dict diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index 5eafcd6005..d7f3a2f4bd 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -7,7 +7,7 @@ class SELayer(nn.Module): def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() + super().__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( nn.Linear(channel, channel // reduction), @@ -27,7 +27,7 @@ class SEBasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) @@ -73,7 +73,7 @@ def __init__( use_torch_spec=False, audio_config=None, ): - super(ResNetSpeakerEncoder, self).__init__() + super().__init__() self.encoder_type = encoder_type self.input_dim = input_dim diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 495b4def5a..54ab37a52f 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -6,13 +6,14 @@ import numpy as np from scipy import signal +from TTS.encoder.models.base_encoder import BaseEncoder from TTS.encoder.models.lstm import LSTMSpeakerEncoder from TTS.encoder.models.resnet import ResNetSpeakerEncoder logger = logging.getLogger(__name__) -class AugmentWAV(object): +class AugmentWAV: def __init__(self, ap, augmentation_config): 
self.ap = ap self.use_additive_noise = False @@ -120,7 +121,7 @@ def apply_one(self, audio): return self.additive_noise(noise_type, audio) -def setup_encoder_model(config: "Coqpit"): +def setup_encoder_model(config: "Coqpit") -> BaseEncoder: if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( config.model_params["input_dim"], @@ -138,4 +139,7 @@ def setup_encoder_model(config: "Coqpit"): use_torch_spec=config.model_params.get("use_torch_spec", False), audio_config=config.audio, ) + else: + msg = f"Model not supported: {config.model_params['model_name']}" + raise ValueError(msg) return model diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index da7522a512..8d50ffd5f5 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo # All rights reserved. # @@ -17,7 +16,7 @@ # Only support eager mode and TF>=2.0.0 # pylint: disable=no-member, invalid-name, relative-beyond-top-level # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes -""" voxceleb 1 & 2 """ +"""voxceleb 1 & 2""" import csv import hashlib @@ -81,19 +80,19 @@ def download_and_extract(directory, subset, urls): zip_filepath = os.path.join(directory, url.split("/")[-1]) if os.path.exists(zip_filepath): continue - logger.info("Downloading %s to %s" % (url, zip_filepath)) + logger.info("Downloading %s to %s", url, zip_filepath) subprocess.call( - "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), + "wget {} --user {} --password {} -O {}".format(url, USER["user"], USER["password"], zip_filepath), shell=True, ) statinfo = os.stat(zip_filepath) - logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + logger.info("Successfully downloaded %s, size(bytes): %d", url, statinfo.st_size) # concatenate all parts into zip files if ".zip" not in zip_filepath: zip_filepath = "_".join(zip_filepath.split("_")[:-1]) - subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True) + subprocess.call(f"cat {zip_filepath}* > {zip_filepath}.zip", shell=True) zip_filepath += ".zip" extract_path = zip_filepath.strip(".zip") @@ -101,12 +100,12 @@ def download_and_extract(directory, subset, urls): with open(zip_filepath, "rb") as f_zip: md5 = hashlib.md5(f_zip.read()).hexdigest() if md5 != MD5SUM[subset]: - raise ValueError("md5sum of %s mismatch" % zip_filepath) + raise ValueError(f"md5sum of {zip_filepath} mismatch") with zipfile.ZipFile(zip_filepath, "r") as zfile: zfile.extractall(directory) extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename) - subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True) + subprocess.call(f"mv {extract_path_ori} {extract_path}", shell=True) finally: # os.remove(zip_filepath) pass @@ -122,9 +121,9 @@ def exec_cmd(cmd): try: retcode = subprocess.call(cmd, shell=True) if retcode < 0: - logger.info(f"Child was terminated by signal {retcode}") + logger.info("Child was terminated by signal %d", retcode) except OSError as e: - logger.info(f"Execution failed: {e}") + logger.info("Execution failed: %s", e) retcode = -999 return retcode @@ -138,10 +137,10 @@ def decode_aac_with_ffmpeg(aac_file, wav_file): bool, True if success. 
""" cmd = f"ffmpeg -i {aac_file} {wav_file}" - logger.info(f"Decoding aac file using command line: {cmd}") + logger.info("Decoding aac file using command line: %s", cmd) ret = exec_cmd(cmd) if ret != 0: - logger.error(f"Failed to decode aac file with retcode {ret}") + logger.error("Failed to decode aac file with retcode %s", ret) logger.error("Please check your ffmpeg installation.") return False return True @@ -156,7 +155,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv """ - logger.info("Preprocessing audio and label for subset %s" % subset) + logger.info("Preprocessing audio and label for subset %s", subset) source_dir = os.path.join(input_dir, subset) files = [] @@ -194,7 +193,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) for wav_file in files: writer.writerow(wav_file) - logger.info("Successfully generated csv file {}".format(csv_file_path)) + logger.info("Successfully generated csv file %s", csv_file_path) def processor(directory, subset, force_process): @@ -216,7 +215,7 @@ def processor(directory, subset, force_process): if __name__ == "__main__": - setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) + setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) if len(sys.argv) != 4: print("Usage: python prepare_data.py save_directory user password") sys.exit() diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py deleted file mode 100644 index cc3a78b084..0000000000 --- a/TTS/encoder/utils/training.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -from dataclasses import dataclass, field - -from coqpit import Coqpit -from trainer import TrainerArgs, get_last_checkpoint -from trainer.generic_utils import get_experiment_folder_path, get_git_branch -from trainer.io import copy_model_files -from trainer.logging import logger_factory -from trainer.logging.console_logger import ConsoleLogger - -from TTS.config import load_config, register_config -from TTS.tts.utils.text.characters import parse_symbols - - -@dataclass -class TrainArgs(TrainerArgs): - config_path: str = field(default=None, metadata={"help": "Path to the config file."}) - - -def getarguments(): - train_config = TrainArgs() - parser = train_config.init_argparse(arg_prefix="") - return parser - - -def process_args(args, config=None): - """Process parsed comand line arguments and initialize the config if not provided. - Args: - args (argparse.Namespace or dict like): Parsed input arguments. - config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. - Returns: - c (Coqpit): Config paramaters. - out_path (str): Path to save models and logging. - audio_path (str): Path to save generated test audios. - c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does - logging to the console. - dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging - TODO: - - Interactive config definition. 
- """ - if isinstance(args, tuple): - args, coqpit_overrides = args - if args.continue_path: - # continue a previous training from its output folder - experiment_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") - args.restore_path, best_model = get_last_checkpoint(args.continue_path) - if not args.best_path: - args.best_path = best_model - # init config if not already defined - if config is None: - if args.config_path: - # init from a file - config = load_config(args.config_path) - else: - # init from console args - from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel - - config_base = BaseTrainingConfig() - config_base.parse_known_args(coqpit_overrides) - config = register_config(config_base.model)() - # override values from command-line args - config.parse_known_args(coqpit_overrides, relaxed_parser=True) - experiment_path = args.continue_path - if not experiment_path: - experiment_path = get_experiment_folder_path(config.output_path, config.run_name) - audio_path = os.path.join(experiment_path, "test_audios") - config.output_log_path = experiment_path - # setup rank 0 process in distributed training - dashboard_logger = None - if args.rank == 0: - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - # if model characters are not set in the config file - # save the default set to the config file for future - # compatibility. - if config.has("characters") and config.characters is None: - used_characters = parse_symbols() - new_fields["characters"] = used_characters - copy_model_files(config, experiment_path, new_fields) - dashboard_logger = logger_factory(config, experiment_path) - c_logger = ConsoleLogger() - return config, experiment_path, audio_path, c_logger, dashboard_logger - - -def init_arguments(): - train_config = TrainArgs() - parser = train_config.init_argparse(arg_prefix="") - return parser - - -def init_training(config: Coqpit = None): - """Initialization of a training run.""" - parser = init_arguments() - args = parser.parse_known_args() - config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config) - return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger diff --git a/TTS/model.py b/TTS/model.py index c3707c85ae..39faa7f690 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,6 +1,6 @@ import os from abc import abstractmethod -from typing import Any, Union +from typing import Any import torch from coqpit import Coqpit @@ -12,7 +12,7 @@ class BaseTrainerModel(TrainerModel): """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. - Every new 🐸TTS model must inherit it. + Every new Coqui model must inherit it. """ @staticmethod @@ -48,7 +48,7 @@ def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict def load_checkpoint( self, config: Coqpit, - checkpoint_path: Union[str, os.PathLike[Any]], + checkpoint_path: str | os.PathLike[Any], eval: bool = False, strict: bool = True, cache: bool = False, @@ -64,3 +64,7 @@ def load_checkpoint( It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False. """ ... 
+ + @property + def device(self) -> torch.device: + return next(self.parameters()).device diff --git a/TTS/server/README.md b/TTS/server/README.md index ae8e38a4e3..232b8618d8 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -1,21 +1,36 @@ -# :frog: TTS demo server +# :frog: TTS Demo Server Before you use the server, make sure you -[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts)) :frog: TTS +[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts) :frog: TTS properly and install the additional dependencies with `pip install coqui-tts[server]`. Then, you can follow the steps below. -**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal. +**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` endpoint on the terminal instead of the `python TTS/server/server.py` arguments. -Examples runs: +## Example commands -List officially released models. -```python TTS/server/server.py --list_models ``` +List officially released models: +```bash +python TTS/server/server.py --list_models # or +tts-server --list_models +``` -Run the server with the official models. -```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` +Run the server with the official models: +```bash +python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA \ + --vocoder_name vocoder_models/en/ljspeech/multiband-melgan +``` -Run the server with the official models on a GPU. -```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda``` +Run the server with the official models on a GPU: +```bash +CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py \ + --model_name tts_models/en/ljspeech/tacotron2-DCA \ + --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda +``` -Run the server with a custom models.
-```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` +Run the server with custom models: +```bash +python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth \ + --tts_config /path/to/tts/config.json \ + --vocoder_checkpoint /path/to/vocoder/model.pth \ + --vocoder_config /path/to/vocoder/config.json +``` diff --git a/TTS/server/server.py b/TTS/server/server.py index f410fb7539..500c706c4e 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -8,9 +8,7 @@ import logging import os import sys -from pathlib import Path from threading import Lock -from typing import Union from urllib.parse import parse_qs try: @@ -19,13 +17,12 @@ msg = "Server requires requires flask, use `pip install coqui-tts[server]`" raise ImportError(msg) from e -from TTS.config import load_config +from TTS.api import TTS from TTS.utils.generic_utils import ConsoleFormatter, setup_logger from TTS.utils.manage import ModelManager -from TTS.utils.synthesizer import Synthesizer logger = logging.getLogger(__name__) -setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter()) +setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter()) def create_argparser() -> argparse.ArgumentParser: @@ -60,6 +57,7 @@ def create_argparser() -> argparse.ArgumentParser: parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--port", type=int, default=5002, help="port to listen on.") + parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.") parser.add_argument( "--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode." @@ -73,8 +71,7 @@ def create_argparser() -> argparse.ArgumentParser: # parse the args args = create_argparser().parse_args() -path = Path(__file__).parent / "../.models.json" -manager = ModelManager(path) +manager = ModelManager(models_file=TTS.get_models_file_path()) # update in-use models to the specified released models.
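The server changes that follow drop the hand-rolled `Synthesizer` setup in favour of the high-level `TTS` API and add a `--device` flag alongside `--use_cuda`. As a rough sketch of the API the server now wraps (the model name is just one released model, not a default):

```python
from TTS.api import TTS

# Load a released model by name and move it to the chosen device.
api = TTS(model_name="tts_models/en/ljspeech/tacotron2-DCA").to("cpu")

# Synthesize and write the waveform, mirroring what the /api/tts route does.
wav = api.tts("Be a voice, not an echo.")
api.synthesizer.save_wav(wav, "output.wav")
```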
model_path = None @@ -86,55 +83,32 @@ def create_argparser() -> argparse.ArgumentParser: # CASE1: list pre-trained TTS models if args.list_models: manager.list_models() - sys.exit() - -# CASE2: load pre-trained model paths -if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - -if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - -# CASE3: set custom model paths -if args.model_path is not None: - model_path = args.model_path - config_path = args.config_path - speakers_file_path = args.speakers_file_path - -if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - -# load models -synthesizer = Synthesizer( - tts_checkpoint=model_path, - tts_config_path=config_path, - tts_speakers_file=speakers_file_path, - tts_languages_file=None, - vocoder_checkpoint=vocoder_path, - vocoder_config=vocoder_config_path, - encoder_checkpoint="", - encoder_config="", - use_cuda=args.use_cuda, -) - -use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and ( - synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None -) -speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) - -use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and ( - synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None -) -language_manager = getattr(synthesizer.tts_model, "language_manager", None) + sys.exit(0) + +device = args.device +if args.use_cuda: + device = "cuda" + +# CASE2: load models +model_name = args.model_name if args.model_path is None else None +api = TTS( + model_name=model_name, + model_path=args.model_path, + config_path=args.config_path, + vocoder_name=args.vocoder_name, + vocoder_path=args.vocoder_path, + vocoder_config_path=args.vocoder_config_path, + speakers_file_path=args.speakers_file_path, + # language_ids_file_path=args.language_ids_file_path, +).to(device) # TODO: set this from SpeakerManager -use_gst = synthesizer.tts_config.get("use_gst", False) +use_gst = api.synthesizer.tts_config.get("use_gst", False) +supports_cloning = api.synthesizer.tts_config.get("model", "") in ["xtts", "bark"] app = Flask(__name__) -def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]: +def style_wav_uri_to_dict(style_wav: str) -> str | dict: """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer) or a dict (gst tokens/values to be use for styling) @@ -158,27 +132,19 @@ def index(): return render_template( "index.html", show_details=args.show_details, - use_multi_speaker=use_multi_speaker, - use_multi_language=use_multi_language, - speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None, - language_ids=language_manager.name_to_id if language_manager is not None else None, + use_multi_speaker=api.is_multi_speaker, + use_multi_language=api.is_multi_lingual, + speaker_ids=api.speakers, + language_ids=api.languages, use_gst=use_gst, + supports_cloning=supports_cloning, ) @app.route("/details") def details(): - if args.config_path is not None and os.path.isfile(args.config_path): - model_config = load_config(args.config_path) - elif args.model_name is not None: - model_config = load_config(config_path) - - if 
args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path): - vocoder_config = load_config(args.vocoder_config_path) - elif args.vocoder_name is not None: - vocoder_config = load_config(vocoder_config_path) - else: - vocoder_config = None + model_config = api.synthesizer.tts_config + vocoder_config = api.synthesizer.vocoder_config or None return render_template( "details.html", @@ -196,17 +162,26 @@ def details(): def tts(): with lock: text = request.headers.get("text") or request.values.get("text", "") - speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "") - language_idx = request.headers.get("language-id") or request.values.get("language_id", "") + speaker_idx = ( + request.headers.get("speaker-id") or request.values.get("speaker_id", "") if api.is_multi_speaker else None + ) + if speaker_idx == "": + speaker_idx = None + language_idx = ( + request.headers.get("language-id") or request.values.get("language_id", "") + if api.is_multi_lingual + else None + ) style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) + speaker_wav = request.headers.get("speaker-wav") or request.values.get("speaker_wav", "") logger.info("Model input: %s", text) logger.info("Speaker idx: %s", speaker_idx) logger.info("Language idx: %s", language_idx) - wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) + wavs = api.tts(text, speaker=speaker_idx, language=language_idx, style_wav=style_wav, speaker_wav=speaker_wav) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") @@ -248,9 +223,9 @@ def mary_tts_api_process(): else: text = request.args.get("INPUT_TEXT", "") logger.info("Model input: %s", text) - wavs = synthesizer.tts(text) + wavs = api.tts(text) out = io.BytesIO() - synthesizer.save_wav(wavs, out) + api.synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") diff --git a/TTS/server/templates/index.html b/TTS/server/templates/index.html index 6bfd5ae2cb..95d7076394 100644 --- a/TTS/server/templates/index.html +++ b/TTS/server/templates/index.html @@ -66,7 +66,12 @@ {%if use_gst%} + type="text" name="style_wav">

+ {%endif%} + + {%if supports_cloning%} + Reference audio: +

{%endif%} @@ -114,14 +119,18 @@ q('#text').focus() function do_tts(e) { const text = q('#text').value - const speaker_id = getTextValue('#speaker_id') const style_wav = getTextValue('#style_wav') + const speaker_wav = getTextValue('#speaker_wav') + let speaker_id = getTextValue('#speaker_id') + if (speaker_wav !== '') { + speaker_id = '' + } const language_id = getTextValue('#language_id') if (text) { q('#message').textContent = 'Synthesizing...' q('#speak-button').disabled = true q('#audio').hidden = true - synthesize(text, speaker_id, style_wav, language_id) + synthesize(text, speaker_id, style_wav, speaker_wav, language_id) } e.preventDefault() return false @@ -132,8 +141,8 @@ do_tts(e) } }) - function synthesize(text, speaker_id = "", style_wav = "", language_id = "") { - fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' }) + function synthesize(text, speaker_id = "", style_wav = "", speaker_wav = "", language_id = "") { + fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&speaker_wav=${encodeURIComponent(speaker_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' }) .then(function (res) { if (!res.ok) throw Error(res.statusText) return res.blob() diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 317a01af53..784819eee3 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.align_tts import AlignTTSArgs @@ -70,7 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) - phase_start_steps: List[int] = None + phase_start_steps: list[int] | None = None ssim_alpha: float = 1.0 spec_loss_alpha: float = 1.0 @@ -80,13 +79,13 @@ class AlignTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None # optimizer parameters optimizer: str = "Adam" optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) - lr_scheduler: str = None - lr_scheduler_params: dict = None + lr_scheduler: str | None = None + lr_scheduler_params: dict | None = None lr: float = 1e-4 grad_clip: float = 5.0 @@ -96,7 +95,7 @@ class AlignTTSConfig(BaseTTSConfig): r: int = 1 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index b846febe85..61d67b987a 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -1,6 +1,5 @@ import os from dataclasses import dataclass, field -from typing import Dict from trainer.io import get_user_data_dir @@ -70,9 +69,9 @@ class BarkConfig(BaseTTSConfig): COARSE_INFER_TOKEN: int = 12_050 REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/" - REMOTE_MODEL_PATHS: Dict = None - LOCAL_MODEL_PATHS: Dict = None - 
SMALL_REMOTE_MODEL_PATHS: Dict = None + REMOTE_MODEL_PATHS: dict = None + LOCAL_MODEL_PATHS: dict = None + SMALL_REMOTE_MODEL_PATHS: dict = None CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0")) DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers")) diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py index 805d995369..fc9a76f613 100644 --- a/TTS/tts/configs/delightful_tts_config.py +++ b/TTS/tts/configs/delightful_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig @@ -73,7 +72,7 @@ class DelightfulTTSConfig(BaseTTSConfig): # optimizer steps_to_start_discriminator: int = 200000 - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -140,7 +139,7 @@ class DelightfulTTSConfig(BaseTTSConfig): d_vector_dim: int = None # testing - test_sentences: List[List[str]] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index d086d26564..1342856668 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -117,10 +116,10 @@ class FastPitchConfig(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -150,10 +149,10 @@ class FastPitchConfig(BaseTTSConfig): # dataset configs compute_f0: bool = True - f0_cache_path: str = None + f0_cache_path: str | None = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fast_speech_config.py b/TTS/tts/configs/fast_speech_config.py index af6c2db6fa..408dbab196 100644 --- a/TTS/tts/configs/fast_speech_config.py +++ b/TTS/tts/configs/fast_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -111,10 +110,10 @@ class FastSpeechConfig(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -144,10 +143,10 @@ class FastSpeechConfig(BaseTTSConfig): # dataset configs compute_f0: bool = False - f0_cache_path: str = None + f0_cache_path: str | None = None # testing - 
test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py index d179617fb0..44bdefad0d 100644 --- a/TTS/tts/configs/fastspeech2_config.py +++ b/TTS/tts/configs/fastspeech2_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -127,10 +126,10 @@ class Fastspeech2Config(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -161,14 +160,14 @@ class Fastspeech2Config(BaseTTSConfig): # dataset configs compute_f0: bool = True - f0_cache_path: str = None + f0_cache_path: str | None = None # dataset configs compute_energy: bool = True - energy_cache_path: str = None + energy_cache_path: str | None = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index f42f3e5a51..c99e920b9d 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -101,7 +100,7 @@ class GlowTTSConfig(BaseTTSConfig): model: str = "glow_tts" # model params - num_chars: int = None + num_chars: int | None = None encoder_type: str = "rel_pos_transformer" encoder_params: dict = field( default_factory=lambda: { @@ -147,15 +146,15 @@ class GlowTTSConfig(BaseTTSConfig): data_dep_init_steps: int = 10 # inference params - style_wav_for_test: str = None + style_wav_for_test: str | None = None inference_noise_scale: float = 0.0 length_scale: float = 1.0 # multi-speaker settings use_speaker_embedding: bool = False - speakers_file: str = None + speakers_file: str | None = None use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None # optimizer parameters optimizer: str = "RAdam" @@ -171,7 +170,7 @@ class GlowTTSConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it. 
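The config hunks around this point all follow the same pattern: `typing.List`/`Dict` become the built-in generics, and fields that were silently optional (or defaulted to `False`) are annotated `X | None`. A small, hypothetical dataclass illustrating the resulting field style (not part of the codebase):

```python
from dataclasses import dataclass, field


@dataclass
class ExampleTTSConfig:  # hypothetical, mirrors the annotation style used above
    speakers_file: str | None = None  # optional value stated explicitly
    d_vector_file: str | None = None  # previously `d_vector_file: str = False`
    test_sentences: list[str] | list[list[str]] = field(default_factory=list)
```

Because the `|` unions in these annotations are evaluated when the dataclass is defined, they presuppose Python 3.10 or newer.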
# testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py index 50f72847ed..108f2022d4 100644 --- a/TTS/tts/configs/neuralhmm_tts_config.py +++ b/TTS/tts/configs/neuralhmm_tts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -126,7 +125,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.001 @@ -143,7 +142,7 @@ class NeuralhmmTTSConfig(BaseTTSConfig): min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -162,9 +161,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py index dc3e5548b8..9e96aaa441 100644 --- a/TTS/tts/configs/overflow_config.py +++ b/TTS/tts/configs/overflow_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -145,7 +144,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case memory_rnn_dim: int = 1024 ## Outputnet parameters - outputnet_size: List[int] = field(default_factory=lambda: [1024]) + outputnet_size: list[int] = field(default_factory=lambda: [1024]) flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14}) std_floor: float = 0.01 @@ -174,7 +173,7 @@ class OverflowConfig(BaseTTSConfig): # The classname has to be camel case min_audio_len: int = 512 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "Be a voice, not an echo.", ] @@ -193,9 +192,9 @@ def check_values(self): AssertionError: transition probability is not between 0 and 1 """ assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model." - assert ( - len(self.outputnet_size) >= 1 - ), f"Parameter Network must have atleast one layer check the config file for parameter network. 
Provided: {self.parameternetwork}" - assert ( - 0 < self.flat_start_params["transition_p"] < 1 - ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + assert len(self.outputnet_size) >= 1, ( + f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}" + ) + assert 0 < self.flat_start_params["transition_p"] < 1, ( + f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}" + ) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index bf17322c19..c62f68306d 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,5 +1,4 @@ from dataclasses import asdict, dataclass, field -from typing import Dict, List from coqpit import Coqpit, check_argument @@ -138,7 +137,7 @@ class CharactersConfig(Coqpit): characters_class: str = None # using BaseVocabulary - vocab_dict: Dict = None + vocab_dict: dict = None # using on BaseCharacters pad: str = None @@ -323,7 +322,7 @@ class BaseTTSConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -331,7 +330,7 @@ class BaseTTSConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] | list[list[str]] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index bf8517dfc4..b37ba174bf 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.forward_tts import ForwardTTSArgs @@ -129,10 +128,10 @@ class SpeedySpeechConfig(BaseTTSConfig): # multi-speaker settings num_speakers: int = 0 - speakers_file: str = None + speakers_file: str | None = None use_speaker_embedding: bool = False use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str | None = None d_vector_dim: int = 0 # optimizer parameters @@ -161,10 +160,10 @@ class SpeedySpeechConfig(BaseTTSConfig): # dataset configs compute_f0: bool = False - f0_cache_path: str = None + f0_cache_path: str | None = None # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 350b5ea996..caa118815a 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig @@ -154,7 +153,7 @@ class TacotronConfig(BaseTTSConfig): num_speakers: int = 1 num_chars: int = 0 r: int = 2 - gradual_training: List[List[int]] = None + 
gradual_training: list[list[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True @@ -170,7 +169,7 @@ class TacotronConfig(BaseTTSConfig): # attention layers attention_type: str = "original" - attention_heads: int = None + attention_heads: int | None = None attention_norm: str = "sigmoid" attention_win: bool = False windowing: bool = False @@ -189,8 +188,8 @@ class TacotronConfig(BaseTTSConfig): use_speaker_embedding: bool = False speaker_embedding_dim: int = 512 use_d_vector_file: bool = False - d_vector_file: str = False - d_vector_dim: int = None + d_vector_file: str | None = None + d_vector_dim: int | None = None # optimizer parameters optimizer: str = "RAdam" @@ -212,7 +211,7 @@ class TacotronConfig(BaseTTSConfig): ga_alpha: float = 5.0 # testing - test_sentences: List[str] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", @@ -224,12 +223,12 @@ class TacotronConfig(BaseTTSConfig): def check_values(self): if self.gradual_training: - assert ( - self.gradual_training[0][1] == self.r - ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + assert self.gradual_training[0][1] == self.r, ( + f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + ) if self.model == "tacotron" and self.audio is not None: - assert self.out_channels == ( - self.audio.fft_size // 2 + 1 - ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + assert self.out_channels == (self.audio.fft_size // 2 + 1), ( + f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + ) if self.model == "tacotron2" and self.audio is not None: assert self.out_channels == self.audio.num_mels diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 2d0242bf13..9ad720da30 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.models.vits import VitsArgs, VitsAudioConfig @@ -112,7 +111,7 @@ class VitsConfig(BaseTTSConfig): audio: VitsAudioConfig = field(default_factory=VitsAudioConfig) # optimizer - grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) + grad_clip: list[float] = field(default_factory=lambda: [1000, 1000]) lr_gen: float = 0.0002 lr_disc: float = 0.0002 lr_scheduler_gen: str = "ExponentialLR" @@ -146,7 +145,7 @@ class VitsConfig(BaseTTSConfig): add_blank: bool = True # testing - test_sentences: List[List] = field( + test_sentences: list[str] | list[list[str]] = field( default_factory=lambda: [ ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ["Be a voice, not an echo."], @@ -167,7 +166,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index a0766d425c..1ebce57ba5 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from TTS.tts.configs.shared_configs 
import BaseTTSConfig from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig @@ -70,7 +69,7 @@ class XttsConfig(BaseTTSConfig): model_args: XttsArgs = field(default_factory=XttsArgs) audio: XttsAudioConfig = field(default_factory=XttsAudioConfig) model_dir: str = None - languages: List[str] = field( + languages: list[str] = field( default_factory=lambda: [ "en", "es", diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index d1a37da4c1..d83abce00a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -2,8 +2,8 @@ import os import sys from collections import Counter +from collections.abc import Callable from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union import numpy as np @@ -17,7 +17,7 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): + items (list[list]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. eval_split_max_size (int): @@ -37,10 +37,8 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): else: eval_split_size = int(len(items) * eval_split_size) - assert ( - eval_split_size > 0 - ), " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format( - 1 / len(items) + assert eval_split_size > 0, ( + f" [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {1 / len(items)}" ) np.random.seed(0) np.random.shuffle(items) @@ -71,18 +69,18 @@ def add_extra_keys(metadata, language, dataset_name): def load_tts_samples( - datasets: Union[List[Dict], Dict], + datasets: list[dict] | dict, eval_split=True, formatter: Callable = None, eval_split_max_size=None, eval_split_size=0.01, -) -> Tuple[List[List], List[List]]: - """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided. +) -> tuple[list[list], list[list]]: + """Parse the dataset from the datasets config, load the samples as a list and load the attention alignments if provided. If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based on the dataset name. Args: - datasets (List[Dict], Dict): A list of datasets or a single dataset dictionary. If multiple datasets are + datasets (list[dict], dict): A list of datasets or a single dataset dictionary. If multiple datasets are in the list, they are all merged. eval_split (bool, optional): If true, create a evaluation split. If an eval split provided explicitly, generate @@ -101,7 +99,7 @@ def load_tts_samples( If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). Returns: - Tuple[List[List], List[List]: training and evaluation splits of the dataset. + tuple[list[list], list[list]: training and evaluation splits of the dataset. 
""" meta_data_train_all = [] meta_data_eval_all = [] if eval_split else None @@ -153,7 +151,7 @@ def load_tts_samples( def load_attention_mask_meta_data(metafile_path): """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r", encoding="utf-8") as f: + with open(metafile_path, encoding="utf-8") as f: lines = f.readlines() meta_data = [] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 37e3a1779d..6f21dcd1e0 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -3,7 +3,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import numpy as np import numpy.typing as npt @@ -47,7 +47,7 @@ def string2filename(string: str) -> str: return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore") -def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: +def get_audio_size(audiopath: str | os.PathLike[Any]) -> int: """Return the number of samples in the audio file.""" if not isinstance(audiopath, str): audiopath = str(audiopath) @@ -63,29 +63,54 @@ def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: raise RuntimeError(msg) from e +def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict | None = None): + """Create inverse frequency weights for balancing the dataset. + + Use `multi_dict` to scale relative weights.""" + attr_names_samples = np.array([item[attr_name] for item in items]) + unique_attr_names = np.unique(attr_names_samples).tolist() + attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] + attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) + weight_attr = 1.0 / attr_count + dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + if multi_dict is not None: + # check if all keys are in the multi_dict + for k in multi_dict: + assert k in unique_attr_names, f"{k} not in {unique_attr_names}" + # scale weights + multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) + dataset_samples_weight *= multiplier_samples + return ( + torch.from_numpy(dataset_samples_weight).float(), + unique_attr_names, + np.unique(dataset_samples_weight).tolist(), + ) + + class TTSDataset(Dataset): def __init__( self, outputs_per_step: int = 1, compute_linear_spec: bool = False, ap: AudioProcessor = None, - samples: Optional[list[dict]] = None, + samples: list[dict] | None = None, tokenizer: "TTSTokenizer" = None, compute_f0: bool = False, compute_energy: bool = False, - f0_cache_path: Optional[str] = None, - energy_cache_path: Optional[str] = None, + f0_cache_path: str | None = None, + energy_cache_path: str | None = None, return_wav: bool = False, batch_group_size: int = 0, min_text_len: int = 0, max_text_len: int = float("inf"), min_audio_len: int = 0, max_audio_len: int = float("inf"), - phoneme_cache_path: Optional[str] = None, + phoneme_cache_path: str | None = None, precompute_num_workers: int = 0, - speaker_id_mapping: Optional[dict] = None, - d_vector_mapping: Optional[dict] = None, - language_id_mapping: Optional[dict] = None, + speaker_id_mapping: dict | None = None, + d_vector_mapping: dict | None = None, + language_id_mapping: dict | None = None, use_noise_augment: bool = False, start_by_longest: bool = False, ) -> None: @@ -206,7 +231,7 @@ def lengths(self) -> list[int]: try: audio_len = 
get_audio_size(wav_file) except RuntimeError: - logger.warning(f"Failed to compute length for {item['audio_file']}") + logger.warning("Failed to compute length for %s", item["audio_file"]) audio_len = 0 lens.append(audio_len) return lens @@ -327,7 +352,7 @@ def _compute_lengths(samples): try: audio_length = get_audio_size(item["audio_file"]) except RuntimeError: - logger.warning(f"Failed to compute length, skipping {item['audio_file']}") + logger.warning("Failed to compute length, skipping %s", item["audio_file"]) continue text_lenght = len(item["text"]) item["audio_length"] = audio_length @@ -412,14 +437,14 @@ def preprocess_samples(self) -> None: self.samples = samples logger.info("Preprocessing samples") - logger.info(f"Max text length: {np.max(text_lengths)}") - logger.info(f"Min text length: {np.min(text_lengths)}") - logger.info(f"Avg text length: {np.mean(text_lengths)}") - logger.info(f"Max audio length: {np.max(audio_lengths)}") - logger.info(f"Min audio length: {np.min(audio_lengths)}") - logger.info(f"Avg audio length: {np.mean(audio_lengths)}") + logger.info("Max text length: %d", np.max(text_lengths)) + logger.info("Min text length: %d", np.min(text_lengths)) + logger.info("Avg text length: %.2f", np.mean(text_lengths)) + logger.info("Max audio length: %.2f", np.max(audio_lengths)) + logger.info("Min audio length: %.2f", np.min(audio_lengths)) + logger.info("Avg audio length: %.2f", np.mean(audio_lengths)) logger.info("Num. instances discarded samples: %d", len(ignore_idx)) - logger.info(f"Batch group size: {self.batch_group_size}.") + logger.info("Batch group size: %d", self.batch_group_size) @staticmethod def _sort_batch(batch, text_lengths): @@ -615,7 +640,7 @@ class PhonemeDataset(Dataset): def __init__( self, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], tokenizer: "TTSTokenizer", cache_path: str, precompute_num_workers: int = 0, @@ -719,10 +744,10 @@ class F0Dataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", audio_config=None, # pylint: disable=unused-argument - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers: int = 0, normalize_f0: bool = True, ) -> None: @@ -871,9 +896,9 @@ class EnergyDataset: def __init__( self, - samples: Union[list[list], list[dict]], + samples: list[list] | list[dict], ap: "AudioProcessor", - cache_path: Optional[str] = None, + cache_path: str | None = None, precompute_num_workers=0, normalize_energy=True, ) -> None: diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ff1a76e2c9..3a4605275a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree as ET from glob import glob from pathlib import Path -from typing import List from tqdm import tqdm @@ -21,7 +20,7 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): https://github.com/freds0/CML-TTS-Dataset/""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -61,7 +60,7 @@ def coqui(root_path, meta_file, ignored_speakers=None): """Interal dataset formatter.""" filepath = os.path.join(root_path, meta_file) # ensure there are 4 columns for every line - with open(filepath, "r", 
encoding="utf8") as f: + with open(filepath, encoding="utf8") as f: lines = f.readlines() num_cols = len(lines[0].split("|")) # take the first row as reference for idx, line in enumerate(lines[1:]): @@ -104,7 +103,7 @@ def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "tweb" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") wav_file = os.path.join(root_path, cols[0] + ".wav") @@ -118,7 +117,7 @@ def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = cols[1].strip() @@ -133,7 +132,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "mozilla" - with open(txt_file, "r", encoding="ISO 8859-1") as ttf: + with open(txt_file, encoding="ISO 8859-1") as ttf: for line in ttf: cols = line.strip().split("|") wav_file = cols[0].strip() @@ -177,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): if speaker_name in ignored_speakers: continue logger.info(csv_file) - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") if not meta_files: @@ -201,7 +200,7 @@ def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ljspeech" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -215,7 +214,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: speaker_id = 0 for idx, line in enumerate(ttf): # 2 samples per speaker to avoid eval split issues @@ -236,7 +235,7 @@ def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "thorsten" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -268,7 +267,7 @@ def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "ruslan" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav") @@ -282,7 +281,7 @@ def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "css10" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -296,7 +295,7 @@ def nancy(root_path, 
meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "nancy" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: utt_id = line.split()[1] text = line[line.find('"') + 1 : line.rfind('"') - 1] @@ -309,7 +308,7 @@ def common_voice(root_path, meta_file, ignored_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("client_id"): continue @@ -338,7 +337,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): for meta_file in meta_files: _meta_file = os.path.basename(meta_file).split(".")[0] - with open(meta_file, "r", encoding="utf-8") as ttf: + with open(meta_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("\t") file_name = cols[0] @@ -368,7 +367,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar items = [] speaker_name = "turkish-female" skipped_files = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0].strip() + ".wav") @@ -386,7 +385,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("wav_filename"): continue @@ -425,7 +424,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic """ file_ext = "flac" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -433,7 +432,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] # p280 has no mic2 recordings if speaker_id == "p280": @@ -452,7 +451,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] @@ -460,7 +459,7 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append( @@ -482,7 +481,7 @@ 
def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-ar os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt") ) if os.path.exists(txt_file) and os.path.exists(wav_file): - with open(txt_file, "r", encoding="utf-8") as file_text: + with open(txt_file, encoding="utf-8") as file_text: text = file_text.readlines()[0] items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -500,7 +499,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - with open(meta_file, "r", encoding="utf-8") as file_text: + with open(meta_file, encoding="utf-8") as file_text: text = file_text.readline().replace("\n", "") # ignore sentences that contains digits if ignore_digits_sentences and any(map(str.isdigit, text)): @@ -513,7 +512,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno def mls(root_path, meta_files=None, ignored_speakers=None): """http://www.openslr.org/94/""" items = [] - with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: + with open(os.path.join(root_path, meta_files), encoding="utf-8") as meta: for line in meta: file, text = line.split("\t") text = text[:-1] @@ -553,7 +552,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): # if not exists meta file, crawl recursively for 'wav' files if meta_file is not None: - with open(str(meta_file), "r", encoding="utf-8") as f: + with open(str(meta_file), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] elif not cache_to.exists(): @@ -575,7 +574,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): if cnt < expected_count: raise ValueError(f"Found too few instances for Voxceleb. 
Should be around {expected_count}, is: {cnt}") - with open(str(cache_to), "r", encoding="utf-8") as f: + with open(str(cache_to), encoding="utf-8") as f: return [x.strip().split("|") for x in f.readlines()] @@ -583,7 +582,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): """Generic emotion dataset""" txt_file = os.path.join(root_path, meta_file) items = [] - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: if line.startswith("file_path"): continue @@ -601,7 +600,7 @@ def emotion(root_path, meta_file, ignored_speakers=None): return items -def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument +def baker(root_path: str, meta_file: str, **kwargs) -> list[list[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format Args: @@ -613,7 +612,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylin txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: wav_name, text = line.rstrip("\n").split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) @@ -626,7 +625,7 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kokoro" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") @@ -640,7 +639,7 @@ def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "kss" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) @@ -653,7 +652,7 @@ def bel_tts_formatter(root_path, meta_file, **kwargs): # pylint: disable=unused txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "bel_tts" - with open(txt_file, "r", encoding="utf-8") as ttf: + with open(txt_file, encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index 58a614cb87..87be97d5d1 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,13 +7,14 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py - import torch from einops import pack, unpack from torch import nn from torchaudio.functional import resample from transformers import HubertModel +from TTS.utils.generic_utils import exists + def round_down_nearest_multiple(num, divisor): return num // divisor * divisor @@ -26,14 +27,6 @@ def curtail_to_multiple(t, mult, from_left=False): return t[..., seq_slice] -def exists(val): - return val is not None - - -def default(val, d): - return val if exists(val) else d - - class CustomHubert(nn.Module): """ checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 65c7800dcf..457a20ea28 100644 --- 
a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -2,7 +2,6 @@ import os import re from glob import glob -from typing import Dict, List, Optional, Tuple import librosa import numpy as np @@ -34,9 +33,9 @@ def _normalize_whitespace(text): return re.sub(r"\s+", " ", text).strip() -def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value +def get_voices(extra_voice_dirs: list[str] = []): # pylint: disable=dangerous-default-value dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -49,7 +48,7 @@ def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-d return voices -def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: +def load_npz(npz_file: str) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]: x_history = np.load(npz_file) semantic = x_history["semantic_prompt"] coarse = x_history["coarse_prompt"] @@ -58,10 +57,8 @@ def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64 def load_voice( - model, voice: str, extra_voice_dirs: List[str] = [] -) -> Tuple[ - Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]] -]: # pylint: disable=dangerous-default-value + model, voice: str, extra_voice_dirs: list[str] = [] +) -> tuple[npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None, npt.NDArray[np.int64] | None]: # pylint: disable=dangerous-default-value if voice == "random": return None, None, None @@ -206,8 +203,8 @@ def generate_text_semantic( semantic_history = None encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET if len(encoded_text) > 256: - p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1) - logger.warning(f"warning, text too long, lopping of last {p}%") + p = (len(encoded_text) - 256) / len(encoded_text) * 100 + logger.warning("warning, text too long, lopping of last %.1f%%", p) encoded_text = encoded_text[:256] encoded_text = np.pad( encoded_text, diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index 6b7caab916..dcec5b5bbc 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -88,7 +88,7 @@ def clear_cuda_cache(): def load_model(ckpt_path, device, config, model_type="text"): - logger.info(f"loading {model_type} model from {ckpt_path}...") + logger.info("loading %s model from %s...", model_type, ckpt_path) if device == "cpu": logger.warning("No GPU being used. 
Careful, Inference might be extremely slow!") @@ -108,11 +108,13 @@ def load_model(ckpt_path, device, config, model_type="text"): and os.path.exists(ckpt_path) and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"] ): - logger.warning(f"found outdated {model_type} model, removing...") + logger.warning("found outdated %s model, removing...", model_type) os.remove(ckpt_path) if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading...") - _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR) + logger.info("%s model not found, downloading...", model_type) + # The URL in the config is a 404 and needs to be fixed + download_url = config.REMOTE_MODEL_PATHS[model_type]["path"].replace("tree", "resolve") + _download(download_url, ckpt_path, config.CACHE_DIR) checkpoint = torch.load(ckpt_path, map_location=device, weights_only=is_pytorch_at_least_2_4()) # this is a hack @@ -148,7 +150,7 @@ def load_model(ckpt_path, device, config, model_type="text"): model.load_state_dict(state_dict, strict=False) n_params = model.get_num_params() val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss") + logger.info("model loaded: %.1fM params, %.3f loss", n_params / 1e6, val_loss) model.eval() model.to(device) del checkpoint, state_dict diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index 68c50dbdbd..4850d0a88b 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -12,18 +12,6 @@ from torch.nn import functional as F -class LayerNorm(nn.Module): - """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False""" - - def __init__(self, ndim, bias): - super().__init__() - self.weight = nn.Parameter(torch.ones(ndim)) - self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None - - def forward(self, x): - return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5) - - class CausalSelfAttention(nn.Module): def __init__(self, config): super().__init__() @@ -119,9 +107,9 @@ def forward(self, x): class Block(nn.Module): def __init__(self, config, layer_idx): super().__init__() - self.ln_1 = LayerNorm(config.n_embd, bias=config.bias) + self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias) self.attn = CausalSelfAttention(config) - self.ln_2 = LayerNorm(config.n_embd, bias=config.bias) + self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias) self.mlp = MLP(config) self.layer_idx = layer_idx @@ -158,7 +146,7 @@ def __init__(self, config): wpe=nn.Embedding(config.block_size, config.n_embd), drop=nn.Dropout(config.dropout), h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]), - ln_f=LayerNorm(config.n_embd, bias=config.bias), + ln_f=nn.LayerNorm(config.n_embd, bias=config.bias), ) ) self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False) @@ -187,9 +175,9 @@ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use assert idx.shape[1] >= 256 + 256 + 1 t = idx.shape[1] - 256 else: - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) # forward the GPT model itself if merge_context: diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 29126b41ab..20f54d2152 100644 --- 
a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -101,9 +101,9 @@ def __init__(self, config): def forward(self, pred_idx, idx): device = idx.device b, t, codes = idx.size() - assert ( - t <= self.config.block_size - ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + assert t <= self.config.block_size, ( + f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + ) assert pred_idx > 0, "cannot predict 0th codebook" assert codes == self.n_codes_total, (b, t, codes) pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t) diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 3c0e3a3a76..9110ff5fd0 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -1,6 +1,6 @@ ### credit: https://github.com/dunky11/voicesmith import logging -from typing import Callable, Dict, Tuple +from collections.abc import Callable import torch import torch.nn.functional as F @@ -12,7 +12,6 @@ from TTS.tts.layers.delightful_tts.encoders import ( PhonemeLevelProsodyEncoder, UtteranceLevelProsodyEncoder, - get_mask_from_lengths, ) from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding @@ -20,7 +19,7 @@ from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor from TTS.tts.layers.generic.aligner import AlignmentNetwork -from TTS.tts.utils.helpers import generate_path, sequence_mask +from TTS.tts.utils.helpers import expand_encoder_outputs, generate_attention, sequence_mask logger = logging.getLogger(__name__) @@ -178,7 +177,7 @@ def init_multispeaker(self, args: Coqpit): # pylint: disable=unused-argument self._init_d_vector() @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -195,11 +194,11 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -231,42 +230,6 @@ def _init_d_vector(self): raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.") self.embedded_speaker_dim = self.args.d_vector_dim - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - """Generate an attention mask from the linear scale durations. - - Args: - dr (Tensor): Linear scale durations. - x_mask (Tensor): Mask for the input (character) sequence. - y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations - if None. Defaults to None. 
- - Shapes - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - """ - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def _expand_encoder_with_durations( - self, - o_en: torch.FloatTensor, - dr: torch.IntTensor, - x_mask: torch.IntTensor, - y_lengths: torch.IntTensor, - ): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.einsum("kmn, kjm -> kjn", [attn.float(), o_en]) - return y_mask, o_en_ex, attn.transpose(1, 2) - def _forward_aligner( self, x: torch.FloatTensor, @@ -274,7 +237,7 @@ def _forward_aligner( x_mask: torch.IntTensor, y_mask: torch.IntTensor, attn_priors: torch.FloatTensor, - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. Compute a mask to apply to the attention map. @@ -335,13 +298,13 @@ def forward( use_ground_truth: bool = True, d_vectors: torch.Tensor = None, speaker_idx: torch.Tensor = None, - ) -> Dict[str, torch.Tensor]: + ) -> dict[str, torch.Tensor]: sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} ) # pylint: disable=unused-variable - src_mask = get_mask_from_lengths(src_lens) # [B, T_src] - mel_mask = get_mask_from_lengths(mel_lens) # [B, T_mel] + src_mask = ~sequence_mask(src_lens) # [B, T_src] + mel_mask = ~sequence_mask(mel_lens) # [B, T_mel] # Token embeddings token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden] @@ -420,8 +383,8 @@ def forward( encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask) - mel_pred_mask, encoder_outputs_ex, alignments = self._expand_encoder_with_durations( - o_en=encoder_outputs, y_lengths=mel_lens, dr=dr, x_mask=~src_mask[:, None] + encoder_outputs_ex, alignments, mel_pred_mask = expand_encoder_outputs( + encoder_outputs, y_lengths=mel_lens, duration=dr, x_mask=~src_mask[:, None] ) x = self.decoder( @@ -435,7 +398,7 @@ def forward( dr = torch.log(dr + 1) dr_pred = torch.exp(log_duration_prediction) - 1 - alignments_dp = self.generate_attn(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2'] + alignments_dp = generate_attention(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2'] return { "model_outputs": x, @@ -448,7 +411,7 @@ def forward( "p_prosody_pred": p_prosody_pred, "p_prosody_ref": p_prosody_ref, "alignments_dp": alignments_dp, - "alignments": alignments, # [B, T_de, T_en] + "alignments": alignments.transpose(1, 2), # [B, T_de, T_en] "aligner_soft": aligner_soft, "aligner_mas": aligner_mas, "aligner_durations": aligner_durations, @@ -458,7 +421,7 @@ def forward( "spk_emb": speaker_embedding, } - @torch.no_grad() + @torch.inference_mode() def inference( self, tokens: torch.Tensor, @@ -469,7 +432,7 @@ def inference( pitch_transform: Callable = None, energy_transform: Callable = None, ) -> torch.Tensor: - src_mask = get_mask_from_lengths(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device)) + 
src_mask = ~sequence_mask(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device)) src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} @@ -536,11 +499,11 @@ def inference( duration_pred = torch.round(duration_pred) # -> [B, T_src] mel_lens = duration_pred.sum(1) # -> [B,] - _, encoder_outputs_ex, alignments = self._expand_encoder_with_durations( - o_en=encoder_outputs, y_lengths=mel_lens, dr=duration_pred.squeeze(1), x_mask=~src_mask[:, None] + encoder_outputs_ex, alignments, _ = expand_encoder_outputs( + encoder_outputs, y_lengths=mel_lens, duration=duration_pred.squeeze(1), x_mask=~src_mask[:, None] ) - mel_mask = get_mask_from_lengths( + mel_mask = ~sequence_mask( torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device) ) @@ -557,7 +520,7 @@ def inference( x = self.to_mel(x) outputs = { "model_outputs": x, - "alignments": alignments, + "alignments": alignments.transpose(1, 2), # "pitch": pitch_emb_pred, "durations": duration_pred, "pitch": pitch_pred, diff --git a/TTS/tts/layers/delightful_tts/conformer.py b/TTS/tts/layers/delightful_tts/conformer.py index b2175b3b96..227a871c69 100644 --- a/TTS/tts/layers/delightful_tts/conformer.py +++ b/TTS/tts/layers/delightful_tts/conformer.py @@ -1,20 +1,14 @@ ### credit: https://github.com/dunky11/voicesmith import math -from typing import Tuple import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d +from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d, calc_same_padding from TTS.tts.layers.delightful_tts.networks import GLUActivation -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - class Conformer(nn.Module): def __init__( self, @@ -322,7 +316,7 @@ def forward( value: torch.Tensor, mask: torch.Tensor, encoding: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable encoding = encoding[:, : key.shape[1]] encoding = encoding.repeat(batch_size, 1, 1) @@ -378,7 +372,7 @@ def forward( value: torch.Tensor, pos_embedding: torch.Tensor, mask: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: batch_size = query.shape[0] query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head) key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3) @@ -411,40 +405,3 @@ def _relative_shift(self, pos_score: torch.Tensor) -> torch.Tensor: # pylint: d padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1) pos_score = padded_pos_score[:, :, 1:].view_as(pos_score) return pos_score - - -class MultiHeadAttention(nn.Module): - """ - input: - query --- [N, T_q, query_dim] - key --- [N, T_k, key_dim] - output: - out --- [N, T_q, num_units] - """ - - def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int): - super().__init__() - self.num_units = num_units - self.num_heads = num_heads - self.key_dim = key_dim - - self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False) - 
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - - def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor: - querys = self.W_query(query) # [N, T_q, num_units] - keys = self.W_key(key) # [N, T_k, num_units] - values = self.W_value(key) - split_size = self.num_units // self.num_heads - querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h] - keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - # score = softmax(QK^T / (d_k ** 0.5)) - scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k] - scores = scores / (self.key_dim**0.5) - scores = F.softmax(scores, dim=3) - # out = score * V - out = torch.matmul(scores, values) # [h, N, T_q, num_units/h] - out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] - return out diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index fb9aa4495f..5cf41d4ff6 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -1,14 +1,9 @@ -from typing import Tuple - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -from torch.nn.utils import parametrize - -from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: +def calc_same_padding(kernel_size: int) -> tuple[int, int]: pad = kernel_size // 2 return (pad, pad - (kernel_size + 1) % 2) @@ -55,7 +50,7 @@ def __init__( w_init_gain="linear", use_weight_norm=False, ): - super(ConvNorm, self).__init__() # pylint: disable=super-with-arguments + super().__init__() if padding is None: assert kernel_size % 2 == 1 padding = int(dilation * (kernel_size - 1) / 2) @@ -97,7 +92,7 @@ def __init__( lstm_type="bilstm", use_linear=True, ): - super(ConvLSTMLinear, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.out_dim = out_dim self.lstm_type = lstm_type self.use_linear = use_linear @@ -530,142 +525,3 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.addcoords(x) x = self.conv(x) return x - - -class LVCBlock(torch.nn.Module): - """the location-variable convolutions""" - - def __init__( # pylint: disable=dangerous-default-value - self, - in_channels, - cond_channels, - stride, - dilations=[1, 3, 9, 27], - lReLU_slope=0.2, - conv_kernel_size=3, - cond_hop_length=256, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - ): - super().__init__() - - self.cond_hop_length = cond_hop_length - self.conv_layers = len(dilations) - self.conv_kernel_size = conv_kernel_size - - self.kernel_predictor = KernelPredictor( - cond_channels=cond_channels, - conv_in_channels=in_channels, - conv_out_channels=2 * in_channels, - conv_layers=len(dilations), - conv_kernel_size=conv_kernel_size, - kpnet_hidden_channels=kpnet_hidden_channels, - kpnet_conv_size=kpnet_conv_size, - kpnet_dropout=kpnet_dropout, - kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope}, - ) - - self.convt_pre = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.parametrizations.weight_norm( - nn.ConvTranspose1d( - in_channels, - in_channels, - 2 * stride, - stride=stride, - padding=stride // 2 + stride % 2, - 
output_padding=stride % 2, - ) - ), - ) - - self.conv_blocks = nn.ModuleList() - for dilation in dilations: - self.conv_blocks.append( - nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.parametrizations.weight_norm( - nn.Conv1d( - in_channels, - in_channels, - conv_kernel_size, - padding=dilation * (conv_kernel_size - 1) // 2, - dilation=dilation, - ) - ), - nn.LeakyReLU(lReLU_slope), - ) - ) - - def forward(self, x, c): - """forward propagation of the location-variable convolutions. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length) - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - - Returns: - Tensor: the output sequence (batch, in_channels, in_length) - """ - _, in_channels, _ = x.shape # (B, c_g, L') - - x = self.convt_pre(x) # (B, c_g, stride * L') - kernels, bias = self.kernel_predictor(c) - - for i, conv in enumerate(self.conv_blocks): - output = conv(x) # (B, c_g, stride * L') - - k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) - b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) - - output = self.location_variable_convolution( - output, k, b, hop_size=self.cond_hop_length - ) # (B, 2 * c_g, stride * L'): LVC - x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( - output[:, in_channels:, :] - ) # (B, c_g, stride * L'): GAU - - return x - - def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): # pylint: disable=no-self-use - """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. - Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length). - kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) - bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) - dilation (int): the dilation of convolution. - hop_size (int): the hop_size of the conditioning sequence. - Returns: - (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
- """ - batch, _, in_length = x.shape - batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" - - padding = dilation * int((kernel_size - 1) / 2) - x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) - x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) - - if hop_size < dilation: - x = F.pad(x, (0, dilation), "constant", 0) - x = x.unfold( - 3, dilation, dilation - ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) - x = x[:, :, :, :, :hop_size] - x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) - x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) - - o = torch.einsum("bildsk,biokl->bolsd", x, kernel) - o = o.to(memory_format=torch.channels_last_3d) - bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) - o = o + bias - o = o.contiguous().view(batch, out_channels, -1) - - return o - - def remove_weight_norm(self): - self.kernel_predictor.remove_weight_norm() - parametrize.remove_parametrizations(self.convt_pre[1], "weight") - for block in self.conv_blocks: - parametrize.remove_parametrizations(block[1], "weight") diff --git a/TTS/tts/layers/delightful_tts/encoders.py b/TTS/tts/layers/delightful_tts/encoders.py index 0878f0677a..31bab8cc97 100644 --- a/TTS/tts/layers/delightful_tts/encoders.py +++ b/TTS/tts/layers/delightful_tts/encoders.py @@ -1,5 +1,3 @@ -from typing import List, Tuple, Union - import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F @@ -7,14 +5,7 @@ from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d from TTS.tts.layers.delightful_tts.networks import STL - - -def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_len = torch.max(lengths).item() - ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1) - mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) - return mask +from TTS.tts.utils.helpers import sequence_mask def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor: @@ -43,9 +34,9 @@ class ReferenceEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, ): super().__init__() @@ -87,13 +78,13 @@ def __init__( batch_first=True, ) - def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ inputs --- [N, n_mels, timesteps] outputs --- [N, E//2] """ - mel_masks = get_mask_from_lengths(mel_lens).unsqueeze(1) + mel_masks = ~sequence_mask(mel_lens).unsqueeze(1) x = x.masked_fill(mel_masks, 0) for conv, norm in zip(self.convs, self.norms): x = conv(x) @@ -103,7 +94,7 @@ def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor for _ in range(2): mel_lens = stride_lens(mel_lens) - mel_masks = 
get_mask_from_lengths(mel_lens) + mel_masks = ~sequence_mask(mel_lens) x = x.masked_fill(mel_masks.unsqueeze(1), 0) x = x.permute((0, 2, 1)) @@ -127,9 +118,9 @@ class UtteranceLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, @@ -199,9 +190,9 @@ class PhonemeLevelProsodyEncoder(nn.Module): def __init__( self, num_mels: int, - ref_enc_filters: List[Union[int, int, int, int, int, int]], + ref_enc_filters: list[int | int | int | int | int | int], ref_enc_size: int, - ref_enc_strides: List[Union[int, int, int, int, int]], + ref_enc_strides: list[int | int | int | int | int], ref_enc_gru_size: int, dropout: float, n_hidden: int, diff --git a/TTS/tts/layers/delightful_tts/energy_adaptor.py b/TTS/tts/layers/delightful_tts/energy_adaptor.py index ea0d1e4721..d2b4b0ffa8 100644 --- a/TTS/tts/layers/delightful_tts/energy_adaptor.py +++ b/TTS/tts/layers/delightful_tts/energy_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -59,7 +59,7 @@ def __init__( def get_energy_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/delightful_tts/kernel_predictor.py b/TTS/tts/layers/delightful_tts/kernel_predictor.py deleted file mode 100644 index 96c550b6c2..0000000000 --- a/TTS/tts/layers/delightful_tts/kernel_predictor.py +++ /dev/null @@ -1,128 +0,0 @@ -import torch.nn as nn # pylint: disable=consider-using-from-import -from torch.nn.utils import parametrize - - -class KernelPredictor(nn.Module): - """Kernel predictor for the location-variable convolutions - - Args: - cond_channels (int): number of channel for the conditioning sequence, - conv_in_channels (int): number of channel for the input sequence, - conv_out_channels (int): number of channel for the output sequence, - conv_layers (int): number of layers - - """ - - def __init__( # pylint: disable=dangerous-default-value - self, - cond_channels, - conv_in_channels, - conv_out_channels, - conv_layers, - conv_kernel_size=3, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - kpnet_nonlinear_activation="LeakyReLU", - kpnet_nonlinear_activation_params={"negative_slope": 0.1}, - ): - super().__init__() - - self.conv_in_channels = conv_in_channels - self.conv_out_channels = conv_out_channels - self.conv_kernel_size = conv_kernel_size - self.conv_layers = conv_layers - - kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w - kpnet_bias_channels = conv_out_channels * conv_layers # l_b - - self.input_conv = nn.Sequential( - nn.utils.parametrizations.weight_norm( - nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - - self.residual_convs = nn.ModuleList() - padding = (kpnet_conv_size - 1) // 2 - for _ in range(3): - self.residual_convs.append( - nn.Sequential( - nn.Dropout(kpnet_dropout), - 
nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - ) - self.kernel_conv = nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_kernel_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - self.bias_conv = nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_bias_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - - def forward(self, c): - """ - Args: - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - """ - batch, _, cond_length = c.shape - c = self.input_conv(c) - for residual_conv in self.residual_convs: - residual_conv.to(c.device) - c = c + residual_conv(c) - k = self.kernel_conv(c) - b = self.bias_conv(c) - kernels = k.contiguous().view( - batch, - self.conv_layers, - self.conv_in_channels, - self.conv_out_channels, - self.conv_kernel_size, - cond_length, - ) - bias = b.contiguous().view( - batch, - self.conv_layers, - self.conv_out_channels, - cond_length, - ) - - return kernels, bias - - def remove_weight_norm(self): - parametrize.remove_parametrizations(self.input_conv[0], "weight") - parametrize.remove_parametrizations(self.kernel_conv, "weight") - parametrize.remove_parametrizations(self.bias_conv, "weight") - for block in self.residual_convs: - parametrize.remove_parametrizations(block[1], "weight") - parametrize.remove_parametrizations(block[3], "weight") diff --git a/TTS/tts/layers/delightful_tts/networks.py b/TTS/tts/layers/delightful_tts/networks.py index 4305022f18..93b65a2a74 100644 --- a/TTS/tts/layers/delightful_tts/networks.py +++ b/TTS/tts/layers/delightful_tts/networks.py @@ -1,5 +1,4 @@ import math -from typing import Tuple import numpy as np import torch @@ -9,7 +8,7 @@ from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: +def initialize_embeddings(shape: tuple[int]) -> torch.Tensor: assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..." 
# Kaiming initialization return torch.randn(shape) * np.sqrt(2 / shape[1]) @@ -52,7 +51,7 @@ def __init__( kernel_size=3, use_partial_padding=False, # pylint: disable=unused-argument ): - super(BottleneckLayer, self).__init__() # pylint: disable=super-with-arguments + super().__init__() self.reduction_factor = reduction_factor reduced_dim = int(in_dim / reduction_factor) @@ -195,7 +194,7 @@ class STL(nn.Module): """ def __init__(self, n_hidden: int, token_num: int): - super(STL, self).__init__() # pylint: disable=super-with-arguments + super().__init__() num_heads = 1 E = n_hidden diff --git a/TTS/tts/layers/delightful_tts/pitch_adaptor.py b/TTS/tts/layers/delightful_tts/pitch_adaptor.py index 9031369e0f..14e751d2e2 100644 --- a/TTS/tts/layers/delightful_tts/pitch_adaptor.py +++ b/TTS/tts/layers/delightful_tts/pitch_adaptor.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple +from collections.abc import Callable import torch import torch.nn as nn # pylint: disable=consider-using-from-import @@ -58,7 +58,7 @@ def __init__( def get_pitch_embedding_train( self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Shapes: x: :math: `[B, T_src, C]` diff --git a/TTS/tts/layers/feed_forward/encoder.py b/TTS/tts/layers/feed_forward/encoder.py index caf939ffc7..2d08f03c2d 100644 --- a/TTS/tts/layers/feed_forward/encoder.py +++ b/TTS/tts/layers/feed_forward/encoder.py @@ -143,9 +143,9 @@ def __init__( elif encoder_type.lower() == "residual_conv_bn": self.encoder = ResidualConv1dBNEncoder(in_hidden_channels, out_channels, in_hidden_channels, encoder_params) elif encoder_type.lower() == "fftransformer": - assert ( - in_hidden_channels == out_channels - ), "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + assert in_hidden_channels == out_channels, ( + "[!] must be `in_channels` == `out_channels` when encoder type is 'fftransformer'" + ) # pylint: disable=unexpected-keyword-arg self.encoder = FFTransformerBlock(in_hidden_channels, **encoder_params) else: diff --git a/TTS/tts/layers/generic/aligner.py b/TTS/tts/layers/generic/aligner.py index baa6f0e9c4..480c48f9a4 100644 --- a/TTS/tts/layers/generic/aligner.py +++ b/TTS/tts/layers/generic/aligner.py @@ -1,5 +1,3 @@ -from typing import Tuple - import torch from torch import nn @@ -68,7 +66,7 @@ def init_layers(self): def forward( self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None - ) -> Tuple[torch.tensor, torch.tensor]: + ) -> tuple[torch.tensor, torch.tensor]: """Forward pass of the aligner encoder. 
Shapes: - queries: :math:`[B, C, T_de]` diff --git a/TTS/tts/layers/generic/pos_encoding.py b/TTS/tts/layers/generic/pos_encoding.py index 913add0d14..7765e224aa 100644 --- a/TTS/tts/layers/generic/pos_encoding.py +++ b/TTS/tts/layers/generic/pos_encoding.py @@ -18,9 +18,7 @@ class PositionalEncoding(nn.Module): def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False): super().__init__() if channels % 2 != 0: - raise ValueError( - "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels) - ) + raise ValueError(f"Cannot use sin/cos positional encoding with odd channels (got channels={channels:d})") self.use_scale = use_scale if use_scale: self.scale = torch.nn.Parameter(torch.ones(1)) diff --git a/TTS/tts/layers/generic/transformer.py b/TTS/tts/layers/generic/transformer.py index 9b7ecee2ba..2fe9bcc408 100644 --- a/TTS/tts/layers/generic/transformer.py +++ b/TTS/tts/layers/generic/transformer.py @@ -70,9 +70,7 @@ def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument class FFTDurationPredictor: - def __init__( - self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None - ): # pylint: disable=unused-argument + def __init__(self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None): # pylint: disable=unused-argument self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p) self.proj = nn.Linear(in_channels, 1) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 5ebed81dda..1e744d62cf 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -309,6 +309,24 @@ def forward(self, attn_logprob, in_lens, out_lens): return total_loss +class NLLLoss(nn.Module): + """Negative log likelihood loss.""" + + def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use + """Compute the loss. 
+ + Args: + logits (Tensor): [B, T, D] + + Returns: + Tensor: [1] + + """ + return_dict = {} + return_dict["loss"] = -log_prob.mean() + return return_dict + + ######################## # MODEL LOSS LAYERS ######################## @@ -619,6 +637,28 @@ def forward( return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} +def feature_loss(feats_real, feats_generated): + loss = 0 + for dr, dg in zip(feats_real, feats_generated): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + return loss * 2 + + +def generator_loss(scores_fake): + loss = 0 + gen_losses = [] + for dg in scores_fake: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + class VitsGeneratorLoss(nn.Module): def __init__(self, c: Coqpit): super().__init__() @@ -640,28 +680,6 @@ def __init__(self, c: Coqpit): do_amp_to_db=True, ) - @staticmethod - def feature_loss(feats_real, feats_generated): - loss = 0 - for dr, dg in zip(feats_real, feats_generated): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - return loss * 2 - - @staticmethod - def generator_loss(scores_fake): - loss = 0 - gen_losses = [] - for dg in scores_fake: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - @staticmethod def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): """ @@ -722,10 +740,8 @@ def forward( self.kl_loss(z_p=z_p, logs_q=logs_q, m_p=m_p, logs_p=logs_p, z_mask=z_mask.unsqueeze(1)) * self.kl_loss_alpha ) - loss_feat = ( - self.feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha - ) - loss_gen = self.generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha + loss_feat = feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha + loss_gen = generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha loss_mel = torch.nn.functional.l1_loss(mel_slice, mel_slice_hat) * self.mel_loss_alpha loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration @@ -779,6 +795,15 @@ def forward(self, scores_disc_real, scores_disc_fake): return return_dict +def _binary_alignment_loss(alignment_hard, alignment_soft): + """Binary loss that forces soft alignments to match the hard alignments. + + Explained in `https://arxiv.org/pdf/2108.10447.pdf`. + """ + log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() + return -log_sum / alignment_hard.sum() + + class ForwardTTSLoss(nn.Module): """Generic configurable ForwardTTS loss.""" @@ -789,7 +814,7 @@ def __init__(self, c): elif c.spec_loss_type == "l1": self.spec_loss = L1LossMasked(False) else: - raise ValueError(" [!] Unknown spec_loss_type {}".format(c.spec_loss_type)) + raise ValueError(f" [!] Unknown spec_loss_type {c.spec_loss_type}") if c.duration_loss_type == "mse": self.dur_loss = MSELossMasked(False) @@ -798,7 +823,7 @@ def __init__(self, c): elif c.duration_loss_type == "huber": self.dur_loss = Huber() else: - raise ValueError(" [!] Unknown duration_loss_type {}".format(c.duration_loss_type)) + raise ValueError(f" [!] 
Unknown duration_loss_type {c.duration_loss_type}") if c.model_args.use_aligner: self.aligner_loss = ForwardSumLoss() @@ -820,14 +845,6 @@ def __init__(self, c): self.dur_loss_alpha = c.dur_loss_alpha self.binary_alignment_loss_alpha = c.binary_align_loss_alpha - @staticmethod - def _binary_alignment_loss(alignment_hard, alignment_soft): - """Binary loss that forces soft alignments to match the hard alignments as - explained in `https://arxiv.org/pdf/2108.10447.pdf`. - """ - log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() - return -log_sum / alignment_hard.sum() - def forward( self, decoder_output, @@ -879,7 +896,7 @@ def forward( return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None: - binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft) + binary_alignment_loss = _binary_alignment_loss(alignment_hard, alignment_soft) loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss if binary_loss_weight: return_dict["loss_binary_alignment"] = ( diff --git a/TTS/tts/layers/overflow/common_layers.py b/TTS/tts/layers/overflow/common_layers.py index 9f77af293c..a477b34f0b 100644 --- a/TTS/tts/layers/overflow/common_layers.py +++ b/TTS/tts/layers/overflow/common_layers.py @@ -1,5 +1,4 @@ import logging -from typing import List, Tuple import torch import torch.nn.functional as F @@ -44,7 +43,7 @@ def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutio ) self.rnn_state = None - def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> Tuple[torch.FloatTensor, torch.LongTensor]: + def forward(self, x: torch.FloatTensor, x_len: torch.LongTensor) -> tuple[torch.FloatTensor, torch.LongTensor]: """Forward pass to the encoder. Args: @@ -110,7 +109,7 @@ class ParameterModel(nn.Module): def __init__( self, - outputnet_size: List[int], + outputnet_size: list[int], input_size: int, output_size: int, frame_channels: int, @@ -152,7 +151,7 @@ def __init__( encoder_dim: int, memory_rnn_dim: int, frame_channels: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float = 1e-2, ): diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py index a12becef03..9142f65e8c 100644 --- a/TTS/tts/layers/overflow/neural_hmm.py +++ b/TTS/tts/layers/overflow/neural_hmm.py @@ -1,5 +1,3 @@ -from typing import List - import torch import torch.distributions as tdist import torch.nn.functional as F @@ -57,7 +55,7 @@ def __init__( prenet_dropout: float, prenet_dropout_at_inference: bool, memory_rnn_dim: int, - outputnet_size: List[int], + outputnet_size: list[int], flat_start_params: dict, std_floor: float, use_grad_checkpointing: bool = True, diff --git a/TTS/tts/layers/tacotron/capacitron_layers.py b/TTS/tts/layers/tacotron/capacitron_layers.py index 2181ffa7ec..817f42771b 100644 --- a/TTS/tts/layers/tacotron/capacitron_layers.py +++ b/TTS/tts/layers/tacotron/capacitron_layers.py @@ -3,6 +3,8 @@ from torch.distributions.multivariate_normal import MultivariateNormal as MVN from torch.nn import functional as F +from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height + class CapacitronVAE(nn.Module): """Effective Use of Variational Embedding Capacity for prosody transfer. 
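# --- A minimal sketch (not part of the patch) of the module-level _binary_alignment_loss
# --- that the losses.py hunk above now calls instead of the former static method.
# --- Shapes and values below are toy assumptions for illustration only.
import torch


def _binary_alignment_loss(alignment_hard, alignment_soft):
    """Force soft alignments towards the hard (MAS) alignments, as in arXiv:2108.10447."""
    log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum()
    return -log_sum / alignment_hard.sum()


# Hard alignment: one text token selected per decoder frame (one-hot rows), [B, T_de, T_en].
alignment_hard = torch.tensor([[[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]])
# Soft alignment: attention probabilities over text tokens per frame, same shape.
alignment_soft = torch.tensor([[[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]]])
loss = _binary_alignment_loss(alignment_hard, alignment_soft)
# -(ln 0.9 + ln 0.8 + ln 0.6) / 3 ≈ 0.28; the loss shrinks as the soft attention
# concentrates its mass on the hard alignment path.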
@@ -97,7 +99,7 @@ def __init__(self, num_mel, out_dim): self.training = False self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]]) - post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers) + post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 2, num_layers) self.recurrence = nn.LSTM( input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False ) @@ -155,13 +157,6 @@ def forward(self, inputs, input_lengths): return last_output.to(inputs.device) # [B, 128] - @staticmethod - def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs): - """Height of spec after n convolutions with fixed kernel/stride/pad.""" - for _ in range(n_convs): - height = (height - kernel_size + 2 * pad) // stride + 1 - return height - class TextSummary(nn.Module): def __init__(self, embedding_dim, encoder_output_dim): diff --git a/TTS/tts/layers/tacotron/common_layers.py b/TTS/tts/layers/tacotron/common_layers.py index f78ff1e75f..16e517fdca 100644 --- a/TTS/tts/layers/tacotron/common_layers.py +++ b/TTS/tts/layers/tacotron/common_layers.py @@ -3,6 +3,13 @@ from torch.nn import functional as F +def calculate_post_conv_height(height: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int: + """Height of spec after n convolutions with fixed kernel/stride/pad.""" + for _ in range(n_convs): + height = (height - kernel_size + 2 * pad) // stride + 1 + return height + + class Linear(nn.Module): """Linear layer with a specific initialization. diff --git a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index 05dba7084f..4a83fb1c83 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -2,6 +2,8 @@ import torch.nn.functional as F from torch import nn +from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height + class GST(nn.Module): """Global Style Token Module for factorizing prosody in speech. 
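# --- A minimal sketch (not part of the patch) of the shared calculate_post_conv_height()
# --- helper that the Capacitron and GST reference encoders now import from common_layers.
# --- The 80-band mel input and the six stride-2 convolutions are illustrative assumptions.
def calculate_post_conv_height(height: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int:
    """Height of spec after n convolutions with fixed kernel/stride/pad."""
    for _ in range(n_convs):
        height = (height - kernel_size + 2 * pad) // stride + 1
    return height


assert calculate_post_conv_height(80, 3, 2, 1, 6) == 2  # GST call site uses pad=1
assert calculate_post_conv_height(80, 3, 2, 2, 6) == 4  # Capacitron call site uses pad=2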
@@ -44,7 +46,7 @@ def __init__(self, num_mel, embedding_dim): self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]]) - post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 1, num_layers) + post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 1, num_layers) self.recurrence = nn.GRU( input_size=filters[-1] * post_conv_height, hidden_size=embedding_dim // 2, batch_first=True ) @@ -71,13 +73,6 @@ def forward(self, inputs): return out.squeeze(0) - @staticmethod - def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs): - """Height of spec after n convolutions with fixed kernel/stride/pad.""" - for _ in range(n_convs): - height = (height - kernel_size + 2 * pad) // stride + 1 - return height - class StyleTokenLayer(nn.Module): """NN Module attending to style tokens based on prosody encodings.""" @@ -117,7 +112,7 @@ class MultiHeadAttention(nn.Module): out --- [N, T_q, num_units] """ - def __init__(self, query_dim, key_dim, num_units, num_heads): + def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int): super().__init__() self.num_units = num_units self.num_heads = num_heads @@ -127,7 +122,7 @@ def __init__(self, query_dim, key_dim, num_units, num_heads): self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - def forward(self, query, key): + def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor: queries = self.W_query(query) # [N, T_q, num_units] keys = self.W_key(key) # [N, T_k, num_units] values = self.W_value(key) @@ -137,13 +132,11 @@ def forward(self, query, key): keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - # score = softmax(QK^T / (d_k**0.5)) + # score = softmax(QK^T / (d_k ** 0.5)) scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k] scores = scores / (self.key_dim**0.5) scores = F.softmax(scores, dim=3) # out = score * V out = torch.matmul(scores, values) # [h, N, T_q, num_units/h] - out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] - - return out + return torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 32643dfcee..6f33edf3d7 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -1,4 +1,3 @@ -# coding: utf-8 # adapted from https://github.com/r9y9/tacotron_pytorch import logging diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index 8eda251f93..508699fee3 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F import torchaudio -from transformers import LogitsWarper +from transformers import LogitsProcessor from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, RelativePositionBias from TTS.utils.generic_utils import is_pytorch_at_least_2_4 @@ -70,11 +70,10 @@ def forward(self, qkv, mask=None, rel_pos=None): weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape( bs * self.n_heads, weight.shape[-2], weight.shape[-1] ) - weight = 
torch.softmax(weight.float(), dim=-1).type(weight.dtype) if mask is not None: - # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs. - mask = mask.repeat(self.n_heads, 1).unsqueeze(1) - weight = weight * mask + mask = mask.repeat(self.n_heads, 1, 1) + weight[mask.logical_not()] = -torch.inf + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) a = torch.einsum("bts,bcs->bct", weight, v) return a.reshape(bs, -1, length) @@ -93,23 +92,24 @@ def __init__( channels, num_heads=1, num_head_channels=-1, - do_checkpoint=True, + *, relative_pos_embeddings=False, + tortoise_norm=False, ): super().__init__() self.channels = channels - self.do_checkpoint = do_checkpoint if num_head_channels == -1: self.num_heads = num_heads else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + assert channels % num_head_channels == 0, ( + f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + ) self.num_heads = channels // num_head_channels self.norm = normalization(channels) self.qkv = nn.Conv1d(channels, channels * 3, 1) # split heads before split qkv self.attention = QKVAttentionLegacy(self.num_heads) + self.tortoise_norm = tortoise_norm self.proj_out = zero_module(nn.Conv1d(channels, channels, 1)) if relative_pos_embeddings: @@ -126,10 +126,13 @@ def __init__( def forward(self, x, mask=None): b, c, *spatial = x.shape x = x.reshape(b, c, -1) - qkv = self.qkv(self.norm(x)) + x_norm = self.norm(x) + qkv = self.qkv(x_norm) h = self.attention(qkv, mask, self.relative_pos_embeddings) h = self.proj_out(h) - return (x + h).reshape(b, c, *spatial) + if self.tortoise_norm: + return (x + h).reshape(b, c, *spatial) + return (x_norm + h).reshape(b, c, *spatial) class Upsample(nn.Module): @@ -185,114 +188,6 @@ def forward(self, x): return self.op(x) -class ResBlock(nn.Module): - def __init__( - self, - channels, - dropout, - out_channels=None, - use_conv=False, - use_scale_shift_norm=False, - up=False, - down=False, - kernel_size=3, - ): - super().__init__() - self.channels = channels - self.dropout = dropout - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_scale_shift_norm = use_scale_shift_norm - padding = 1 if kernel_size == 3 else 2 - - self.in_layers = nn.Sequential( - normalization(channels), - nn.SiLU(), - nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding), - ) - - self.updown = up or down - - if up: - self.h_upd = Upsample(channels, False) - self.x_upd = Upsample(channels, False) - elif down: - self.h_upd = Downsample(channels, False) - self.x_upd = Downsample(channels, False) - else: - self.h_upd = self.x_upd = nn.Identity() - - self.out_layers = nn.Sequential( - normalization(self.out_channels), - nn.SiLU(), - nn.Dropout(p=dropout), - zero_module(nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding)), - ) - - if self.out_channels == channels: - self.skip_connection = nn.Identity() - elif use_conv: - self.skip_connection = nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding) - else: - self.skip_connection = nn.Conv1d(channels, self.out_channels, 1) - - def forward(self, x): - if self.updown: - in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] - h = in_rest(x) - h = self.h_upd(h) - x = self.x_upd(x) - h = in_conv(h) - else: - h = self.in_layers(x) - h = self.out_layers(h) - return self.skip_connection(x) + h - - -class 
AudioMiniEncoder(nn.Module): - def __init__( - self, - spec_dim, - embedding_dim, - base_channels=128, - depth=2, - resnet_blocks=2, - attn_blocks=4, - num_attn_heads=4, - dropout=0, - downsample_factor=2, - kernel_size=3, - ): - super().__init__() - self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1)) - ch = base_channels - res = [] - for l in range(depth): - for r in range(resnet_blocks): - res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) - res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor)) - ch *= 2 - self.res = nn.Sequential(*res) - self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1)) - attn = [] - for a in range(attn_blocks): - attn.append( - AttentionBlock( - embedding_dim, - num_attn_heads, - ) - ) - self.attn = nn.Sequential(*attn) - self.dim = embedding_dim - - def forward(self, x): - h = self.init(x) - h = self.res(h) - h = self.final(h) - h = self.attn(h) - return h[:, :, 0] - - DEFAULT_MEL_NORM_FILE = "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/mel_norms.pth" @@ -397,7 +292,7 @@ def forward(self, x, **kwargs): return h -class TypicalLogitsWarper(LogitsWarper): +class TypicalLogitsWarper(LogitsProcessor): def __init__( self, mass: float = 0.9, diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index 4f299a8fd9..6bbe6c389c 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -1,7 +1,6 @@ import logging import os from glob import glob -from typing import Dict, List import librosa import numpy as np @@ -9,7 +8,7 @@ import torchaudio from scipy.io.wavfile import read -from TTS.utils.audio.torch_transforms import TorchSTFT +from TTS.utils.audio.torch_transforms import TorchSTFT, amp_to_db from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -88,27 +87,9 @@ def normalize_tacotron_mel(mel): return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 -def dynamic_range_compression(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def get_voices(extra_voice_dirs: List[str] = []): +def get_voices(extra_voice_dirs: list[str] = []): dirs = extra_voice_dirs - voices: Dict[str, List[str]] = {} + voices: dict[str, list[str]] = {} for d in dirs: subs = os.listdir(d) for sub in subs: @@ -118,7 +99,7 @@ def get_voices(extra_voice_dirs: List[str] = []): return voices -def load_voice(voice: str, extra_voice_dirs: List[str] = []): +def load_voice(voice: str, extra_voice_dirs: list[str] = []): if voice == "random": return None, None @@ -134,7 +115,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []): return conds, None -def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): +def load_voices(voices: list[str], extra_voice_dirs: list[str] = []): latents = [] clips = [] for voice in voices: @@ -144,14 +125,14 @@ def load_voices(voices: List[str], extra_voice_dirs: List[str] = []): return None, None clip, latent = load_voice(voice, extra_voice_dirs) if latent is None: - assert ( - len(latents) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." 
+ assert len(latents) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) clips.extend(clip) elif clip is None: - assert ( - len(clips) == 0 - ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + assert len(clips) == 0, ( + "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + ) latents.append(latent) if len(latents) == 0: return clips, None @@ -175,7 +156,7 @@ def wav_to_univnet_mel(wav, do_normalization=False, device="cuda"): ) stft = stft.to(device) mel = stft(wav) - mel = dynamic_range_compression(mel) + mel = amp_to_db(mel) if do_normalization: mel = normalize_tacotron_mel(mel) return mel diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index aaae695516..eaeb2a03c1 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -1,6 +1,6 @@ # AGPL: a notification must be added stating that changes have been made to that file. import functools -from typing import Optional +import random import torch import torch.nn as nn @@ -123,7 +123,7 @@ def forward( else: emb = self.embeddings(input_ids) emb = emb + self.text_pos_embedding.get_fixed_embedding( - attention_mask.shape[1] - mel_len, attention_mask.device + attention_mask.shape[1] - (mel_len + 1), attention_mask.device ) transformer_outputs = self.transformer( @@ -175,44 +175,56 @@ def __init__( embedding_dim, attn_blocks=6, num_attn_heads=4, - do_checkpointing=False, - mean=False, + *, + tortoise_norm=False, ): super().__init__() attn = [] self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1) for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads)) + attn.append(AttentionBlock(embedding_dim, num_attn_heads, tortoise_norm=tortoise_norm)) self.attn = nn.Sequential(*attn) self.dim = embedding_dim - self.do_checkpointing = do_checkpointing - self.mean = mean def forward(self, x): + """ + x: (b, 80, s) + """ h = self.init(x) h = self.attn(h) - if self.mean: - return h.mean(dim=2) - else: - return h[:, :, 0] + return h class LearnedPositionEmbeddings(nn.Module): - def __init__(self, seq_len, model_dim, init=0.02): + def __init__(self, seq_len, model_dim, init=0.02, relative=False): super().__init__() self.emb = nn.Embedding(seq_len, model_dim) # Initializing this way is standard for GPT-2 self.emb.weight.data.normal_(mean=0.0, std=init) + self.relative = relative + self.seq_len = seq_len def forward(self, x): sl = x.shape[1] - return self.emb(torch.arange(0, sl, device=x.device)) + if self.relative: + start = random.randint(sl, self.seq_len) - sl + return self.emb(torch.arange(start, start + sl, device=x.device)) + else: + return self.emb(torch.arange(0, sl, device=x.device)) def get_fixed_embedding(self, ind, dev): - return self.emb(torch.arange(0, ind, device=dev))[ind - 1 : ind] - - -def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing): + return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) + + +def build_hf_gpt_transformer( + layers: int, + model_dim: int, + heads: int, + max_mel_seq_len: int, + max_text_seq_len: int, + checkpointing: bool, + max_prompt_len: int = 0, +): """ GPT-2 implemented by the HuggingFace library. """ @@ -220,8 +232,8 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text gpt_config = GPT2Config( vocab_size=256, # Unused. 
- n_positions=max_mel_seq_len + max_text_seq_len, - n_ctx=max_mel_seq_len + max_text_seq_len, + n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len, + n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len, n_embd=model_dim, n_layer=layers, n_head=heads, @@ -234,13 +246,18 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) # Built-in token embeddings are unused. del gpt.wte - return ( - gpt, - LearnedPositionEmbeddings(max_mel_seq_len, model_dim), - LearnedPositionEmbeddings(max_text_seq_len, model_dim), - None, - None, + + mel_pos_emb = ( + LearnedPositionEmbeddings(max_mel_seq_len, model_dim) + if max_mel_seq_len != -1 + else functools.partial(null_position_embeddings, dim=model_dim) ) + text_pos_emb = ( + LearnedPositionEmbeddings(max_text_seq_len, model_dim) + if max_mel_seq_len != -1 + else functools.partial(null_position_embeddings, dim=model_dim) + ) + return gpt, mel_pos_emb, text_pos_emb, None, None class MelEncoder(nn.Module): @@ -334,12 +351,12 @@ def __init__( self.mel_layer_pos_embedding, self.text_layer_pos_embedding, ) = build_hf_gpt_transformer( - layers, - model_dim, - heads, - self.max_mel_tokens + 2 + self.max_conditioning_inputs, - self.max_text_tokens + 2, - checkpointing, + layers=layers, + model_dim=model_dim, + heads=heads, + max_mel_seq_len=self.max_mel_tokens + 2 + self.max_conditioning_inputs, + max_text_seq_len=self.max_text_tokens + 2, + checkpointing=checkpointing, ) if train_solo_embeddings: self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True) @@ -455,7 +472,7 @@ def get_conditioning(self, speech_conditioning_input): ) conds = [] for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])[:, :, 0]) conds = torch.stack(conds, dim=1) conds = conds.mean(dim=1) return conds @@ -591,9 +608,9 @@ def inference_speech( if input_tokens is None: inputs = fake_inputs else: - assert ( - num_return_sequences % input_tokens.shape[0] == 0 - ), "The number of return sequences must be divisible by the number of input sequences" + assert num_return_sequences % input_tokens.shape[0] == 0, ( + "The number of return sequences must be divisible by the number of input sequences" + ) fake_inputs = fake_inputs.repeat(num_return_sequences, 1) input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1) inputs = torch.cat([fake_inputs, input_tokens], dim=1) @@ -622,8 +639,8 @@ def inference_speech( def _prepare_attention_mask_for_generation( inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + pad_token_id: torch.Tensor | None, + eos_token_id: torch.Tensor | None, ) -> torch.LongTensor: # No information for attention mask inference -> return default attention mask default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) diff --git a/TTS/tts/layers/tortoise/classifier.py b/TTS/tts/layers/tortoise/classifier.py index 8764bb070b..337323db67 100644 --- a/TTS/tts/layers/tortoise/classifier.py +++ b/TTS/tts/layers/tortoise/classifier.py @@ -16,7 +16,6 @@ def __init__( up=False, down=False, kernel_size=3, - do_checkpoint=True, ): super().__init__() self.channels = channels @@ -24,7 +23,6 @@ def __init__( self.out_channels = out_channels or channels self.use_conv = use_conv 
self.use_scale_shift_norm = use_scale_shift_norm - self.do_checkpoint = do_checkpoint padding = 1 if kernel_size == 3 else 2 self.in_layers = nn.Sequential( @@ -92,14 +90,14 @@ def __init__( self.layers = depth for l in range(depth): for r in range(resnet_blocks): - res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size)) + res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor)) ch *= 2 self.res = nn.Sequential(*res) self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1)) attn = [] for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)) + attn.append(AttentionBlock(embedding_dim, num_attn_heads, tortoise_norm=True)) self.attn = nn.Sequential(*attn) self.dim = embedding_dim diff --git a/TTS/tts/layers/tortoise/clvp.py b/TTS/tts/layers/tortoise/clvp.py index 241dfdd4f4..44da1324e7 100644 --- a/TTS/tts/layers/tortoise/clvp.py +++ b/TTS/tts/layers/tortoise/clvp.py @@ -8,10 +8,6 @@ from TTS.tts.layers.tortoise.xtransformers import Encoder -def exists(val): - return val is not None - - def masked_mean(t, mask, dim=1): t = t.masked_fill(~mask[:, :, None], 0.0) return t.sum(dim=1) / mask.sum(dim=1)[..., None] diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index 2b29091b44..cfb8fa800d 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -653,7 +653,7 @@ def p_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: @@ -805,7 +805,7 @@ def ddim_sample_loop_progressive( """ if device is None: device = next(model.parameters()).device - assert isinstance(shape, (tuple, list)) + assert isinstance(shape, tuple | list) if noise is not None: img = noise else: diff --git a/TTS/tts/layers/tortoise/diffusion_decoder.py b/TTS/tts/layers/tortoise/diffusion_decoder.py index f71eaf1718..cfdeaff8bb 100644 --- a/TTS/tts/layers/tortoise/diffusion_decoder.py +++ b/TTS/tts/layers/tortoise/diffusion_decoder.py @@ -130,7 +130,7 @@ def __init__(self, model_channels, dropout, num_heads): dims=1, use_scale_shift_norm=True, ) - self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True) + self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True) def forward(self, x, time_emb): y = self.resblk(x, time_emb) @@ -177,17 +177,17 @@ def __init__( # transformer network. 
self.code_embedding = nn.Embedding(in_tokens, model_channels) self.code_converter = nn.Sequential( - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), ) self.code_norm = normalization(model_channels) self.latent_conditioner = nn.Sequential( nn.Conv1d(in_latent_channels, model_channels, 3, padding=1), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), - AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), + AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True, tortoise_norm=True), ) self.contextual_embedder = nn.Sequential( nn.Conv1d(in_channels, model_channels, 3, padding=1, stride=2), @@ -196,31 +196,31 @@ def __init__( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, + tortoise_norm=True, ), ) self.unconditioned_embedding = nn.Parameter(torch.randn(1, model_channels, 1)) diff --git a/TTS/tts/layers/tortoise/dpm_solver.py b/TTS/tts/layers/tortoise/dpm_solver.py index 6a1d8ff784..c8892d456a 100644 --- a/TTS/tts/layers/tortoise/dpm_solver.py +++ b/TTS/tts/layers/tortoise/dpm_solver.py @@ -98,9 +98,7 @@ def __init__( if schedule not in ["discrete", "linear", "cosine"]: raise ValueError( - "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format( - schedule - ) + f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'" ) self.schedule = schedule @@ -150,7 +148,7 @@ def marginal_log_mean_coeff(self, t): t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device), - ).reshape((-1)) + ).reshape(-1) elif self.schedule == "linear": return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 elif self.schedule == "cosine": @@ -447,7 +445,7 @@ def correcting_xt_fn(xt, t, step): Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. 
""" - self.model = lambda x, t: model_fn(x, t.expand((x.shape[0]))) + self.model = lambda x, t: model_fn(x, t.expand(x.shape[0])) self.noise_schedule = noise_schedule assert algorithm_type in ["dpmsolver", "dpmsolver++"] self.algorithm_type = algorithm_type @@ -527,7 +525,7 @@ def get_time_steps(self, skip_type, t_T, t_0, N, device): return t else: raise ValueError( - "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type) + f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'" ) def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): @@ -565,41 +563,21 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type if order == 3: K = steps // 3 + 1 if steps % 3 == 0: - orders = [ - 3, - ] * ( - K - 2 - ) + [2, 1] + orders = [3] * (K - 2) + [2, 1] elif steps % 3 == 1: - orders = [ - 3, - ] * ( - K - 1 - ) + [1] + orders = [3] * (K - 1) + [1] else: - orders = [ - 3, - ] * ( - K - 1 - ) + [2] + orders = [3] * (K - 1) + [2] elif order == 2: if steps % 2 == 0: K = steps // 2 - orders = [ - 2, - ] * K + orders = [2] * K else: K = steps // 2 + 1 - orders = [ - 2, - ] * ( - K - 1 - ) + [1] + orders = [2] * (K - 1) + [1] elif order == 1: K = 1 - orders = [ - 1, - ] * steps + orders = [1] * steps else: raise ValueError("'order' must be '1' or '2' or '3'.") if skip_type == "logSNR": @@ -607,15 +585,7 @@ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) else: timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[ - torch.cumsum( - torch.tensor( - [ - 0, - ] - + orders - ), - 0, - ).to(device) + torch.cumsum(torch.tensor([0] + orders), 0).to(device) ] return timesteps_outer, orders @@ -693,7 +663,7 @@ def singlestep_dpm_solver_second_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 0.5 ns = self.noise_schedule @@ -790,7 +760,7 @@ def singlestep_dpm_solver_third_update( x_t: A pytorch tensor. The approximated solution at time `t`. """ if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") if r1 is None: r1 = 1.0 / 3.0 if r2 is None: @@ -913,7 +883,7 @@ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, x_t: A pytorch tensor. The approximated solution at time `t`. 
""" if solver_type not in ["dpmsolver", "taylor"]: - raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type)) + raise ValueError(f"'solver_type' must be either 'dpmsolver' or 'taylor', got {solver_type}") ns = self.noise_schedule model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1] t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1] @@ -1062,7 +1032,7 @@ def singlestep_dpm_solver_update( r2=r2, ) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type="dpmsolver"): """ @@ -1086,7 +1056,7 @@ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, elif order == 3: return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + raise ValueError(f"Solver order must be 1 or 2 or 3, got {order}") def dpm_solver_adaptive( self, @@ -1150,8 +1120,8 @@ def higher_update(x, s, t, **kwargs): return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs) else: - raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order)) - while torch.abs((s - t_0)).mean() > t_err: + raise ValueError(f"For adaptive step size solver, order must be 2 or 3, got {order}") + while torch.abs(s - t_0).mean() > t_err: t = ns.inverse_lambda(lambda_s + h) x_lower, lower_noise_kwargs = lower_update(x, s, t) x_higher = higher_update(x, s, t, **lower_noise_kwargs) @@ -1219,9 +1189,9 @@ def inverse( """ t_0 = 1.0 / self.noise_schedule.total_N if t_start is None else t_start t_T = self.noise_schedule.T if t_end is None else t_end - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) return self.sample( x, steps=steps, @@ -1364,9 +1334,9 @@ def sample( """ t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end t_T = self.noise_schedule.T if t_start is None else t_start - assert ( - t_0 > 0 and t_T > 0 - ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + assert t_0 > 0 and t_T > 0, ( + "Time range needs to be greater than 0. 
For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array" + ) if return_intermediate: assert method in [ "multistep", @@ -1487,7 +1457,7 @@ def sample( if return_intermediate: intermediates.append(x) else: - raise ValueError("Got wrong method {}".format(method)) + raise ValueError(f"Got wrong method {method}") if denoise_to_zero: t = torch.ones((1,)).to(device) * t_0 x = self.denoise_to_zero_fn(x, t) diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index 6cb1bab96a..531f294220 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -1,22 +1,19 @@ +from typing import TypeVar + import torch import torch.nn.functional as F from einops import rearrange from torch import nn -# helpers - - -def exists(val): - return val is not None +from TTS.utils.generic_utils import exists - -def default(val, d): - return val if exists(val) else d +# helpers +_T = TypeVar("_T") -def cast_tuple(val, depth=1): +def cast_tuple(val: tuple[_T] | list[_T] | _T, depth: int = 1) -> tuple[_T]: if isinstance(val, list): - val = tuple(val) + return tuple(val) return val if isinstance(val, tuple) else (val,) * depth @@ -46,9 +43,9 @@ def route_args(router, args, depth): class SequentialSequence(nn.Module): def __init__(self, layers, args_route={}, layer_dropout=0.0): super().__init__() - assert all( - len(route) == len(layers) for route in args_route.values() - ), "each argument route map must have the same depth as the number of sequential layers" + assert all(len(route) == len(layers) for route in args_route.values()), ( + "each argument route map must have the same depth as the number of sequential layers" + ) self.layers = layers self.args_route = args_route self.layer_dropout = layer_dropout diff --git a/TTS/tts/layers/tortoise/vocoder.py b/TTS/tts/layers/tortoise/vocoder.py index a5200c2673..e7497d8190 100644 --- a/TTS/tts/layers/tortoise/vocoder.py +++ b/TTS/tts/layers/tortoise/vocoder.py @@ -1,6 +1,6 @@ +from collections.abc import Callable from dataclasses import dataclass from enum import Enum -from typing import Callable, Optional import torch import torch.nn as nn @@ -293,7 +293,7 @@ def __init__( hop_length=256, n_mel_channels=100, ): - super(UnivNetGenerator, self).__init__() + super().__init__() self.mel_channel = n_mel_channels self.noise_dim = noise_dim self.hop_length = hop_length @@ -344,7 +344,7 @@ def forward(self, c, z): return z def eval(self, inference=False): - super(UnivNetGenerator, self).eval() + super().eval() # don't remove weight norm while validation in training loop if inference: self.remove_weight_norm() @@ -378,7 +378,7 @@ def inference(self, c, z=None): class VocType: constructor: Callable[[], nn.Module] model_path: str - subkey: Optional[str] = None + subkey: str | None = None def optionally_index(self, model_dict): if self.subkey is not None: diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 9325b8c720..b2e74cf118 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -1,13 +1,15 @@ import math from collections import namedtuple from functools import partial -from inspect import isfunction import torch import torch.nn.functional as F from einops import rearrange, repeat from torch import einsum, nn +from TTS.tts.layers.tortoise.transformer import cast_tuple, max_neg_value +from TTS.utils.generic_utils import default, exists + DEFAULT_DIM_HEAD = 64 Intermediates = 
namedtuple("Intermediates", ["pre_softmax_attn", "post_softmax_attn"]) @@ -25,20 +27,6 @@ # helpers -def exists(val): - return val is not None - - -def default(val, d): - if exists(val): - return val - return d() if isfunction(d) else d - - -def cast_tuple(val, depth): - return val if isinstance(val, tuple) else (val,) * depth - - class always: def __init__(self, val): self.val = val @@ -63,10 +51,6 @@ def __call__(self, x, *args, **kwargs): return x == self.val -def max_neg_value(tensor): - return -torch.finfo(tensor.dtype).max - - def l2norm(t): return F.normalize(t, p=2, dim=-1) @@ -576,9 +560,9 @@ def __init__( self.rel_pos_bias = rel_pos_bias if rel_pos_bias: - assert ( - rel_pos_num_buckets <= rel_pos_max_distance - ), "number of relative position buckets must be less than the relative position max distance" + assert rel_pos_num_buckets <= rel_pos_max_distance, ( + "number of relative position buckets must be less than the relative position max distance" + ) self.rel_pos = RelativePositionBias( scale=dim_head**0.5, causal=causal, @@ -696,9 +680,9 @@ def forward( del input_mask if exists(attn_mask): - assert ( - 2 <= attn_mask.ndim <= 4 - ), "attention mask must have greater than 2 dimensions but less than or equal to 4" + assert 2 <= attn_mask.ndim <= 4, ( + "attention mask must have greater than 2 dimensions but less than or equal to 4" + ) if attn_mask.ndim == 2: attn_mask = rearrange(attn_mask, "i j -> () () i j") elif attn_mask.ndim == 3: @@ -806,9 +790,9 @@ def __init__( rotary_emb_dim = max(default(rotary_emb_dim, dim_head // 2), 32) self.rotary_pos_emb = RotaryEmbedding(rotary_emb_dim) if rotary_pos_emb else None - assert not ( - alibi_pos_bias and rel_pos_bias - ), "you can only choose Alibi positional bias or T5 relative positional bias, not both" + assert not (alibi_pos_bias and rel_pos_bias), ( + "you can only choose Alibi positional bias or T5 relative positional bias, not both" + ) if alibi_pos_bias: alibi_num_heads = default(alibi_num_heads, heads) @@ -938,9 +922,9 @@ def forward( past_key_values=None, expected_seq_len=None, ): - assert not ( - self.cross_attend ^ (exists(context) or exists(full_context)) - ), "context must be passed in if cross_attend is set to True" + assert not (self.cross_attend ^ (exists(context) or exists(full_context))), ( + "context must be passed in if cross_attend is set to True" + ) assert context is None or full_context is None, "only one of full_context or context can be provided" hiddens = [] @@ -956,9 +940,9 @@ def forward( rotary_pos_emb = None if exists(self.rotary_pos_emb): if not self.training and self.causal: - assert ( - expected_seq_len is not None - ), "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + assert expected_seq_len is not None, ( + "To decode a transformer with rotary embeddings, you must specify an `expected_seq_len`" + ) elif expected_seq_len is None: expected_seq_len = 0 seq_len = x.shape[1] diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py index 3449739fdc..49f7a0d074 100644 --- a/TTS/tts/layers/vits/discriminator.py +++ b/TTS/tts/layers/vits/discriminator.py @@ -2,7 +2,7 @@ from torch import nn from torch.nn.modules.conv import Conv1d -from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP +from TTS.vocoder.models.hifigan_discriminator import LRELU_SLOPE, DiscriminatorP class DiscriminatorS(torch.nn.Module): @@ -39,7 +39,7 @@ def forward(self, x): feat = [] for l in self.convs: x = l(x) - x = 
torch.nn.functional.leaky_relu(x, 0.1) + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) feat.append(x) x = self.conv_post(x) feat.append(x) diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index 50ed1024de..ab2ca5667a 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -256,7 +256,7 @@ def __init__( ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, x, x_lengths, g=None): + def forward(self, x, x_lengths, g=None, tau=1.0): """ Shapes: - x: :math:`[B, C, T]` @@ -268,5 +268,5 @@ def forward(self, x, x_lengths, g=None): x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask mean, log_scale = torch.split(stats, self.out_channels, dim=1) - z = (mean + torch.randn_like(mean) * torch.exp(log_scale)) * x_mask + z = (mean + torch.randn_like(mean) * tau * torch.exp(log_scale)) * x_mask return z, mean, log_scale, x_mask diff --git a/TTS/tts/layers/vits/transforms.py b/TTS/tts/layers/vits/transforms.py index 3cac1b8d6d..da5deea9ef 100644 --- a/TTS/tts/layers/vits/transforms.py +++ b/TTS/tts/layers/vits/transforms.py @@ -74,7 +74,7 @@ def unconstrained_rational_quadratic_spline( outputs[outside_interval_mask] = inputs[outside_interval_mask] logabsdet[outside_interval_mask] = 0 else: - raise RuntimeError("{} tails are not implemented.".format(tails)) + raise RuntimeError(f"{tails} tails are not implemented.") outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( inputs=inputs[inside_interval_mask], diff --git a/TTS/tts/layers/xtts/dvae.py b/TTS/tts/layers/xtts/dvae.py index 73970fb0bf..4f806f82cb 100644 --- a/TTS/tts/layers/xtts/dvae.py +++ b/TTS/tts/layers/xtts/dvae.py @@ -14,10 +14,6 @@ logger = logging.getLogger(__name__) -def default(val, d): - return val if val is not None else d - - def eval_decorator(fn): def inner(model, *args, **kwargs): was_training = model.training diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index b3c3b31b47..4e0f53616d 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -1,6 +1,5 @@ # ported from: https://github.com/neonbjb/tortoise-tts -import functools import random import torch @@ -8,83 +7,16 @@ import torch.nn.functional as F from transformers import GPT2Config -from TTS.tts.layers.tortoise.autoregressive import _prepare_attention_mask_for_generation +from TTS.tts.layers.tortoise.autoregressive import ( + ConditioningEncoder, + LearnedPositionEmbeddings, + _prepare_attention_mask_for_generation, + build_hf_gpt_transformer, +) from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel -from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler -def null_position_embeddings(range, dim): - return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device) - - -class LearnedPositionEmbeddings(nn.Module): - def __init__(self, seq_len, model_dim, init=0.02, relative=False): - super().__init__() - # nn.Embedding - self.emb = torch.nn.Embedding(seq_len, model_dim) - # Initializing this way is standard for GPT-2 - self.emb.weight.data.normal_(mean=0.0, std=init) - self.relative = relative - self.seq_len = seq_len - - def forward(self, x): - sl = x.shape[1] - if self.relative: - start = random.randint(sl, self.seq_len) - sl - return self.emb(torch.arange(start, start + sl, device=x.device)) - else: - return self.emb(torch.arange(0, sl, device=x.device)) - - def get_fixed_embedding(self, ind, dev): - 
return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) - - -def build_hf_gpt_transformer( - layers, - model_dim, - heads, - max_mel_seq_len, - max_text_seq_len, - max_prompt_len, - checkpointing, -): - """ - GPT-2 implemented by the HuggingFace library. - """ - from transformers import GPT2Config, GPT2Model - - gpt_config = GPT2Config( - vocab_size=256, # Unused. - n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len, - n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len, - n_embd=model_dim, - n_layer=layers, - n_head=heads, - gradient_checkpointing=checkpointing, - use_cache=not checkpointing, - ) - gpt = GPT2Model(gpt_config) - # Override the built in positional embeddings - del gpt.wpe - gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) - # Built-in token embeddings are unused. - del gpt.wte - - mel_pos_emb = ( - LearnedPositionEmbeddings(max_mel_seq_len, model_dim) - if max_mel_seq_len != -1 - else functools.partial(null_position_embeddings, dim=model_dim) - ) - text_pos_emb = ( - LearnedPositionEmbeddings(max_text_seq_len, model_dim) - if max_mel_seq_len != -1 - else functools.partial(null_position_embeddings, dim=model_dim) - ) - # gpt = torch.compile(gpt, mode="reduce-overhead", fullgraph=True) - return gpt, mel_pos_emb, text_pos_emb, None, None - - class GPT(nn.Module): def __init__( self, @@ -149,13 +81,13 @@ def __init__( self.mel_layer_pos_embedding, self.text_layer_pos_embedding, ) = build_hf_gpt_transformer( - layers, - model_dim, - heads, - self.max_mel_tokens, - self.max_text_tokens, - self.max_prompt_tokens, - checkpointing, + layers=layers, + model_dim=model_dim, + heads=heads, + max_mel_seq_len=self.max_mel_tokens, + max_text_seq_len=self.max_text_tokens, + max_prompt_len=self.max_prompt_tokens, + checkpointing=checkpointing, ) if train_solo_embeddings: self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True) @@ -303,19 +235,6 @@ def get_logits( else: return first_logits - def get_conditioning(self, speech_conditioning_input): - speech_conditioning_input = ( - speech_conditioning_input.unsqueeze(1) - if len(speech_conditioning_input.shape) == 3 - else speech_conditioning_input - ) - conds = [] - for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) - conds = torch.stack(conds, dim=1) - conds = conds.mean(dim=1) - return conds - def get_prompts(self, prompt_codes): """ Create a prompt from the mel codes. This is used to condition the model on the mel codes. 
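Note on the xtts/gpt.py hunks above: the XTTS GPT drops its local copies of ConditioningEncoder, LearnedPositionEmbeddings, build_hf_gpt_transformer, and get_conditioning and reuses the Tortoise implementations, calling the shared factory with keyword arguments. A minimal sketch of that call is below; the dimensions are hypothetical placeholders, not values from any released XTTS configuration.

# Sketch only, assuming the shared factory shown earlier in this patch.
from TTS.tts.layers.tortoise.autoregressive import build_hf_gpt_transformer

gpt, mel_pos_emb, text_pos_emb, _, _ = build_hf_gpt_transformer(
    layers=8,              # hypothetical transformer depth
    model_dim=512,         # hypothetical embedding width
    heads=8,               # hypothetical attention heads
    max_mel_seq_len=604,   # hypothetical; passing -1 swaps in null (all-zero) position embeddings
    max_text_seq_len=402,  # hypothetical
    checkpointing=False,
    max_prompt_len=70,     # hypothetical; widens n_positions/n_ctx to fit the conditioning prompt
)

The factory returns the GPT-2 body plus learned mel/text position embeddings, so callers no longer build their own position-embedding modules.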
@@ -354,6 +273,7 @@ def get_style_emb(self, cond_input, return_latent=False): """ cond_input: (b, 80, s) or (b, 1, 80, s) conds: (b, 1024, s) + output: (b, 1024, 32) """ conds = None if not return_latent: @@ -427,12 +347,12 @@ def forward( audio_codes = F.pad(audio_codes, (0, max_mel_len - audio_codes.shape[-1])) # 💖 Lovely assertions - assert ( - max_mel_len <= audio_codes.shape[-1] - ), f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" - assert ( - max_text_len <= text_inputs.shape[-1] - ), f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + assert max_mel_len <= audio_codes.shape[-1], ( + f" ❗ max_mel_len ({max_mel_len}) > audio_codes.shape[-1] ({audio_codes.shape[-1]})" + ) + assert max_text_len <= text_inputs.shape[-1], ( + f" ❗ max_text_len ({max_text_len}) > text_inputs.shape[-1] ({text_inputs.shape[-1]})" + ) # Append stop token to text inputs text_inputs = F.pad(text_inputs[:, :max_text_len], (0, 1), value=self.stop_text_token) @@ -534,9 +454,9 @@ def forward( mel_targets[idx, l + 1 :] = -1 # check if stoptoken is in every row of mel_targets - assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[ - 0 - ], f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + assert (mel_targets == self.stop_audio_token).sum() >= mel_targets.shape[0], ( + f" ❗ mel_targets does not contain stop token ({self.stop_audio_token}) in every row." + ) # ignore the loss for the segment used for conditioning # coin flip for the segment to be ignored diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 5ef0030b8b..550ad3e3b2 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -1,618 +1,13 @@ import logging import torch -import torchaudio -from torch import nn -from torch.nn import Conv1d, ConvTranspose1d -from torch.nn import functional as F -from torch.nn.utils.parametrizations import weight_norm -from torch.nn.utils.parametrize import remove_parametrizations from trainer.io import load_fsspec -from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -from TTS.vocoder.models.hifigan_generator import get_padding +from TTS.encoder.models.resnet import ResNetSpeakerEncoder +from TTS.vocoder.models.hifigan_generator import HifiganGenerator logger = logging.getLogger(__name__) -LRELU_SLOPE = 0.1 - - -class ResBlock1(torch.nn.Module): - """Residual Block Type 1. It has 3 convolutional layers in each convolutional block. - - Network:: - - x -> lrelu -> conv1_1 -> conv1_2 -> conv1_3 -> z -> lrelu -> conv2_1 -> conv2_2 -> conv2_3 -> o -> + -> o - |--------------------------------------------------------------------------------------------------| - - - Args: - channels (int): number of hidden channels for the convolutional layers. - kernel_size (int): size of the convolution filter in each layer. - dilations (list): list of dilation value for each conv layer in a block. 
- """ - - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super().__init__() - self.convs1 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), - ] - ) - - self.convs2 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - ] - ) - - def forward(self, x): - """ - Args: - x (Tensor): input tensor. - Returns: - Tensor: output tensor. - Shapes: - x: [B, C, T] - """ - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - xt = c2(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for l in self.convs1: - remove_parametrizations(l, "weight") - for l in self.convs2: - remove_parametrizations(l, "weight") - - -class ResBlock2(torch.nn.Module): - """Residual Block Type 2. It has 1 convolutional layers in each convolutional block. - - Network:: - - x -> lrelu -> conv1-> -> z -> lrelu -> conv2-> o -> + -> o - |---------------------------------------------------| - - - Args: - channels (int): number of hidden channels for the convolutional layers. - kernel_size (int): size of the convolution filter in each layer. - dilations (list): list of dilation value for each conv layer in a block. - """ - - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super().__init__() - self.convs = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - ] - ) - - def forward(self, x): - for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_parametrizations(l, "weight") - - -class HifiganGenerator(torch.nn.Module): - def __init__( - self, - in_channels, - out_channels, - resblock_type, - resblock_dilation_sizes, - resblock_kernel_sizes, - upsample_kernel_sizes, - upsample_initial_channel, - upsample_factors, - inference_padding=5, - cond_channels=0, - conv_pre_weight_norm=True, - conv_post_weight_norm=True, - conv_post_bias=True, - cond_in_each_up_layer=False, - ): - r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) - - Network: - x -> lrelu -> upsampling_layer -> resblock1_k1x1 -> z1 -> + -> z_sum / #resblocks -> lrelu -> conv_post_7x1 -> tanh -> o - .. -> zI ---| - resblockN_kNx1 -> zN ---' - - Args: - in_channels (int): number of input tensor channels. - out_channels (int): number of output tensor channels. - resblock_type (str): type of the `ResBlock`. '1' or '2'. 
- resblock_dilation_sizes (List[List[int]]): list of dilation values in each layer of a `ResBlock`. - resblock_kernel_sizes (List[int]): list of kernel sizes for each `ResBlock`. - upsample_kernel_sizes (List[int]): list of kernel sizes for each transposed convolution. - upsample_initial_channel (int): number of channels for the first upsampling layer. This is divided by 2 - for each consecutive upsampling layer. - upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. - inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. - """ - super().__init__() - self.inference_padding = inference_padding - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_factors) - self.cond_in_each_up_layer = cond_in_each_up_layer - - # initial upsampling layers - self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) - resblock = ResBlock1 if resblock_type == "1" else ResBlock2 - # upsampling layers - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_factors, upsample_kernel_sizes)): - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - # MRF blocks - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.resblocks.append(resblock(ch, k, d)) - # post convolution layer - self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3, bias=conv_post_bias)) - if cond_channels > 0: - self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1) - - if not conv_pre_weight_norm: - remove_parametrizations(self.conv_pre, "weight") - - if not conv_post_weight_norm: - remove_parametrizations(self.conv_post, "weight") - - if self.cond_in_each_up_layer: - self.conds = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - self.conds.append(nn.Conv1d(cond_channels, ch, 1)) - - def forward(self, x, g=None): - """ - Args: - x (Tensor): feature input tensor. - g (Tensor): global conditioning input tensor. - - Returns: - Tensor: output waveform. - - Shapes: - x: [B, C, T] - Tensor: [B, 1, T] - """ - o = self.conv_pre(x) - if hasattr(self, "cond_layer"): - o = o + self.cond_layer(g) - for i in range(self.num_upsamples): - o = F.leaky_relu(o, LRELU_SLOPE) - o = self.ups[i](o) - - if self.cond_in_each_up_layer: - o = o + self.conds[i](g) - - z_sum = None - for j in range(self.num_kernels): - if z_sum is None: - z_sum = self.resblocks[i * self.num_kernels + j](o) - else: - z_sum += self.resblocks[i * self.num_kernels + j](o) - o = z_sum / self.num_kernels - o = F.leaky_relu(o) - o = self.conv_post(o) - o = torch.tanh(o) - return o - - @torch.no_grad() - def inference(self, c): - """ - Args: - x (Tensor): conditioning input tensor. - - Returns: - Tensor: output waveform. 
- - Shapes: - x: [B, C, T] - Tensor: [B, 1, T] - """ - c = c.to(self.conv_pre.weight.device) - c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") - return self.forward(c) - - def remove_weight_norm(self): - logger.info("Removing weight norm...") - for l in self.ups: - remove_parametrizations(l, "weight") - for l in self.resblocks: - l.remove_weight_norm() - remove_parametrizations(self.conv_pre, "weight") - remove_parametrizations(self.conv_post, "weight") - - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4()) - self.load_state_dict(state["model"]) - if eval: - self.eval() - assert not self.training - self.remove_weight_norm() - - -class SELayer(nn.Module): - def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Sequential( - nn.Linear(channel, channel // reduction), - nn.ReLU(inplace=True), - nn.Linear(channel // reduction, channel), - nn.Sigmoid(), - ) - - def forward(self, x): - b, c, _, _ = x.size() - y = self.avg_pool(x).view(b, c) - y = self.fc(y).view(b, c, 1, 1) - return x * y - - -class SEBasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.se = SELayer(planes, reduction) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.relu(out) - out = self.bn1(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.se(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - return out - - -def set_init_dict(model_dict, checkpoint_state, c): - # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint_state.items(): - if k not in model_dict: - logger.warning("Layer missing in the model definition: %s", k) - # 1. filter out unnecessary keys - pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} - # 2. filter out different size layers - pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()} - # 3. skip reinit layers - if c.has("reinit_layers") and c.reinit_layers is not None: - for reinit_layer_name in c.reinit_layers: - pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} - # 4. 
overwrite entries in the existing state dict - model_dict.update(pretrained_dict) - logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) - return model_dict - - -class PreEmphasis(nn.Module): - def __init__(self, coefficient=0.97): - super().__init__() - self.coefficient = coefficient - self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) - - def forward(self, x): - assert len(x.size()) == 2 - - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") - return torch.nn.functional.conv1d(x, self.filter).squeeze(1) - - -class ResNetSpeakerEncoder(nn.Module): - """This is copied from 🐸TTS to remove it from the dependencies.""" - - # pylint: disable=W0102 - def __init__( - self, - input_dim=64, - proj_dim=512, - layers=[3, 4, 6, 3], - num_filters=[32, 64, 128, 256], - encoder_type="ASP", - log_input=False, - use_torch_spec=False, - audio_config=None, - ): - super(ResNetSpeakerEncoder, self).__init__() - - self.encoder_type = encoder_type - self.input_dim = input_dim - self.log_input = log_input - self.use_torch_spec = use_torch_spec - self.audio_config = audio_config - self.proj_dim = proj_dim - - self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) - self.relu = nn.ReLU(inplace=True) - self.bn1 = nn.BatchNorm2d(num_filters[0]) - - self.inplanes = num_filters[0] - self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) - self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) - self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) - self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) - - self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) - - else: - self.torch_spec = None - - outmap_size = int(self.input_dim / 8) - - self.attention = nn.Sequential( - nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), - nn.ReLU(), - nn.BatchNorm1d(128), - nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), - nn.Softmax(dim=2), - ) - - if self.encoder_type == "SAP": - out_dim = num_filters[3] * outmap_size - elif self.encoder_type == "ASP": - out_dim = num_filters[3] * outmap_size * 2 - else: - raise ValueError("Undefined encoder") - - self.fc = nn.Linear(out_dim, proj_dim) - - self._init_layers() - - def _init_layers(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def create_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - return 
nn.Sequential(*layers) - - # pylint: disable=R0201 - def new_parameter(self, *size): - out = nn.Parameter(torch.FloatTensor(*size)) - nn.init.xavier_normal_(out) - return out - - def forward(self, x, l2_norm=False): - """Forward pass of the model. - - Args: - x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` - to compute the spectrogram on-the-fly. - l2_norm (bool): Whether to L2-normalize the outputs. - - Shapes: - - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` - """ - x.squeeze_(1) - # if you torch spec compute it otherwise use the mel spec computed by the AP - if self.use_torch_spec: - x = self.torch_spec(x) - - if self.log_input: - x = (x + 1e-6).log() - x = self.instancenorm(x).unsqueeze(1) - - x = self.conv1(x) - x = self.relu(x) - x = self.bn1(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = x.reshape(x.size()[0], -1, x.size()[-1]) - - w = self.attention(x) - - if self.encoder_type == "SAP": - x = torch.sum(x * w, dim=2) - elif self.encoder_type == "ASP": - mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5)) - x = torch.cat((mu, sg), 1) - - x = x.view(x.size()[0], -1) - x = self.fc(x) - - if l2_norm: - x = torch.nn.functional.normalize(x, p=2, dim=1) - return x - - def load_checkpoint( - self, - checkpoint_path: str, - eval: bool = False, - use_cuda: bool = False, - criterion=None, - cache=False, - ): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) - try: - self.load_state_dict(state["model"]) - logger.info("Model fully restored.") - except (KeyError, RuntimeError) as error: - # If eval raise the error - if eval: - raise error - - logger.info("Partial model initialization.") - model_dict = self.state_dict() - model_dict = set_init_dict(model_dict, state["model"]) - self.load_state_dict(model_dict) - del model_dict - - # load the criterion for restore_path - if criterion is not None and "criterion" in state: - try: - criterion.load_state_dict(state["criterion"]) - except (KeyError, RuntimeError) as error: - logger.exception("Criterion load ignored because of: %s", error) - - if use_cuda: - self.cuda() - if criterion is not None: - criterion = criterion.cuda() - - if eval: - self.eval() - assert not self.training - - if not eval: - return criterion, state["step"] - return criterion - class HifiDecoder(torch.nn.Module): def __init__( @@ -702,7 +97,7 @@ def forward(self, latents, g=None): o = self.waveform_decoder(z, g=g) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c, g): """ Args: diff --git a/TTS/tts/layers/xtts/latent_encoder.py b/TTS/tts/layers/xtts/latent_encoder.py deleted file mode 100644 index f9d62a36f1..0000000000 --- a/TTS/tts/layers/xtts/latent_encoder.py +++ /dev/null @@ -1,141 +0,0 @@ -# ported from: Originally ported from: https://github.com/neonbjb/tortoise-tts - -import math - -import torch -from torch import nn -from torch.nn import functional as F - - -class GroupNorm32(nn.GroupNorm): - def forward(self, x): - return super().forward(x.float()).type(x.dtype) - - -def conv_nd(dims, *args, **kwargs): - if dims == 1: - return nn.Conv1d(*args, **kwargs) - elif dims == 2: - return nn.Conv2d(*args, **kwargs) - elif dims == 3: - return nn.Conv3d(*args, **kwargs) - raise ValueError(f"unsupported dimensions: {dims}") - - -def normalization(channels): - groups = 32 - if channels <= 16: - groups = 8 - elif channels <= 64: - groups = 16 - while 
channels % groups != 0: - groups = int(groups / 2) - assert groups > 2 - return GroupNorm32(groups, channels) - - -def zero_module(module): - for p in module.parameters(): - p.detach().zero_() - return module - - -class QKVAttention(nn.Module): - def __init__(self, n_heads): - super().__init__() - self.n_heads = n_heads - - def forward(self, qkv, mask=None, qk_bias=0): - """ - Apply QKV attention. - - :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. - :return: an [N x (H * C) x T] tensor after attention. - """ - bs, width, length = qkv.shape - assert width % (3 * self.n_heads) == 0 - ch = width // (3 * self.n_heads) - q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) - scale = 1 / math.sqrt(math.sqrt(ch)) - weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards - weight = weight + qk_bias - if mask is not None: - mask = mask.repeat(self.n_heads, 1, 1) - weight[mask.logical_not()] = -torch.inf - weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) - a = torch.einsum("bts,bcs->bct", weight, v) - - return a.reshape(bs, -1, length) - - -class AttentionBlock(nn.Module): - """An attention block that allows spatial positions to attend to each other.""" - - def __init__( - self, - channels, - num_heads=1, - num_head_channels=-1, - out_channels=None, - do_activation=False, - ): - super().__init__() - self.channels = channels - out_channels = channels if out_channels is None else out_channels - self.do_activation = do_activation - if num_head_channels == -1: - self.num_heads = num_heads - else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" - self.num_heads = channels // num_head_channels - self.norm = normalization(channels) - self.qkv = conv_nd(1, channels, out_channels * 3, 1) - self.attention = QKVAttention(self.num_heads) - - self.x_proj = nn.Identity() if out_channels == channels else conv_nd(1, channels, out_channels, 1) - self.proj_out = zero_module(conv_nd(1, out_channels, out_channels, 1)) - - def forward(self, x, mask=None, qk_bias=0): - b, c, *spatial = x.shape - if mask is not None: - if len(mask.shape) == 2: - mask = mask.unsqueeze(0).repeat(x.shape[0], 1, 1) - if mask.shape[1] != x.shape[-1]: - mask = mask[:, : x.shape[-1], : x.shape[-1]] - - x = x.reshape(b, c, -1) - x = self.norm(x) - if self.do_activation: - x = F.silu(x, inplace=True) - qkv = self.qkv(x) - h = self.attention(qkv, mask=mask, qk_bias=qk_bias) - h = self.proj_out(h) - xp = self.x_proj(x) - return (xp + h).reshape(b, xp.shape[1], *spatial) - - -class ConditioningEncoder(nn.Module): - def __init__( - self, - spec_dim, - embedding_dim, - attn_blocks=6, - num_attn_heads=4, - ): - super().__init__() - attn = [] - self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1) - for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads)) - self.attn = nn.Sequential(*attn) - self.dim = embedding_dim - - def forward(self, x): - """ - x: (b, 80, s) - """ - h = self.init(x) - h = self.attn(h) - return h diff --git a/TTS/tts/layers/xtts/perceiver_encoder.py b/TTS/tts/layers/xtts/perceiver_encoder.py index f4b6e84123..7477087283 100644 --- a/TTS/tts/layers/xtts/perceiver_encoder.py +++ b/TTS/tts/layers/xtts/perceiver_encoder.py @@ -9,9 +9,8 @@ from einops.layers.torch import Rearrange from torch import einsum, nn - -def exists(val): - return val is not None +from TTS.tts.layers.tortoise.transformer import 
GEGLU +from TTS.utils.generic_utils import default, exists def once(fn): @@ -151,12 +150,6 @@ def Sequential(*mods): return nn.Sequential(*filter(exists, mods)) -def default(val, d): - if exists(val): - return val - return d() if callable(d) else d - - class RMSNorm(nn.Module): def __init__(self, dim, scale=True, dim_cond=None): super().__init__() @@ -194,12 +187,6 @@ def forward(self, x): return super().forward(causal_padded_x) -class GEGLU(nn.Module): - def forward(self, x): - x, gate = x.chunk(2, dim=-1) - return F.gelu(gate) * x - - def FeedForward(dim, mult=4, causal_conv=False): dim_inner = int(dim * mult * 2 / 3) diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index 44cf940c69..9343f656e1 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -4,7 +4,7 @@ import inspect import random import warnings -from typing import Callable, Optional, Union +from collections.abc import Callable import numpy as np import torch @@ -45,18 +45,18 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): - @torch.no_grad() + @torch.inference_mode() def generate( # noqa: PLR0911 self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[StreamGenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, - synced_gpus: Optional[bool] = False, + inputs: torch.Tensor | None = None, + generation_config: StreamGenerationConfig | None = None, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], list[int]] | None = None, + synced_gpus: bool | None = False, seed: int = 0, **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: + ) -> GenerateOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head. 
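# Editor's note (illustrative sketch, not part of the changeset): the hunks above and
# below replace @torch.no_grad() with @torch.inference_mode() on inference-only entry
# points (HifiDecoder.inference, NewGenerationMixin.generate, sample_stream, ...).
# inference_mode() disables autograd like no_grad(), but additionally skips view
# tracking and version counting, so it is slightly faster; tensors created inside it
# can never be reused in a graph that requires gradients. Minimal example, assuming a
# plain nn.Module:
import torch
from torch import nn


class TinyDecoder(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(4, 2)

    @torch.inference_mode()
    def inference(self, x: torch.Tensor) -> torch.Tensor:
        # No autograd graph is recorded here; the output is an inference tensor.
        return self.proj(x)


if __name__ == "__main__":
    out = TinyDecoder().eval().inference(torch.randn(1, 4))
    assert not out.requires_grad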
@@ -207,8 +207,8 @@ def generate( # noqa: PLR0911 ) model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( inputs_tensor, - generation_config._pad_token_tensor, - generation_config._eos_token_tensor, + generation_config, + model_kwargs, ) # decoder-only models should use left-padding for generation @@ -662,23 +662,23 @@ def typeerror(): **model_kwargs, ) - @torch.no_grad() + @torch.inference_mode() def sample_stream( self, input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, list[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, + logits_processor: LogitsProcessorList | None = None, + stopping_criteria: StoppingCriteriaList | None = None, + logits_warper: LogitsProcessorList | None = None, + max_length: int | None = None, + pad_token_id: int | None = None, + eos_token_id: int | list[int] | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + output_scores: bool | None = None, + return_dict_in_generate: bool | None = None, + synced_gpus: bool | None = False, **model_kwargs, - ) -> Union[SampleOutput, torch.LongTensor]: + ) -> SampleOutput | torch.LongTensor: r""" Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. @@ -953,7 +953,6 @@ def init_stream_support(): def _get_logits_warper(generation_config: GenerationConfig) -> LogitsProcessorList: - warpers = LogitsProcessorList() if generation_config.temperature is not None and generation_config.temperature != 1.0: diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index cf80d8cff3..ee7989407e 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -16,6 +16,7 @@ from tokenizers import Tokenizer from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words +from TTS.tts.utils.text.cleaners import collapse_whitespace, lowercase logger = logging.getLogger(__name__) @@ -73,12 +74,10 @@ def split_sentence(text, lang, text_split_length=250): return text_splits -_whitespace_re = re.compile(r"\s+") - # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = { "en": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), @@ -101,7 +100,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "es": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "señora"), ("sr", "señor"), @@ -114,7 +113,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "fr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mme", "madame"), ("mr", "monsieur"), @@ -126,7 +125,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "de": [ - (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("fr", "frau"), ("dr", "doktor"), @@ -136,7 +135,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pt": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("sra", "senhora"), ("sr", "senhor"), @@ -149,7 +148,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "it": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # ("sig.ra", "signora"), ("sig", "signore"), @@ -161,7 +160,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "pl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("p", "pani"), ("m", "pan"), @@ -171,19 +170,19 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ar": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # There are not many common abbreviations in Arabic as in English. ] ], "zh": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "cs": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("ing", "inženýr"), # engineer @@ -192,7 +191,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ru": [ - (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\b", re.IGNORECASE), x[1]) for x in [ ("г-жа", "госпожа"), # Mrs. ("г-н", "господин"), # Mr. @@ -201,7 +200,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "nl": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dhr", "de heer"), # Mr. ("mevr", "mevrouw"), # Mrs. @@ -211,7 +210,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "tr": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("b", "bay"), # Mr. ("byk", "büyük"), # büyük @@ -220,7 +219,7 @@ def split_sentence(text, lang, text_split_length=250): ] ], "hu": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("dr", "doktor"), # doctor ("b", "bácsi"), # Mr. @@ -229,13 +228,13 @@ def split_sentence(text, lang, text_split_length=250): ] ], "ko": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. ] ], "hi": [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts. 
] @@ -261,7 +260,7 @@ def expand_abbreviations_multilingual(text, lang="en"): _symbols_multilingual = { "en": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " and "), ("@", " at "), @@ -273,7 +272,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "es": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " y "), ("@", " arroba "), @@ -285,7 +284,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "fr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " et "), ("@", " arobase "), @@ -297,7 +296,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "de": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " und "), ("@", " at "), @@ -309,7 +308,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pt": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " arroba "), @@ -321,7 +320,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "it": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " e "), ("@", " chiocciola "), @@ -333,7 +332,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "pl": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " i "), ("@", " małpa "), @@ -346,7 +345,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ar": [ # Arabic - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " و "), ("@", " على "), @@ -359,7 +358,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "zh": [ # Chinese - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 和 "), ("@", " 在 "), @@ -372,7 +371,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "cs": [ # Czech - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " a "), ("@", " na "), @@ -385,7 +384,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ru": [ # Russian - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " и "), ("@", " собака "), @@ -398,7 +397,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "nl": [ # Dutch - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " en "), ("@", " bij "), @@ -410,7 +409,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "tr": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " ve "), ("@", " at "), @@ -422,7 +421,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hu": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ 
("&", " és "), ("@", " kukac "), @@ -435,7 +434,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ], "ko": [ # Korean - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " 그리고 "), ("@", " 에 "), @@ -447,7 +446,7 @@ def expand_abbreviations_multilingual(text, lang="en"): ] ], "hi": [ - (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + (re.compile(rf"{re.escape(x[0])}", re.IGNORECASE), x[1]) for x in [ ("&", " और "), ("@", " ऐट दी रेट "), @@ -528,12 +527,12 @@ def _remove_dots(m): def _expand_decimal_point(m, lang="en"): amount = m.group(1).replace(",", ".") - return num2words(float(amount), lang=lang if lang != "cs" else "cz") + return num2words(float(amount), lang=lang) def _expand_currency(m, lang="en", currency="USD"): - amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) - full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz") + amount = float(re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))) + full_amount = num2words(amount, to="currency", currency=currency, lang=lang) and_equivalents = { "en": ", ", @@ -564,11 +563,11 @@ def _expand_currency(m, lang="en", currency="USD"): def _expand_ordinal(m, lang="en"): - return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") + return num2words(int(m.group(1)), ordinal=True, lang=lang) def _expand_number(m, lang="en"): - return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") + return num2words(int(m.group(0)), lang=lang) def expand_numbers_multilingual(text, lang="en"): @@ -592,14 +591,6 @@ def expand_numbers_multilingual(text, lang="en"): return text -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, " ", text) - - def multilingual_cleaners(text, lang): text = text.replace('"', "") if lang == "tr": @@ -614,13 +605,6 @@ def multilingual_cleaners(text, lang): return text -def basic_cleaners(text): - """Basic pipeline that lowercases and collapses whitespace without transliteration.""" - text = lowercase(text) - text = collapse_whitespace(text) - return text - - def chinese_transliterate(text): try: import pypinyin diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 0253d65ddd..edd8fc4b65 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch import torch.nn as nn @@ -18,7 +17,7 @@ from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer from TTS.tts.layers.xtts.trainer.dataset import XTTSDataset from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig +from TTS.tts.models.xtts import Xtts, XttsArgs from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -31,12 +30,7 @@ class GPTTrainerConfig(XttsConfig): optimizer_wd_only_on_weights: bool = False weighted_loss_attrs: dict = field(default_factory=lambda: {}) weighted_loss_multipliers: dict = field(default_factory=lambda: {}) - test_sentences: List[dict] = field(default_factory=lambda: []) - - -@dataclass -class XttsAudioConfig(XttsAudioConfig): - dvae_sample_rate: int = 22050 + test_sentences: list[dict] = field(default_factory=lambda: []) @dataclass @@ -202,10 +196,6 @@ def __init__(self, 
config: Coqpit): mel_norm_file=self.args.mel_norm_file, sampling_rate=config.audio.dvae_sample_rate ) - @property - def device(self): - return next(self.parameters()).device - def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels, cond_idxs, cond_lens): """ Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode @@ -230,8 +220,8 @@ def forward(self, text_inputs, text_lengths, audio_codes, wav_lengths, cond_mels ) return losses - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: # pylint: disable=W0613 test_audios = {} if self.config.test_sentences: # init gpt for inference mode @@ -246,7 +236,7 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 s_info["language"], gpt_cond_len=3, )["wav"] - test_audios["{}-audio".format(idx)] = wav + test_audios[f"{idx}-audio"] = wav # delete inference layers del self.xtts.gpt.gpt_inference @@ -254,11 +244,15 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: # pylint: disable=W0613 return {"audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.args.output_sample_rate) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: return batch @torch.no_grad() # torch no grad to avoid gradients from the pre-processing and DVAE codes extraction @@ -340,7 +334,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 WeightsFileHandler.add_pre_callback(callback_clearml_load_save) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -360,12 +354,12 @@ def get_sampler(self, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": # pylint: disable=W0613 if is_eval and not config.run_eval: loader = None @@ -405,7 +399,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the optimizer based on the config parameters.""" # ToDo: deal with multi GPU training if self.config.optimizer_wd_only_on_weights: @@ -436,7 +430,7 @@ def get_optimizer(self) -> List: v.is_norm = isinstance(m, norm_modules) v.is_emb = isinstance(m, emb_modules) - fpn = "%s.%s" % (mn, k) if mn else k # full param name + fpn = f"{mn}.{k}" if mn else k # full param name all_param_names.add(fpn) param_map[fpn] = v if v.is_bias or v.is_norm or v.is_emb: @@ -469,7 +463,7 @@ def get_optimizer(self) -> List: parameters=self.xtts.gpt.parameters(), ) - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the scheduler for the optimizer. 
Args: @@ -500,7 +494,7 @@ def load_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "GPTTrainerConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GPTTrainerConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index 69b8dae952..360d9b06c8 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -392,7 +392,7 @@ # ================================================================================ # # basic class # ================================================================================ # -class ChineseChar(object): +class ChineseChar: """ 中文字符 每个字符对应简体和繁体, @@ -420,13 +420,13 @@ class ChineseNumberUnit(ChineseChar): """ def __init__(self, power, simplified, traditional, big_s, big_t): - super(ChineseNumberUnit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.power = power self.big_s = big_s self.big_t = big_t def __str__(self): - return "10^{}".format(self.power) + return f"10^{self.power}" @classmethod def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): @@ -447,7 +447,7 @@ def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=Fals power=pow(2, index + 3), simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1] ) else: - raise ValueError("Counting type should be in {0} ({1} provided).".format(NUMBERING_TYPES, numbering_type)) + raise ValueError(f"Counting type should be in {NUMBERING_TYPES} ({numbering_type} provided).") class ChineseNumberDigit(ChineseChar): @@ -456,7 +456,7 @@ class ChineseNumberDigit(ChineseChar): """ def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): - super(ChineseNumberDigit, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.value = value self.big_s = big_s self.big_t = big_t @@ -477,7 +477,7 @@ class ChineseMath(ChineseChar): """ def __init__(self, simplified, traditional, symbol, expression=None): - super(ChineseMath, self).__init__(simplified, traditional) + super().__init__(simplified, traditional) self.symbol = symbol self.expression = expression self.big_s = simplified @@ -487,13 +487,13 @@ def __init__(self, simplified, traditional, symbol, expression=None): CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath -class NumberSystem(object): +class NumberSystem: """ 中文数字系统 """ -class MathSymbol(object): +class MathSymbol: """ 用于中文数字系统的数学符号 (繁/简体), e.g. 
positive = ['正', '正'] @@ -507,8 +507,7 @@ def __init__(self, positive, negative, point): self.point = point def __iter__(self): - for v in self.__dict__.values(): - yield v + yield from self.__dict__.values() # class OtherSymbol(object): @@ -640,7 +639,7 @@ def compute_value(integer_symbols): int_str = str(compute_value(int_part)) dec_str = "".join([str(d.value) for d in dec_part]) if dec_part: - return "{0}.{1}".format(int_str, dec_str) + return f"{int_str}.{dec_str}" else: return int_str @@ -686,7 +685,7 @@ def get_value(value_string, use_zeros=True): int_string = int_dec[0] dec_string = int_dec[1] else: - raise ValueError("invalid input num string with more than one dot: {}".format(number_string)) + raise ValueError(f"invalid input num string with more than one dot: {number_string}") if use_units and len(int_string) > 1: result_symbols = get_value(int_string) @@ -702,7 +701,7 @@ def get_value(value_string, use_zeros=True): if isinstance(v, CND) and v.value == 2: next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None previous_symbol = result_symbols[i - 1] if i > 0 else None - if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, CNU | type(None)): if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): result_symbols[i] = liang @@ -1166,7 +1165,7 @@ def __call__(self, text): ) ndone = 0 - with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: + with open(args.ifile, encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream: if args.format == "tsv": reader = csv.DictReader(istream, delimiter="\t") assert "TEXT" in reader.fieldnames diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index ebfa171c80..4746b13ea2 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,12 +1,11 @@ import logging -from typing import Dict, List, Union from TTS.utils.generic_utils import find_module logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": +def setup_model(config: "Coqpit", samples: list[list] | list[dict] = None) -> "BaseTTS": logger.info("Using model: %s", config.model) # fetch the right model implementation. 
if "base_model" in config and config["base_model"] is not None: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 1c3d57582e..c2e29c7100 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -13,7 +12,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import generate_path, sequence_mask +from TTS.tts.utils.helpers import expand_encoder_outputs, generate_attention, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -169,35 +168,6 @@ def compute_align_path(self, mu, log_sigma, y, x_mask, y_mask): dr_mas = torch.sum(attn, -1) return dr_mas.squeeze(1), log_p - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def expand_encoder_outputs(self, en, dr, x_mask, y_mask): - """Generate attention alignment map from durations and - expand encoder outputs - - Examples:: - - encoder output: [a,b,c,d] - - durations: [1, 3, 2, 1] - - - expanded: [a, b, b, b, c, c, d] - - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] - """ - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) - return o_en_ex, attn - def format_durations(self, o_dr_log, x_mask): o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale o_dr[o_dr < 1] = 1.0 @@ -243,9 +213,8 @@ def _forward_encoder(self, x, x_lengths, g=None): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) # expand o_en with durations - o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) + o_en_ex, attn, y_mask = expand_encoder_outputs(o_en, dr, x_mask, y_lengths) # positional encoding if hasattr(self, "pos_encoder"): o_en_ex = self.pos_encoder(o_en_ex, y_mask) @@ -263,9 +232,7 @@ def _forward_mdn(self, o_en, y, y_lengths, x_mask): dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward( - self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None - ): # pylint: disable=unused-argument + def forward(self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None): # pylint: disable=unused-argument """ Shapes: - x: :math:`[B, T_max]` @@ -282,7 +249,7 @@ def forward( o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) - attn = self.generate_attn(dr_mas, x_mask, y_mask) + attn = generate_attention(dr_mas, x_mask, y_mask) elif phase == 1: # train decoder o_en, o_en_dp, x_mask, g = 
self._forward_encoder(x, x_lengths, g) @@ -318,7 +285,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: @@ -382,9 +349,7 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -397,9 +362,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -433,7 +396,7 @@ def on_epoch_start(self, trainer): self.phase = self._set_phase(trainer.config, trainer.total_steps_done) @staticmethod - def init_from_config(config: "AlignTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "AlignTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index ced8f60ed8..84814745a2 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -1,6 +1,6 @@ import os from dataclasses import dataclass -from typing import Optional +from pathlib import Path import numpy as np from coqpit import Coqpit @@ -42,10 +42,6 @@ def __init__( self.encodec = EncodecModel.encodec_model_24khz() self.encodec.set_target_bandwidth(6.0) - @property - def device(self): - return next(self.parameters()).device - def load_bark_models(self): self.semantic_model, self.config = load_model( ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text" @@ -68,7 +64,7 @@ def train_step( def text_to_semantic( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, allow_early_stop=True, @@ -98,7 +94,7 @@ def text_to_semantic( def semantic_to_waveform( self, semantic_tokens: np.ndarray, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, temp: float = 0.7, base=None, ): @@ -132,7 +128,7 @@ def semantic_to_waveform( def generate_audio( self, text: str, - history_prompt: Optional[str] = None, + history_prompt: str | None = None, text_temp: float = 0.7, waveform_temp: float = 0.7, base=None, @@ -194,9 +190,7 @@ def _set_voice_dirs(self, voice_dirs): return _voice_dirs # TODO: remove config from synthesize - def synthesize( - self, text, config, speaker_id="random", voice_dirs=None, **kwargs - ): # pylint: disable=unused-argument + def synthesize(self, text, config, speaker_id="random", voice_dirs=None, **kwargs): # pylint: disable=unused-argument """Synthesize speech with the given input text. 
Args: @@ -206,12 +200,14 @@ def synthesize( speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in `voice_dirs` with the name `speaker_id`. Defaults to None. voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None. - **kwargs: Model specific inference settings used by `generate_audio()` and `TTS.tts.layers.bark.inference_funcs.generate_text_semantic(). + **kwargs: Model specific inference settings used by `generate_audio()` and + `TTS.tts.layers.bark.inference_funcs.generate_text_semantic()`. Returns: - A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference, - `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents` - as latents used at inference. + A dictionary of the output values with `wav` as output waveform, + `deterministic_seed` as seed used at inference, `text_input` as text token IDs + after tokenizer, `voice_samples` as samples used for cloning, + `conditioning_latents` as latents used at inference. """ speaker_id = "random" if speaker_id is None else speaker_id @@ -267,10 +263,12 @@ def load_checkpoint( fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt") hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth") + # The paths in the default config start with /root/.local/share/tts and need to be fixed self.config.LOCAL_MODEL_PATHS["text"] = text_model_path self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path + self.config.CACHE_DIR = str(Path(text_model_path).parent) self.load_bark_models() diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index 79cdf1a7d4..05f4ae168d 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -1,7 +1,6 @@ import copy import logging from abc import abstractmethod -from typing import Dict, Tuple import torch from coqpit import Coqpit @@ -62,7 +61,7 @@ def __init__( self.coarse_decoder = None @staticmethod - def _format_aux_input(aux_input: Dict) -> Dict: + def _format_aux_input(aux_input: dict) -> dict: """Set missing fields to their default values""" if aux_input: return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) @@ -94,9 +93,7 @@ def forward(self): def inference(self): pass - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load model checkpoint and set up internals. Args: @@ -141,7 +138,7 @@ def init_from_config(config: Coqpit): # TEST AND LOG FUNCTIONS # ########################## - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
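# Editor's note (illustrative sketch, not part of the changeset): the Bark
# load_checkpoint() hunk above now derives config.CACHE_DIR from the directory that
# actually holds the downloaded checkpoints instead of relying on the hard-coded
# /root/.local/share/tts default mentioned in the added comment. The path below is
# hypothetical and only shows the Path-based derivation used in the diff.
from pathlib import Path

text_model_path = "/home/user/.local/share/tts/suno/bark/text_2.pt"
cache_dir = str(Path(text_model_path).parent)
assert cache_dir == "/home/user/.local/share/tts/suno/bark"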
@@ -169,17 +166,19 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index ccb023ce84..95cbf5bbf5 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -1,7 +1,6 @@ import logging import os import random -from typing import Dict, List, Tuple, Union import torch import torch.distributed as dist @@ -79,16 +78,18 @@ def _set_model_args(self, config: Coqpit): else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: List = None): - """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining - `in_channels` size of the connected layers. + def init_multispeaker(self, config: Coqpit, data: list = None): + """Set up for multi-speaker TTS. + + Initialize a speaker embedding layer if needed and define expected embedding + channel size for defining `in_channels` size of the connected layers. This implementation yields 3 possible outcomes: - 1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing. + 1. If `config.use_speaker_embedding` and `config.use_d_vector_file` are False, do nothing. 2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512. 3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of - `config.d_vector_dim` or 512. + `config.d_vector_dim` or 512. You can override this function for new models. @@ -112,7 +113,7 @@ def init_multispeaker(self, config: Coqpit, data: List = None): self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) - def get_aux_input(self, **kwargs) -> Dict: + def get_aux_input(self, **kwargs) -> dict: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} @@ -163,7 +164,7 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_id": language_id, } - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Generic batch formatting for `TTSDataset`. You must override this if you use a custom dataset. @@ -209,9 +210,9 @@ def format_batch(self, batch: Dict) -> Dict: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -283,12 +284,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -364,7 +365,7 @@ def get_data_loader( def _get_test_aux_input( self, - ) -> Dict: + ) -> dict: d_vector = None if self.config.use_d_vector_file: d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] @@ -381,7 +382,7 @@ def _get_test_aux_input( } return aux_inputs - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -412,13 +413,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer): diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index c6f15a7952..2d59db74c0 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -3,35 +3,40 @@ from dataclasses import dataclass, field from itertools import chain from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch import torch.distributed as dist -import torchaudio from coqpit import Coqpit -from librosa.filters import mel as librosa_mel_fn from torch import nn -from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.data.sampler import WeightedRandomSampler from trainer.io import load_fsspec from trainer.torch import DistributedSampler, DistributedSamplerWrapper from trainer.trainer_utils import get_optimizer, get_scheduler -from TTS.tts.datasets.dataset import F0Dataset, TTSDataset, _parse_sample +from TTS.tts.datasets.dataset import F0Dataset, TTSDataset, _parse_sample, get_attribute_balancer_weights from TTS.tts.layers.delightful_tts.acoustic_model import AcousticModel -from TTS.tts.layers.losses import ForwardSumLoss, VitsDiscriminatorLoss +from TTS.tts.layers.losses import ( + ForwardSumLoss, + VitsDiscriminatorLoss, + _binary_alignment_loss, + feature_loss, + generator_loss, +) from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.models.base_tts import BaseTTSE2E +from TTS.tts.models.vits import load_audio from TTS.tts.utils.helpers import average_over_durations, compute_attn_prior, rand_segments, segment, sequence_mask from 
TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.synthesis import embedding_to_torch, id_to_torch, numpy_to_torch from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_pitch, plot_spectrogram from TTS.utils.audio.numpy_transforms import build_mel_basis, compute_f0 from TTS.utils.audio.numpy_transforms import db_to_amp as db_to_amp_numpy from TTS.utils.audio.numpy_transforms import mel_to_wav as mel_to_wav_numpy from TTS.utils.audio.processor import AudioProcessor +from TTS.utils.audio.torch_transforms import wav_to_mel, wav_to_spec from TTS.vocoder.layers.losses import MultiScaleSTFTLoss from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results @@ -39,291 +44,27 @@ logger = logging.getLogger(__name__) -def id_to_torch(aux_id, cuda=False): - if aux_id is not None: - aux_id = np.asarray(aux_id) - aux_id = torch.from_numpy(aux_id) - if cuda: - return aux_id.cuda() - return aux_id - - -def embedding_to_torch(d_vector, cuda=False): - if d_vector is not None: - d_vector = np.asarray(d_vector) - d_vector = torch.from_numpy(d_vector).float() - d_vector = d_vector.squeeze().unsqueeze(0) - if cuda: - return d_vector.cuda() - return d_vector - - -def numpy_to_torch(np_array, dtype, cuda=False): - if np_array is None: - return None - tensor = torch.as_tensor(np_array, dtype=dtype) - if cuda: - return tensor.cuda() - return tensor - - -def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_len = torch.max(lengths).item() - ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1) - mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) - return mask - - -def pad(input_ele: List[torch.Tensor], max_len: int) -> torch.Tensor: - out_list = torch.jit.annotate(List[torch.Tensor], []) - for batch in input_ele: - if len(batch.shape) == 1: - one_batch_padded = F.pad(batch, (0, max_len - batch.size(0)), "constant", 0.0) - else: - one_batch_padded = F.pad(batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0) - out_list.append(one_batch_padded) - out_padded = torch.stack(out_list) - return out_padded - - -def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor: - return torch.ceil(lens / stride).int() - - -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: - assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..." 
- return torch.randn(shape) * np.sqrt(2 / shape[1]) - - -# pylint: disable=redefined-outer-name -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - hann_window = {} mel_basis = {} -@torch.no_grad() -def weights_reset(m: nn.Module): - # check if the current module has reset_parameters and if it is reset the weight - reset_parameters = getattr(m, "reset_parameters", None) - if callable(reset_parameters): - m.reset_parameters() - - -def get_module_weights_sum(mdl: nn.Module): - dict_sums = {} - for name, w in mdl.named_parameters(): - if "weight" in name: - value = w.data.sum().item() - dict_sums[name] = value - return dict_sums - - -def load_audio(file_path: str): - """Load the audio file normalized in [-1, 1] - - Return Shapes: - - x: :math:`[1, T]` - """ - x, sr = torchaudio.load( - file_path, - ) - assert (x > 1).sum() + (x < -1).sum() == 0 - return x, sr - - -def _amp_to_db(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def _db_to_amp(x, C=1): - return torch.exp(x) / C - - -def amp_to_db(magnitudes): - output = _amp_to_db(magnitudes) - return output - - -def db_to_amp(magnitudes): - output = _db_to_amp(magnitudes) - return output - - -def _wav_to_spec(y, n_fft, hop_length, win_length, center=False): - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global hann_window # pylint: disable=global-statement - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - return spec - - -def wav_to_spec(y, n_fft, hop_length, win_length, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - spec = _wav_to_spec(y, n_fft, hop_length, win_length, center=center) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - def wav_to_energy(y, n_fft, hop_length, win_length, center=False): - spec = _wav_to_spec(y, n_fft, hop_length, win_length, center=center) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + spec = wav_to_spec(y, n_fft, hop_length, win_length, center=center) return torch.norm(spec, dim=1, keepdim=True) -def name_mel_basis(spec, n_fft, fmax): - n_fft_len = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}" - return n_fft_len - - -def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): - """ - Args Shapes: - - spec : :math:`[B,C,T]` - - Return Shapes: - - mel : :math:`[B,C,T]` - """ - global mel_basis # pylint: disable=global-statement - mel_basis_key = name_mel_basis(spec, n_fft, fmax) - # pylint: disable=too-many-function-args - if mel_basis_key not in mel_basis: - # pylint: disable=missing-kwoa - mel = librosa_mel_fn(sample_rate, n_fft, num_mels, fmin, fmax) - mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - mel = 
torch.matmul(mel_basis[mel_basis_key], spec) - mel = amp_to_db(mel) - return mel - - -def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T_y]` - - Return Shapes: - - spec : :math:`[B,C,T_spec]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global mel_basis, hann_window # pylint: disable=global-statement - mel_basis_key = name_mel_basis(y, n_fft, fmax) - wnsize_dtype_device = str(win_length) + "_" + str(y.dtype) + "_" + str(y.device) - if mel_basis_key not in mel_basis: - # pylint: disable=missing-kwoa - mel = librosa_mel_fn( - sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) # pylint: disable=too-many-function-args - mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - spec = torch.matmul(mel_basis[mel_basis_key], spec) - spec = amp_to_db(spec) - return spec - - ############################## # DATASET ############################## -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict = None): - """Create balancer weight for torch WeightedSampler""" - attr_names_samples = np.array([item[attr_name] for item in items]) - unique_attr_names = np.unique(attr_names_samples).tolist() - attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] - attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) - weight_attr = 1.0 / attr_count - dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) - dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) - if multi_dict is not None: - multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) - dataset_samples_weight *= multiplier_samples - return ( - torch.from_numpy(dataset_samples_weight).float(), - unique_attr_names, - np.unique(dataset_samples_weight).tolist(), - ) - - class ForwardTTSE2eF0Dataset(F0Dataset): """Override F0Dataset to avoid slow computing of pitches""" def __init__( self, ap, - samples: Union[List[List], List[Dict]], + samples: list[list] | list[dict], cache_path: str = None, precompute_num_workers=0, normalize_f0=True, @@ -533,15 +274,15 @@ def collate_fn(self, batch): @dataclass class VocoderConfig(Coqpit): resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 
5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) use_spectral_norm_discriminator: bool = False - upsampling_rates_discriminator: List[int] = field(default_factory=lambda: [4, 4, 4, 4]) - periods_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) - pretrained_model_path: Optional[str] = None + upsampling_rates_discriminator: list[int] = field(default_factory=lambda: [4, 4, 4, 4]) + periods_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + pretrained_model_path: str | None = None @dataclass @@ -696,10 +437,6 @@ def __init__( periods=self.config.vocoder.periods_discriminator, ) - @property - def device(self): - return next(self.parameters()).device - @property def energy_scaler(self): return self.acoustic_model.energy_scaler @@ -815,7 +552,7 @@ def forward( attn_priors: torch.FloatTensor = None, d_vectors: torch.FloatTensor = None, speaker_idx: torch.LongTensor = None, - ) -> Dict: + ) -> dict: """Model's forward pass. Args: @@ -880,7 +617,7 @@ def forward( model_outputs["slice_ids"] = slice_ids return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference( self, x, aux_input={"d_vectors": None, "speaker_ids": None}, pitch_transform=None, energy_transform=None ): @@ -904,7 +641,7 @@ def inference( model_outputs["model_outputs"] = vocoder_output return model_outputs - @torch.no_grad() + @torch.inference_mode() def inference_spec_decoder(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): encoder_outputs = self.acoustic_model.inference( tokens=x, @@ -1094,9 +831,7 @@ def _log(self, batch, outputs, name_prefix="train"): audios[f"{name_prefix}/vocoder_audio"] = sample_voice return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use, unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use, unused-argument """Create visualizations and waveform examples. 
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1196,7 +931,7 @@ def synthesize( **kwargs, ): # pylint: disable=unused-argument # TODO: add cloning support with ref_waveform - is_cuda = next(self.parameters()).is_cuda + device = next(self.parameters()).device # convert text to sequence of token IDs text_inputs = np.asarray( @@ -1210,14 +945,14 @@ def synthesize( if isinstance(speaker_id, str) and self.args.use_speaker_embedding: # get the speaker id for the speaker embedding layer _speaker_id = self.speaker_manager.name_to_id[speaker_id] - _speaker_id = id_to_torch(_speaker_id, cuda=is_cuda) + _speaker_id = id_to_torch(_speaker_id, device=device) if speaker_id is not None and self.args.use_d_vector_file: # get the average d_vector for the speaker d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False) - d_vector = embedding_to_torch(d_vector, cuda=is_cuda) + d_vector = embedding_to_torch(d_vector, device=device) - text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) + text_inputs = numpy_to_torch(text_inputs, torch.long, device=device) text_inputs = text_inputs.unsqueeze(0) # synthesize voice @@ -1240,7 +975,7 @@ def synthesize( return return_dict def synthesize_with_gl(self, text: str, speaker_id, d_vector): - is_cuda = next(self.parameters()).is_cuda + device = next(self.parameters()).device # convert text to sequence of token IDs text_inputs = np.asarray( @@ -1249,12 +984,12 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): ) # pass tensors to backend if speaker_id is not None: - speaker_id = id_to_torch(speaker_id, cuda=is_cuda) + speaker_id = id_to_torch(speaker_id, device=device) if d_vector is not None: - d_vector = embedding_to_torch(d_vector, cuda=is_cuda) + d_vector = embedding_to_torch(d_vector, device=device) - text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) + text_inputs = numpy_to_torch(text_inputs, torch.long, device=device) text_inputs = text_inputs.unsqueeze(0) # synthesize voice @@ -1276,8 +1011,8 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): } return return_dict - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
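# Editor's note (illustrative sketch, not part of the changeset): synthesize() and
# synthesize_with_gl() above now pass device=... instead of cuda=is_cuda to the
# id_to_torch / embedding_to_torch / numpy_to_torch helpers. The stand-in function
# below only shows the general pattern (look up the model's device once, then create
# inputs directly on it, which also covers non-CUDA backends); it is not the
# TTS.tts.utils.synthesis API.
import numpy as np
import torch


def numpy_to_torch_sketch(np_array: np.ndarray, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
    # Create the tensor on the target device in one step instead of .cuda()-ing later.
    return torch.as_tensor(np_array, dtype=dtype, device=device)


model = torch.nn.Linear(8, 8)
device = next(model.parameters()).device  # same lookup used in the diff
token_ids = np.array([1, 2, 3], dtype=np.int64)  # hypothetical token IDs
text_inputs = numpy_to_torch_sketch(token_ids, torch.long, device).unsqueeze(0)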
@@ -1303,18 +1038,22 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: d_vector=aux_inputs["d_vector"], ) # speaker_name = self.speaker_manager.speaker_names[aux_inputs["speaker_id"]] - test_audios["{}-audio".format(idx)] = outputs["wav"].T - test_audios["{}-audio_encoder".format(idx)] = outputs_gl["wav"].T - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_audios[f"{idx}-audio"] = outputs["wav"].T + test_audios[f"{idx}-audio_encoder"] = outputs_gl["wav"].T + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.config.audio.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None d_vectors = None @@ -1422,12 +1161,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -1479,7 +1218,7 @@ def get_data_loader( def get_criterion(self): return [VitsDiscriminatorLoss(self.config), DelightfulTTSLoss(self.config)] - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. Returns: @@ -1494,7 +1233,7 @@ def get_optimizer(self) -> List: ) return [optimizer_disc, optimizer_gen] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1502,7 +1241,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -1521,9 +1260,7 @@ def on_epoch_end(self, trainer): # pylint: disable=unused-argument self.energy_scaler.eval() @staticmethod - def init_from_config( - config: "DelightfulTTSConfig", samples: Union[List[List], List[Dict]] = None - ): # pylint: disable=unused-argument + def init_from_config(config: "DelightfulTTSConfig", samples: list[list] | list[dict] = None): # pylint: disable=unused-argument """Initiate model from config Args: @@ -1601,36 +1338,6 @@ def __init__(self, config): self.gen_loss_alpha = config.gen_loss_alpha self.multi_scale_stft_loss_alpha = config.multi_scale_stft_loss_alpha - @staticmethod - def _binary_alignment_loss(alignment_hard, alignment_soft): - """Binary loss that forces soft alignments to match the hard alignments as - explained in `https://arxiv.org/pdf/2108.10447.pdf`. 
- """ - log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() - return -log_sum / alignment_hard.sum() - - @staticmethod - def feature_loss(feats_real, feats_generated): - loss = 0 - for dr, dg in zip(feats_real, feats_generated): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - return loss * 2 - - @staticmethod - def generator_loss(scores_fake): - loss = 0 - gen_losses = [] - for dg in scores_fake: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - def forward( self, mel_output, @@ -1728,7 +1435,7 @@ def forward( ) if self.binary_alignment_loss_alpha > 0 and aligner_hard is not None: - binary_alignment_loss = self._binary_alignment_loss(aligner_hard, aligner_soft) + binary_alignment_loss = _binary_alignment_loss(aligner_hard, aligner_soft) total_loss = total_loss + self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight if binary_loss_weight: loss_dict["loss_binary_alignment"] = ( @@ -1748,8 +1455,8 @@ def forward( # vocoder losses if not skip_disc: - loss_feat = self.feature_loss(feats_real=feats_real, feats_generated=feats_fake) * self.feat_loss_alpha - loss_gen = self.generator_loss(scores_fake=scores_fake)[0] * self.gen_loss_alpha + loss_feat = feature_loss(feats_real=feats_real, feats_generated=feats_fake) * self.feat_loss_alpha + loss_gen = generator_loss(scores_fake=scores_fake)[0] * self.gen_loss_alpha loss_dict["vocoder_loss_feat"] = loss_feat loss_dict["vocoder_loss_gen"] = loss_gen loss_dict["loss"] = loss_dict["loss"] + loss_feat + loss_gen diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index d449e580da..497ac3f63a 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -1,6 +1,5 @@ import logging from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -14,7 +13,7 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import average_over_durations, generate_path, sequence_mask +from TTS.tts.utils.helpers import average_over_durations, expand_encoder_outputs, generate_attention, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram @@ -310,49 +309,6 @@ def init_multispeaker(self, config: Coqpit): self.emb_g = nn.Embedding(self.num_speakers, self.args.hidden_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - """Generate an attention mask from the durations. 
- - Shapes - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - """ - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def expand_encoder_outputs(self, en, dr, x_mask, y_mask): - """Generate attention alignment map from durations and - expand encoder outputs - - Shapes: - - en: :math:`(B, D_{en}, T_{en})` - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - - Examples:: - - encoder output: [a,b,c,d] - durations: [1, 3, 2, 1] - - expanded: [a, b, b, b, c, c, d] - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] - """ - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2).to(en.dtype), en.transpose(1, 2)).transpose(1, 2) - return o_en_ex, attn - def format_durations(self, o_dr_log, x_mask): """Format predicted durations. 1. Convert to linear scale from log scale @@ -376,7 +332,7 @@ def format_durations(self, o_dr_log, x_mask): def _forward_encoder( self, x: torch.LongTensor, x_mask: torch.FloatTensor, g: torch.FloatTensor = None - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Encoding forward pass. 1. Embed speaker IDs if multi-speaker mode. @@ -424,7 +380,7 @@ def _forward_decoder( x_mask: torch.FloatTensor, y_lengths: torch.IntTensor, g: torch.FloatTensor, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Decoding forward pass. 1. Compute the decoder output mask @@ -443,9 +399,8 @@ def _forward_decoder( Returns: Tuple[torch.FloatTensor, torch.FloatTensor]: Decoder output, attention map from durations. """ - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) # expand o_en with durations - o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) + o_en_ex, attn, y_mask = expand_encoder_outputs(o_en, dr, x_mask, y_lengths) # positional encoding if hasattr(self, "pos_encoder"): o_en_ex = self.pos_encoder(o_en_ex, y_mask) @@ -459,7 +414,7 @@ def _forward_pitch_predictor( x_mask: torch.IntTensor, pitch: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Pitch predictor forward pass. 1. Predict pitch from encoder outputs. @@ -495,7 +450,7 @@ def _forward_energy_predictor( x_mask: torch.IntTensor, energy: torch.FloatTensor = None, dr: torch.IntTensor = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: """Energy predictor forward pass. 1. Predict energy from encoder outputs. @@ -527,7 +482,7 @@ def _forward_energy_predictor( def _forward_aligner( self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor - ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Aligner forward pass. 1. 
Compute a mask to apply to the attention map. @@ -566,7 +521,7 @@ def _forward_aligner( alignment_soft = alignment_soft.squeeze(1).transpose(1, 2) return o_alignment_dur, alignment_soft, alignment_logprob, alignment_mas - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -588,8 +543,8 @@ def forward( dr: torch.IntTensor = None, pitch: torch.FloatTensor = None, energy: torch.FloatTensor = None, - aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument - ) -> Dict: + aux_input: dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument + ) -> dict: """Model's forward pass. Args: @@ -624,7 +579,7 @@ def forward( o_dr_log = self.duration_predictor(o_en, x_mask) o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration) # generate attn mask from predicted durations - o_attn = self.generate_attn(o_dr.squeeze(1), x_mask) + o_attn = generate_attention(o_dr.squeeze(1), x_mask) # aligner o_alignment_dur = None alignment_soft = None @@ -672,7 +627,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """Model's inference pass. @@ -815,9 +770,7 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -830,9 +783,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -849,7 +800,7 @@ def on_train_step_start(self, trainer): self.binary_loss_weight = min(trainer.epochs_done / self.config.binary_loss_warmup_epochs, 1.0) * 1.0 @staticmethod - def init_from_config(config: "ForwardTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "ForwardTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 5bf4713140..5d03b53dc6 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -1,6 +1,5 @@ import logging import math -from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit @@ -125,9 +124,9 @@ def init_multispeaker(self, config: Coqpit): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) if self.speaker_manager is not None: - assert ( - config.d_vector_dim == self.speaker_manager.embedding_dim - ), " [!] 
d-vector dimension mismatch b/w config and speaker manager." + assert config.d_vector_dim == self.speaker_manager.embedding_dim, ( + " [!] d-vector dimension mismatch b/w config and speaker manager." + ) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: logger.info("Init speaker_embedding layer.") @@ -162,7 +161,7 @@ def lock_act_norm_layers(self): if getattr(f, "set_ddi", False): f.set_ddi(False) - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): if aux_input is None: d_vectors = None speaker_ids = None @@ -179,7 +178,7 @@ def _set_speaker_input(self, aux_input: Dict): g = speaker_ids if speaker_ids is not None else d_vectors return g - def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: + def _speaker_embedding(self, aux_input: dict) -> torch.Tensor | None: g = self._set_speaker_input(aux_input) # speaker embedding if g is not None: @@ -193,9 +192,7 @@ def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]: g = F.normalize(g).unsqueeze(-1) # [b, h, 1] return g - def forward( - self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + def forward(self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Args: x (torch.Tensor): @@ -262,7 +259,7 @@ def forward( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_with_MAS( self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=dangerous-default-value @@ -318,10 +315,8 @@ def inference_with_MAS( } return outputs - @torch.no_grad() - def decoder_inference( - self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def decoder_inference(self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value """ Shapes: - y: :math:`[B, T, C]` @@ -341,10 +336,8 @@ def decoder_inference( outputs["logdet"] = logdet return outputs - @torch.no_grad() - def inference( - self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None} - ): # pylint: disable=dangerous-default-value + @torch.inference_mode() + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}): # pylint: disable=dangerous-default-value x_lengths = aux_input["x_lengths"] g = self._speaker_embedding(aux_input) # embedding pass @@ -457,14 +450,12 @@ def _create_logs(self, batch, outputs, ap): train_audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": train_audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -473,8 +464,8 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, 
self.ap.sample_rate) - @torch.no_grad() - def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets: dict) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -503,11 +494,11 @@ def test_run(self, assets: Dict) -> Tuple[Dict, Dict]: do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs["alignments"], output_fig=False) return test_figures, test_audios def preprocess(self, y, y_lengths, y_max_length, attn=None): @@ -522,9 +513,7 @@ def preprocess(self, y, y_lengths, y_max_length, attn=None): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: @@ -543,7 +532,7 @@ def on_train_step_start(self, trainer): self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps @staticmethod - def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "GlowTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index de5401aac7..2cbf425884 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -8,6 +7,7 @@ from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger +from TTS.tts.layers.losses import NLLLoss from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils from TTS.tts.layers.overflow.neural_hmm import NeuralHMM from TTS.tts.layers.overflow.plotting_utils import ( @@ -101,7 +101,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -173,10 +173,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. 
Args: @@ -194,7 +194,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -238,7 +238,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "NeuralhmmTTSConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -345,17 +345,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -369,25 +365,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) - - -class NLLLoss(nn.Module): - """Negative log likelihood loss.""" - - def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use - """Compute the loss. - - Args: - logits (Tensor): [B, T, D] - - Returns: - Tensor: [1] - - """ - return_dict = {} - return_dict["loss"] = -log_prob.mean() - return return_dict diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index b72f4877cf..aad2e1f553 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -1,6 +1,5 @@ import logging import os -from typing import Dict, List, Union import torch from coqpit import Coqpit @@ -8,6 +7,7 @@ from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger +from TTS.tts.layers.losses import NLLLoss from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils from TTS.tts.layers.overflow.decoder import Decoder from TTS.tts.layers.overflow.neural_hmm import NeuralHMM @@ -32,32 +32,33 @@ class Overflow(BaseTTS): Paper abstract:: Neural HMMs are a type of neural transducer recently proposed for - sequence-to-sequence modelling in text-to-speech. They combine the best features - of classic statistical speech synthesis and modern neural TTS, requiring less - data and fewer training updates, and are less prone to gibberish output caused - by neural attention failures. In this paper, we combine neural HMM TTS with - normalising flows for describing the highly non-Gaussian distribution of speech - acoustics. 
The result is a powerful, fully probabilistic model of durations and - acoustics that can be trained using exact maximum likelihood. Compared to - dominant flow-based acoustic models, our approach integrates autoregression for - improved modelling of long-range dependences such as utterance-level prosody. - Experiments show that a system based on our proposal gives more accurate - pronunciations and better subjective speech quality than comparable methods, - whilst retaining the original advantages of neural HMMs. Audio examples and code - are available at https://shivammehta25.github.io/OverFlow/. + sequence-to-sequence modelling in text-to-speech. They combine the best features + of classic statistical speech synthesis and modern neural TTS, requiring less + data and fewer training updates, and are less prone to gibberish output caused + by neural attention failures. In this paper, we combine neural HMM TTS with + normalising flows for describing the highly non-Gaussian distribution of speech + acoustics. The result is a powerful, fully probabilistic model of durations and + acoustics that can be trained using exact maximum likelihood. Compared to + dominant flow-based acoustic models, our approach integrates autoregression for + improved modelling of long-range dependences such as utterance-level prosody. + Experiments show that a system based on our proposal gives more accurate + pronunciations and better subjective speech quality than comparable methods, + whilst retaining the original advantages of neural HMMs. Audio examples and code + are available at https://shivammehta25.github.io/OverFlow/. Note: - - Neural HMMs uses flat start initialization i.e it computes the means and std and transition probabilities - of the dataset and uses them to initialize the model. This benefits the model and helps with faster learning - If you change the dataset or want to regenerate the parameters change the `force_generate_statistics` and - `mel_statistics_parameter_path` accordingly. + - Neural HMMs uses flat start initialization i.e it computes the means + and std and transition probabilities of the dataset and uses them to initialize + the model. This benefits the model and helps with faster learning If you change + the dataset or want to regenerate the parameters change the + `force_generate_statistics` and `mel_statistics_parameter_path` accordingly. - To enable multi-GPU training, set the `use_grad_checkpointing=False` in config. - This will significantly increase the memory usage. This is because to compute - the actual data likelihood (not an approximation using MAS/Viterbi) we must use - all the states at the previous time step during the forward pass to decide the - probability distribution at the current step i.e the difference between the forward - algorithm and viterbi approximation. + This will significantly increase the memory usage. This is because to compute + the actual data likelihood (not an approximation using MAS/Viterbi) we must use + all the states at the previous time step during the forward pass to decide the + probability distribution at the current step i.e the difference between the forward + algorithm and viterbi approximation. Check :class:`TTS.tts.configs.overflow.OverFlowConfig` for class arguments. 
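Both neuralhmm_tts.py (above) and overflow.py (whose identical copy is removed further below) previously carried their own inline NLLLoss class; the diff deletes those duplicates and imports a single implementation from TTS.tts.layers.losses instead. Assuming the shared class matches the removed code, it reduces to the following sketch:

import torch
from torch import nn


class NLLLoss(nn.Module):
    """Negative log likelihood loss, mirroring the duplicated class removed from the model files."""

    def forward(self, log_prob: torch.Tensor) -> dict:
        # Mean negative log likelihood over the batch, in the dict format the Trainer expects.
        return {"loss": -log_prob.mean()}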
""" @@ -114,7 +115,7 @@ def __init__( self.register_buffer("mean", torch.tensor(0)) self.register_buffer("std", torch.tensor(1)) - def update_mean_std(self, statistics_dict: Dict): + def update_mean_std(self, statistics_dict: dict): self.mean.data = torch.tensor(statistics_dict["mean"]) self.std.data = torch.tensor(statistics_dict["std"]) @@ -186,10 +187,10 @@ def train_step(self, batch: dict, criterion: nn.Module): loss_dict.update(self._training_stats(batch)) return outputs, loss_dict - def eval_step(self, batch: Dict, criterion: nn.Module): + def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) - def _format_aux_input(self, aux_input: Dict, default_input_dict): + def _format_aux_input(self, aux_input: dict, default_input_dict): """Set missing fields to their default value. Args: @@ -207,7 +208,7 @@ def _format_aux_input(self, aux_input: Dict, default_input_dict): return format_aux_input(default_input_dict, aux_input) return default_input_dict - @torch.no_grad() + @torch.inference_mode() def inference( self, text: torch.Tensor, @@ -253,7 +254,7 @@ def get_criterion(): return NLLLoss() @staticmethod - def init_from_config(config: "OverFlowConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "OverFlowConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -361,17 +362,13 @@ def _create_logs(self, batch, outputs, ap): # pylint: disable=no-self-use, unus audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy()) return figures, {"audios": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=unused-argument + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int - ): # pylint: disable=unused-argument + def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=unused-argument """Compute and log evaluation metrics.""" # Plot model parameters histograms if isinstance(logger, TensorboardLogger): @@ -385,25 +382,11 @@ def eval_log( logger.eval_audios(steps, audios, self.ap.sample_rate) def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) - - -class NLLLoss(nn.Module): - """Negative log likelihood loss.""" - - def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use - """Compute the loss. 
- - Args: - logits (Tensor): [B, T, D] - - Returns: - Tensor: [1] - - """ - return_dict = {} - return_dict["loss"] = -log_prob.mean() - return return_dict diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5d3efd2021..59173691f7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Tuple, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -218,7 +214,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text_input, aux_input=None): aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) @@ -280,7 +276,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: torch.nn.Module) -> tuple[dict, dict]: """Perform a single training step by fetching the right set of samples from the batch. Args: @@ -332,7 +328,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dic loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -380,9 +376,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_spectrogram(pred_linear_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) @@ -396,7 +390,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "TacotronConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "TacotronConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 2716a39786..e924d82d42 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from typing import Dict, List, Union - import torch from torch import nn from trainer.trainer_utils import get_optimizer, get_scheduler @@ -238,7 +234,7 @@ def forward( # pylint: disable=dangerous-default-value ) return outputs - @torch.no_grad() + @torch.inference_mode() def inference(self, text, aux_input=None): """Forward pass for inference with no Teacher-Forcing. @@ -309,7 +305,7 @@ def before_backward_pass(self, loss_dict, optimizer) -> None: loss_dict["capacitron_vae_beta_loss"].backward() optimizer.first_step() - def train_step(self, batch: Dict, criterion: torch.nn.Module): + def train_step(self, batch: dict, criterion: torch.nn.Module): """A single training step. Forward pass and loss computation. 
Args: @@ -360,7 +356,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: if self.use_capacitron_vae: return CapacitronOptimizer(self.config, self.named_parameters()) return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) @@ -403,9 +399,7 @@ def _create_logs(self, batch, outputs, ap): audio = ap.inv_melspectrogram(pred_spec.T) return figures, {"audio": audio} - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ) -> None: # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None: # pylint: disable=no-self-use """Log training progress.""" figures, audios = self._create_logs(batch, outputs, self.ap) logger.train_figures(steps, figures) @@ -420,7 +414,7 @@ def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, s logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def init_from_config(config: "Tacotron2Config", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "Tacotron2Config", samples: list[list] | list[dict] = None): """Initiate model from config Args: diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 01629b5d2a..a42d577676 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -342,7 +342,6 @@ def __init__(self, config: Coqpit): else self.args.autoregressive_batch_size ) self.enable_redaction = self.args.enable_redaction - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if self.enable_redaction: self.aligner = Wav2VecAlignment() @@ -423,7 +422,9 @@ def get_conditioning_latents( Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic properties. - :param voice_samples: List of arbitrary reference clips, which should be *pairs* of torch tensors containing arbitrary kHz waveform data. + + :param voice_samples: List of arbitrary reference clips, which should be *pairs* + of torch tensors containing arbitrary kHz waveform data. :param latent_averaging_mode: 0/1/2 for following modes: 0 - latents will be generated as in original tortoise, using ~4.27s from each voice sample, averaging latent across all samples 1 - latents will be generated using (almost) entire voice samples, averaged across all the ~4.27s chunks @@ -671,7 +672,7 @@ def inference( As cond_free_k increases, the output becomes dominated by the conditioning-free signal. diffusion_temperature: (float) Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0 are the "mean" prediction of the diffusion network and will sound bland and smeared. - hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive transformer. + hf_generate_kwargs: (`**kwargs`) The huggingface Transformers generate API is used for the autoregressive transformer. Extra keyword args fed to this function get forwarded directly to that API. 
Documentation here: https://huggingface.co/docs/transformers/internal/generation_utils @@ -683,9 +684,9 @@ def inference( text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device) text_tokens = F.pad(text_tokens, (0, 1)) # This may not be necessary. - assert ( - text_tokens.shape[-1] < 400 - ), "Too much text provided. Break the text up into separate segments and re-try inference." + assert text_tokens.shape[-1] < 400, ( + "Too much text provided. Break the text up into separate segments and re-try inference." + ) if voice_samples is not None: ( diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 432b29f5e1..b542030f13 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -3,14 +3,14 @@ import os from dataclasses import dataclass, field, replace from itertools import chain -from typing import Dict, List, Tuple, Union +from pathlib import Path +from typing import Any import numpy as np import torch import torch.distributed as dist import torchaudio from coqpit import Coqpit -from librosa.filters import mel as librosa_mel_fn from monotonic_alignment_search import maximum_path from torch import nn from torch.nn import functional as F @@ -21,7 +21,7 @@ from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.configs.shared_configs import CharactersConfig -from TTS.tts.datasets.dataset import TTSDataset, _parse_sample +from TTS.tts.datasets.dataset import TTSDataset, _parse_sample, get_attribute_balancer_weights from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder @@ -35,6 +35,7 @@ from TTS.tts.utils.text.characters import BaseCharacters, BaseVocabulary, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment +from TTS.utils.audio.torch_transforms import spec_to_mel, wav_to_mel, wav_to_spec from TTS.utils.samplers import BucketBatchSampler from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results @@ -45,10 +46,6 @@ # IO / Feature extraction ############################## -# pylint: disable=global-statement -hann_window = {} -mel_basis = {} - @torch.no_grad() def weights_reset(m: nn.Module): @@ -78,143 +75,6 @@ def load_audio(file_path): return x, sr -def _amp_to_db(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def _db_to_amp(x, C=1): - return torch.exp(x) / C - - -def amp_to_db(magnitudes): - output = _amp_to_db(magnitudes) - return output - - -def db_to_amp(magnitudes): - output = _db_to_amp(magnitudes) - return output - - -def wav_to_spec(y, n_fft, hop_length, win_length, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - 
mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): - """ - Args Shapes: - - spec : :math:`[B,C,T]` - - Return Shapes: - - mel : :math:`[B,C,T]` - """ - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - mel = torch.matmul(mel_basis[fmax_dtype_device], spec) - mel = amp_to_db(mel) - return mel - - -def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global mel_basis, hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = amp_to_db(spec) - return spec - - ############################# # CONFIGS ############################# @@ -236,30 +96,6 @@ class VitsAudioConfig(Coqpit): ############################## -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict = None): - """Create inverse frequency weights for balancing the dataset. 
- Use `multi_dict` to scale relative weights.""" - attr_names_samples = np.array([item[attr_name] for item in items]) - unique_attr_names = np.unique(attr_names_samples).tolist() - attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] - attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) - weight_attr = 1.0 / attr_count - dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) - dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) - if multi_dict is not None: - # check if all keys are in the multi_dict - for k in multi_dict: - assert k in unique_attr_names, f"{k} not in {unique_attr_names}" - # scale weights - multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) - dataset_samples_weight *= multiplier_samples - return ( - torch.from_numpy(dataset_samples_weight).float(), - unique_attr_names, - np.unique(dataset_samples_weight).tolist(), - ) - - class VitsDataset(TTSDataset): def __init__(self, model_args, *args, **kwargs): super().__init__(*args, **kwargs) @@ -565,12 +401,12 @@ class VitsArgs(Coqpit): dilation_rate_flow: int = 1 num_layers_flow: int = 4 resblock_type_decoder: str = "1" - resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) + resblock_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes_decoder: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates_decoder: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_initial_channel_decoder: int = 512 - upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) - periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) + upsample_kernel_sizes_decoder: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + periods_multi_period_discriminator: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11]) use_sdp: bool = True noise_scale: float = 1.0 inference_noise_scale: float = 0.667 @@ -583,7 +419,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None - d_vector_file: List[str] = None + d_vector_file: list[str] = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_dim: int = 0 @@ -730,10 +566,6 @@ def __init__( use_spectral_norm=self.args.use_spectral_norm_disriminator, ) - @property - def device(self): - return next(self.parameters()).device - def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. @@ -848,7 +680,7 @@ def on_init_end(self, trainer): # pylint: disable=W0613 raise RuntimeError(" [!] 
The weights of Text Encoder was not reinit check it !") logger.info("Text Encoder was reinit.") - def get_aux_input(self, aux_input: Dict): + def get_aux_input(self, aux_input: dict): sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} @@ -878,7 +710,7 @@ def _freeze_layers(self): param.requires_grad = False @staticmethod - def _set_cond_input(aux_input: Dict): + def _set_cond_input(aux_input: dict): """Set the speaker conditioning input based on the multi-speaker mode.""" sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: @@ -900,7 +732,7 @@ def _set_cond_input(aux_input: Dict): return sid, g, lid, durations - def _set_speaker_input(self, aux_input: Dict): + def _set_speaker_input(self, aux_input: dict): d_vectors = aux_input.get("d_vectors", None) speaker_ids = aux_input.get("speaker_ids", None) @@ -973,7 +805,7 @@ def forward( # pylint: disable=dangerous-default-value y_lengths: torch.tensor, waveform: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - ) -> Dict: + ) -> dict: """Forward pass of the model. Args: @@ -1092,7 +924,7 @@ def _set_x_lengths(x, aux_input): return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) - @torch.no_grad() + @torch.inference_mode() def inference( self, x, @@ -1179,7 +1011,7 @@ def inference( } return outputs - @torch.no_grad() + @torch.inference_mode() def inference_voice_conversion( self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None ): @@ -1220,8 +1052,8 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): assert self.num_speakers > 0, "num_speakers have to be larger than 0." # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: - g_src = self.emb_g(torch.from_numpy((np.array(speaker_cond_src))).unsqueeze(0)).unsqueeze(-1) - g_tgt = self.emb_g(torch.from_numpy((np.array(speaker_cond_tgt))).unsqueeze(0)).unsqueeze(-1) + g_src = self.emb_g(torch.from_numpy(np.array(speaker_cond_src)).unsqueeze(0)).unsqueeze(-1) + g_tgt = self.emb_g(torch.from_numpy(np.array(speaker_cond_tgt)).unsqueeze(0)).unsqueeze(-1) elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) @@ -1234,7 +1066,7 @@ def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) return o_hat, y_mask, (z, z_p, z_hat) - def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Perform a single training step. Run the model forward pass and compute losses. Args: @@ -1354,9 +1186,7 @@ def _log(self, ap, batch, outputs, name_prefix="train"): # pylint: disable=unus ) return figures, audios - def train_log( - self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int - ): # pylint: disable=no-self-use + def train_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int): # pylint: disable=no-self-use """Create visualizations and waveform examples. 
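The VITS changes above drop the module-level DSP helpers (amp_to_db, wav_to_spec, spec_to_mel, wav_to_mel) and the get_attribute_balancer_weights function in favour of the shared implementations now imported from TTS.utils.audio.torch_transforms and TTS.tts.datasets.dataset. A usage sketch, assuming the relocated audio transforms keep the signatures shown in the removed code; the STFT/mel parameters are illustrative placeholders:

import torch

from TTS.utils.audio.torch_transforms import spec_to_mel, wav_to_mel, wav_to_spec

wav = torch.randn(1, 1, 22050)  # [B, 1, T] dummy waveform, values illustrative only
spec = wav_to_spec(wav, n_fft=1024, hop_length=256, win_length=1024, center=False)      # [B, C, T']
mel = spec_to_mel(spec, n_fft=1024, num_mels=80, sample_rate=22050, fmin=0, fmax=None)  # [B, 80, T']
# Single-step equivalent of the two calls above:
mel_direct = wav_to_mel(
    wav, n_fft=1024, num_mels=80, sample_rate=22050, hop_length=256,
    win_length=1024, fmin=0, fmax=None, center=False,
)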
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to @@ -1374,7 +1204,7 @@ def train_log( logger.train_figures(steps, figures) logger.train_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() + @torch.inference_mode() def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): return self.train_step(batch, criterion, optimizer_idx) @@ -1431,8 +1261,8 @@ def get_aux_input_from_test_sentences(self, sentence_info): "language_name": language_name, } - @torch.no_grad() - def test_run(self, assets) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def test_run(self, assets) -> tuple[dict, dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. @@ -1458,17 +1288,21 @@ def test_run(self, assets) -> Tuple[Dict, Dict]: use_griffin_lim=True, do_trim_silence=False, ).values() - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_audios[f"{idx}-audio"] = wav + test_figures[f"{idx}-alignment"] = plot_alignment(alignment.permute(2, 1, 0), output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( - self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: logger.test_audios(steps, outputs["audios"], self.ap.sample_rate) logger.test_figures(steps, outputs["figures"]) - def format_batch(self, batch: Dict) -> Dict: + def format_batch(self, batch: dict) -> dict: """Compute speaker, langugage IDs and d_vector for the batch if necessary.""" speaker_ids = None language_ids = None @@ -1532,9 +1366,9 @@ def format_batch_on_device(self, batch): ) if self.args.encoder_sample_rate: - assert batch["spec"].shape[2] == int( - batch["mel"].shape[2] / self.interpolate_factor - ), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + assert batch["spec"].shape[2] == int(batch["mel"].shape[2] / self.interpolate_factor), ( + f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + ) else: assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" @@ -1591,12 +1425,12 @@ def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1, is_eval=F def get_data_loader( self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: bool, - samples: Union[List[Dict], List[List]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: int = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -1655,7 +1489,7 @@ def get_data_loader( ) return loader - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returns 2 optimizers in a list. First one is for the discriminator @@ -1673,7 +1507,7 @@ def get_optimizer(self) -> List: ) return [optimizer0, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -1681,7 +1515,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. 
Args: @@ -1704,9 +1538,7 @@ def get_criterion(self): return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)] - def load_checkpoint( - self, config, checkpoint_path, eval=False, strict=True, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) # compat band-aid for the pre-trained models to not use the encoder baked into the model @@ -1733,9 +1565,7 @@ def load_checkpoint( self.eval() assert not self.training - def load_fairseq_checkpoint( - self, config, checkpoint_dir, eval=False, strict=True - ): # pylint: disable=unused-argument, redefined-builtin + def load_fairseq_checkpoint(self, config, checkpoint_dir, eval=False, strict=True): # pylint: disable=unused-argument, redefined-builtin """Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms Performs some changes for compatibility. @@ -1750,13 +1580,16 @@ def load_fairseq_checkpoint( self.disc = None # set paths - config_file = os.path.join(checkpoint_dir, "config.json") - checkpoint_file = os.path.join(checkpoint_dir, "G_100000.pth") - vocab_file = os.path.join(checkpoint_dir, "vocab.txt") + checkpoint_dir = Path(checkpoint_dir) + config_file = checkpoint_dir / "config.json" + checkpoint_file = checkpoint_dir / "model.pth" + if not checkpoint_file.is_file(): + checkpoint_file = checkpoint_dir / "G_100000.pth" + vocab_file = checkpoint_dir / "vocab.txt" # set config params - with open(config_file, "r", encoding="utf-8") as file: + with open(config_file, encoding="utf-8") as f: # Load the JSON data as a dictionary - config_org = json.load(file) + config_org = json.load(f) self.config.audio.sample_rate = config_org["data"]["sampling_rate"] # self.config.add_blank = config['add_blank'] # set tokenizer @@ -1778,7 +1611,7 @@ def load_fairseq_checkpoint( assert not self.training @staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: "VitsConfig", samples: list[list] | list[dict] = None): """Initiate model from config Args: @@ -1791,15 +1624,15 @@ def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict] upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item() if not config.model_args.encoder_sample_rate: - assert ( - upsample_rate == config.audio.hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + assert upsample_rate == config.audio.hop_length, ( + f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" + ) else: encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor - assert ( - upsample_rate == effective_hop_length - ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + assert upsample_rate == effective_hop_length, ( + f" [!] 
Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}" + ) ap = AudioProcessor.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config) @@ -1990,7 +1823,7 @@ def to_config(self) -> "CharactersConfig": class FairseqVocab(BaseVocabulary): - def __init__(self, vocab: str): + def __init__(self, vocab: str | os.PathLike[Any]): super(FairseqVocab).__init__() self.vocab = vocab @@ -2000,7 +1833,7 @@ def vocab(self): return self._vocab @vocab.setter - def vocab(self, vocab_file): + def vocab(self, vocab_file: str | os.PathLike[Any]): with open(vocab_file, encoding="utf-8") as f: self._vocab = [x.replace("\n", "") for x in f.readlines()] self.blank = self._vocab[0] diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 22d2720efa..2df07a0435 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -93,25 +93,6 @@ def load_audio(audiopath, sampling_rate): return audio -def pad_or_truncate(t, length): - """ - Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it. - - Args: - t (torch.Tensor): The input tensor to be padded or truncated. - length (int): The desired length of the tensor. - - Returns: - torch.Tensor: The padded or truncated tensor. - """ - tp = t[..., :length] - if t.shape[-1] == length: - tp = t - elif t.shape[-1] < length: - tp = F.pad(t, (0, length - t.shape[-1])) - return tp - - @dataclass class XttsAudioConfig(Coqpit): """ @@ -120,10 +101,12 @@ class XttsAudioConfig(Coqpit): Args: sample_rate (int): The sample rate in which the GPT operates. output_sample_rate (int): The sample rate of the output audio waveform. + dvae_sample_rate (int): The sample rate of the DVAE """ sample_rate: int = 22050 output_sample_rate: int = 24000 + dvae_sample_rate: int = 22050 @dataclass @@ -194,7 +177,7 @@ class XttsArgs(Coqpit): class Xtts(BaseTTS): - """ⓍTTS model implementation. + """XTTS model implementation. ❗ Currently it only supports inference. @@ -255,10 +238,6 @@ def init_models(self): cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, ) - @property - def device(self): - return next(self.parameters()).device - @torch.inference_mode() def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = 6): """Compute the conditioning latents for the GPT model from the given audio. @@ -400,9 +379,9 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwa as latents used at inference. """ - assert ( - "zh-cn" if language == "zh" else language in self.config.languages - ), f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + assert "zh-cn" if language == "zh" else language in self.config.languages, ( + f" ❗ Language {language} is not supported. Supported languages are {self.config.languages}" + ) # Use generally found best tuning knobs for generation. settings = { "temperature": config.temperature, @@ -476,7 +455,7 @@ def full_inference( gpt_cond_chunk_len: (int) Chunk length used for cloning. It must be <= `gpt_cond_len`. If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to 6 seconds. - hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive + hf_generate_kwargs: (`**kwargs`) The huggingface Transformers generate API is used for the autoregressive transformer. Extra keyword args fed to this function get forwarded directly to that API. 
Documentation here: https://huggingface.co/docs/transformers/internal/generation_utils @@ -540,9 +519,9 @@ def inference( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." + ) with torch.no_grad(): gpt_codes = self.gpt.generate( @@ -648,9 +627,9 @@ def inference_stream( sent = sent.strip().lower() text_tokens = torch.IntTensor(self.tokenizer.encode(sent, lang=language)).unsqueeze(0).to(self.device) - assert ( - text_tokens.shape[-1] < self.args.gpt_max_text_tokens - ), " ❗ XTTS can only generate text with a maximum of 400 tokens." + assert text_tokens.shape[-1] < self.args.gpt_max_text_tokens, ( + " ❗ XTTS can only generate text with a maximum of 400 tokens." + ) fake_inputs = self.gpt.compute_embeddings( gpt_cond_latent.to(self.device), @@ -738,14 +717,14 @@ def get_compatible_checkpoint_state_dict(self, model_path): def load_checkpoint( self, - config, - checkpoint_dir=None, - checkpoint_path=None, - vocab_path=None, - eval=True, - strict=True, - use_deepspeed=False, - speaker_file_path=None, + config: "XttsConfig", + checkpoint_dir: str | None = None, + checkpoint_path: str | None = None, + vocab_path: str | None = None, + eval: bool = True, + strict: bool = True, + use_deepspeed: bool = False, + speaker_file_path: str | None = None, ): """ Loads a checkpoint from disk and initializes the model's state and tokenizer. @@ -761,7 +740,9 @@ def load_checkpoint( Returns: None """ - + if checkpoint_dir is not None and Path(checkpoint_dir).is_file(): + msg = f"You passed a file to `checkpoint_dir=`. Use `checkpoint_path={checkpoint_dir}` instead." 
+ raise ValueError(msg) model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") if vocab_path is None: if checkpoint_dir is not None and (Path(checkpoint_dir) / "vocab.json").is_file(): diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 22e46b683a..d0269060c8 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -11,7 +11,7 @@ def _pad_data(x, length): def prepare_data(inputs): - max_len = max((len(x) for x in inputs)) + max_len = max(len(x) for x in inputs) return np.stack([_pad_data(x, max_len) for x in inputs]) @@ -23,7 +23,7 @@ def _pad_tensor(x, length): def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) + max_len = max(x.shape[1] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_tensor(x, pad_len) for x in inputs]) @@ -46,7 +46,7 @@ def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray: def prepare_stop_target(inputs, out_steps): """Pad row vectors with 1.""" - max_len = max((x.shape[0] for x in inputs)) + max_len = max(x.shape[0] for x in inputs) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index d1722501f7..a3648eff4b 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -33,7 +33,7 @@ def inverse_transform(self, X): # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): +def sequence_mask(sequence_length: torch.Tensor, max_len: int | None = None) -> torch.Tensor: """Create a sequence mask for filtering padding in a sequence tensor. Args: @@ -44,7 +44,7 @@ def sequence_mask(sequence_length, max_len=None): - mask: :math:`[B, T_max]` """ if max_len is None: - max_len = sequence_length.max() + max_len = int(sequence_length.max()) seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) # B x T_max return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) @@ -105,9 +105,9 @@ def rand_segments( _x_lenghts[len_diff < 0] = segment_size len_diff = _x_lenghts - segment_size else: - assert all( - len_diff > 0 - ), f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + assert all(len_diff > 0), ( + f" [!] At least one sample is shorter than the segment size ({segment_size}). \n {_x_lenghts}" + ) segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long() ret = segment(x, segment_indices, segment_size, pad_short=pad_short) return ret, segment_indices @@ -143,22 +143,75 @@ def convert_pad_shape(pad_shape: list[list]) -> list: return [item for sublist in l for item in sublist] -def generate_path(duration, mask): - """ +def generate_path(duration: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Generate alignment path based on the given segment durations. 
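
A small sketch of the retyped `sequence_mask` helper above (behavior unchanged; `max_len` is now coerced to `int`). Illustration only, not part of the patch:

```python
import torch

from TTS.tts.utils.helpers import sequence_mask

lengths = torch.tensor([2, 4, 3])
mask = sequence_mask(lengths)  # max_len inferred as int(lengths.max()) == 4
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True],
#         [ True,  True,  True, False]])
```
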
+ Shapes: - duration: :math:`[B, T_en]` - mask: :math:'[B, T_en, T_de]` - path: :math:`[B, T_en, T_de]` """ b, t_x, t_y = mask.shape - cum_duration = torch.cumsum(duration, 1) + cum_duration = torch.cumsum(duration, dim=1) cum_duration_flat = cum_duration.view(b * t_x) path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) path = path.view(b, t_x, t_y) path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path * mask - return path + return path * mask + + +def generate_attention( + duration: torch.Tensor, x_mask: torch.Tensor, y_mask: torch.Tensor | None = None +) -> torch.Tensor: + """Generate an attention map from the linear scale durations. + + Args: + duration (Tensor): Linear scale durations. + x_mask (Tensor): Mask for the input (character) sequence. + y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations + if None. Defaults to None. + + Shapes + - duration: :math:`(B, T_{en})` + - x_mask: :math:`(B, T_{en})` + - y_mask: :math:`(B, T_{de})` + """ + # compute decode mask from the durations + if y_mask is None: + y_lengths = duration.sum(dim=1).long() + y_lengths[y_lengths < 1] = 1 + y_mask = sequence_mask(y_lengths).unsqueeze(1).to(duration.dtype) + attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) + return generate_path(duration, attn_mask.squeeze(1)).to(duration.dtype) + + +def expand_encoder_outputs( + x: torch.Tensor, duration: torch.Tensor, x_mask: torch.Tensor, y_lengths: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Generate attention alignment map from durations and expand encoder outputs. + + Shapes: + - x: Encoder output :math:`(B, D_{en}, T_{en})` + - duration: :math:`(B, T_{en})` + - x_mask: :math:`(B, T_{en})` + - y_lengths: :math:`(B)` + + Examples:: + + encoder output: [a,b,c,d] + durations: [1, 3, 2, 1] + + expanded: [a, b, b, b, c, c, d] + attention map: [[0, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0]] + """ + y_mask = sequence_mask(y_lengths).unsqueeze(1).to(x.dtype) + attn = generate_attention(duration, x_mask, y_mask) + x_expanded = torch.einsum("kmn, kjm -> kjn", [attn.float(), x]) + return x_expanded, attn, y_mask def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0): diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index f134daf58e..5ce7759dd8 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Optional +from typing import Any, Optional import fsspec import numpy as np @@ -27,8 +27,8 @@ class LanguageManager(BaseIDManager): def __init__( self, - language_ids_file_path: str = "", - config: Coqpit = None, + language_ids_file_path: str | os.PathLike[Any] = "", + config: Coqpit | None = None, ): super().__init__(id_file_path=language_ids_file_path) @@ -40,11 +40,11 @@ def num_languages(self) -> int: return len(list(self.name_to_id.keys())) @property - def language_names(self) -> List: + def language_names(self) -> list: return list(self.name_to_id.keys()) @staticmethod - def parse_language_ids_from_config(c: Coqpit) -> Dict: + def parse_language_ids_from_config(c: Coqpit) -> dict: """Set language id from config. 
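
A standalone sketch of the expansion illustrated in the `expand_encoder_outputs` docstring above. It re-derives the alignment with plain cumulative sums instead of calling the new helpers, so it runs without a model (illustration only, not part of the patch):

```python
import torch

duration = torch.tensor([[1, 3, 2, 1]])           # (B=1, T_en=4)
t_de = int(duration.sum())                        # 7 decoder frames
cum = torch.cumsum(duration, dim=1)               # [[1, 4, 6, 7]]
frames = torch.arange(t_de)
# attn[b, i, j] == 1 when decoder frame j is assigned to input token i
attn = ((frames >= (cum - duration).unsqueeze(-1)) & (frames < cum.unsqueeze(-1))).float()
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]])        # encoder output (B, D_en=1, T_en=4)
expanded = torch.einsum("kmn,kjm->kjn", attn, x)  # -> [[[1., 2., 2., 2., 3., 3., 4.]]]
```
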
Args: @@ -70,13 +70,13 @@ def set_language_ids_from_config(self, c: Coqpit) -> None: self.name_to_id = self.parse_language_ids_from_config(c) @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Any: + def parse_ids_from_data(items: list, parse_key: str) -> Any: raise NotImplementedError - def set_ids_from_data(self, items: List, parse_key: str) -> Any: + def set_ids_from_data(self, items: list, parse_key: str) -> Any: raise NotImplementedError - def save_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save language IDs to a json file. Args: diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 6a2f7df67b..49e93454f2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -1,6 +1,7 @@ import json +import os import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any import fsspec import numpy as np @@ -12,7 +13,8 @@ from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -def load_file(path: str): +def load_file(path: str | os.PathLike[Any]): + path = str(path) if path.endswith(".json"): with fsspec.open(path, "r") as f: return json.load(f) @@ -23,7 +25,8 @@ def load_file(path: str): raise ValueError("Unsupported file type") -def save_file(obj: Any, path: str): +def save_file(obj: Any, path: str | os.PathLike[Any]): + path = str(path) if path.endswith(".json"): with fsspec.open(path, "w") as f: json.dump(obj, f, indent=4) @@ -39,23 +42,23 @@ class BaseIDManager: It defines common `ID` manager specific functions. """ - def __init__(self, id_file_path: str = ""): + def __init__(self, id_file_path: str | os.PathLike[Any] = ""): self.name_to_id = {} if id_file_path: self.load_ids_from_file(id_file_path) @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: + def _load_json(json_file_path: str | os.PathLike[Any]) -> dict: + with fsspec.open(str(json_file_path), "r") as f: return json.load(f) @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: + def _save_json(json_file_path: str | os.PathLike[Any], data: dict) -> None: + with fsspec.open(str(json_file_path), "w") as f: json.dump(data, f, indent=4) - def set_ids_from_data(self, items: List, parse_key: str) -> None: + def set_ids_from_data(self, items: list, parse_key: str) -> None: """Set IDs from data samples. Args: @@ -63,7 +66,7 @@ def set_ids_from_data(self, items: List, parse_key: str) -> None: """ self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key) - def load_ids_from_file(self, file_path: str) -> None: + def load_ids_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Set IDs from a file. Args: @@ -71,7 +74,7 @@ def load_ids_from_file(self, file_path: str) -> None: """ self.name_to_id = load_file(file_path) - def save_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save IDs to a json file. Args: @@ -93,7 +96,7 @@ def get_random_id(self) -> Any: return None @staticmethod - def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + def parse_ids_from_data(items: list, parse_key: str) -> tuple[dict]: """Parse IDs from data samples retured by `load_tts_samples()`. 
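
A short sketch of the path-like support added to the manager file helpers above (the file name is a placeholder; illustration only, not part of the patch):

```python
from pathlib import Path

from TTS.tts.utils.managers import load_file, save_file

ids_path = Path("speaker_ids.json")    # placeholder file name
save_file({"speaker_a": 0}, ids_path)  # ".json" suffix -> JSON writer via fsspec
assert load_file(ids_path) == {"speaker_a": 0}
```
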
Args: @@ -130,10 +133,10 @@ class EmbeddingManager(BaseIDManager): def __init__( self, - embedding_file_path: Union[str, List[str]] = "", - id_file_path: str = "", - encoder_model_path: str = "", - encoder_config_path: str = "", + embedding_file_path: str | os.PathLike[Any] | list[str | os.PathLike[Any]] = "", + id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__(id_file_path=id_file_path) @@ -176,7 +179,7 @@ def embedding_names(self): """Get embedding names.""" return list(self.embeddings_by_names.keys()) - def save_embeddings_to_file(self, file_path: str) -> None: + def save_embeddings_to_file(self, file_path: str | os.PathLike[Any]) -> None: """Save embeddings to a json file. Args: @@ -185,7 +188,7 @@ def save_embeddings_to_file(self, file_path: str) -> None: save_file(self.embeddings, file_path) @staticmethod - def read_embeddings_from_file(file_path: str): + def read_embeddings_from_file(file_path: str | os.PathLike[Any]): """Load embeddings from a json file. Args: @@ -204,7 +207,7 @@ def read_embeddings_from_file(file_path: str): embeddings_by_names[x["name"]].append(x["embedding"]) return name_to_id, clip_ids, embeddings, embeddings_by_names - def load_embeddings_from_file(self, file_path: str) -> None: + def load_embeddings_from_file(self, file_path: str | os.PathLike[Any]) -> None: """Load embeddings from a json file. Args: @@ -214,7 +217,7 @@ def load_embeddings_from_file(self, file_path: str) -> None: file_path ) - def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None: + def load_embeddings_from_list_of_files(self, file_paths: list[str | os.PathLike[Any]]) -> None: """Load embeddings from a list of json files and don't allow duplicate keys. Args: @@ -239,7 +242,7 @@ def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None: # reset name_to_id to get the right speaker ids self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)} - def get_embedding_by_clip(self, clip_idx: str) -> List: + def get_embedding_by_clip(self, clip_idx: str) -> list: """Get embedding by clip ID. Args: @@ -250,7 +253,7 @@ def get_embedding_by_clip(self, clip_idx: str) -> List: """ return self.embeddings[clip_idx]["embedding"] - def get_embeddings_by_name(self, idx: str) -> List[List]: + def get_embeddings_by_name(self, idx: str) -> list[list]: """Get all embeddings of a speaker. Args: @@ -261,7 +264,7 @@ def get_embeddings_by_name(self, idx: str) -> List[List]: """ return self.embeddings_by_names[idx] - def get_embeddings_by_names(self) -> Dict: + def get_embeddings_by_names(self) -> dict: """Get all embeddings by names. Returns: @@ -310,10 +313,12 @@ def get_random_embedding(self) -> Any: return None - def get_clips(self) -> List: + def get_clips(self) -> list: return sorted(self.embeddings.keys()) - def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None: + def init_encoder( + self, model_path: str | os.PathLike[Any], config_path: str | os.PathLike[Any], use_cuda=False + ) -> None: """Initialize a speaker encoder model. 
Args: @@ -325,11 +330,12 @@ def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> Non self.encoder_config = load_config(config_path) self.encoder = setup_encoder_model(self.encoder_config) self.encoder_criterion = self.encoder.load_checkpoint( - self.encoder_config, model_path, eval=True, use_cuda=use_cuda, cache=True + self.encoder_config, str(model_path), eval=True, use_cuda=use_cuda, cache=True ) self.encoder_ap = AudioProcessor(**self.encoder_config.audio) - def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list: + @torch.inference_mode() + def compute_embedding_from_clip(self, wav_file: str | os.PathLike[Any] | list[str | os.PathLike[Any]]) -> list: """Compute a embedding from a given audio file. Args: @@ -366,7 +372,7 @@ def _compute(wav_file: str): embedding = _compute(wav_file) return embedding[0].tolist() - def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + def compute_embeddings(self, feats: torch.Tensor | np.ndarray) -> list: """Compute embedding from features. Args: diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5229af81c5..6fab27de5a 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Any, Dict, List, Union +from typing import Any import fsspec import numpy as np @@ -56,11 +56,11 @@ class SpeakerManager(EmbeddingManager): def __init__( self, - data_items: List[List[Any]] = None, + data_items: list[list[Any]] | None = None, d_vectors_file_path: str = "", - speaker_id_file_path: str = "", - encoder_model_path: str = "", - encoder_config_path: str = "", + speaker_id_file_path: str | os.PathLike[Any] = "", + encoder_model_path: str | os.PathLike[Any] = "", + encoder_config_path: str | os.PathLike[Any] = "", use_cuda: bool = False, ): super().__init__( @@ -82,11 +82,11 @@ def num_speakers(self): def speaker_names(self): return list(self.name_to_id.keys()) - def get_speakers(self) -> List: + def get_speakers(self) -> list: return self.name_to_id @staticmethod - def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": + def init_from_config(config: "Coqpit", samples: list[list] | list[dict] = None) -> "SpeakerManager": """Initialize a speaker manager from config Args: @@ -150,7 +150,7 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: +def get_speaker_manager(c: Coqpit, data: list = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: """Initiate a `SpeakerManager` instance by the provided config. Args: @@ -185,9 +185,9 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.name_to_id speaker_manager.load_ids_from_file(speakers_file) - assert all( - speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data - ), " [!] You cannot introduce new speakers to a pre-trained model." + assert all(speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data), ( + " [!] You cannot introduce new speakers to a pre-trained model." + ) elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. 
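
A schematic sketch of the path-like encoder arguments above. The paths are placeholders, and it assumes the usual behavior that `EmbeddingManager` initializes the encoder when both encoder paths are given (illustration only, not part of the patch):

```python
from pathlib import Path

from TTS.tts.utils.managers import EmbeddingManager

manager = EmbeddingManager(
    encoder_model_path=Path("speaker_encoder/model.pth"),     # placeholder
    encoder_config_path=Path("speaker_encoder/config.json"),  # placeholder
)
# Runs under torch.inference_mode() after the change above:
embedding = manager.compute_embedding_from_clip(Path("clips/sample.wav"))  # placeholder clip
```
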
speaker_manager.load_embeddings_from_file(c.d_vector_file) diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index eddf05db3f..660370a832 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -1,6 +1,5 @@ # Adopted from https://github.com/photosynthesis-team/piq -from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -24,11 +23,11 @@ def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor: def _validate_input( - tensors: List[torch.Tensor], - dim_range: Tuple[int, int] = (0, -1), - data_range: Tuple[float, float] = (0.0, -1.0), + tensors: list[torch.Tensor], + dim_range: tuple[int, int] = (0, -1), + data_range: tuple[float, float] = (0.0, -1.0), # size_dim_range: Tuple[float, float] = (0., -1.), - size_range: Optional[Tuple[int, int]] = None, + size_range: tuple[int, int] | None = None, ) -> None: r"""Check that input(-s) satisfies the requirements Args: @@ -50,16 +49,16 @@ def _validate_input( if size_range is None: assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}" else: - assert ( - t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]] - ), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + assert t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]], ( + f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" + ) if dim_range[0] == dim_range[1]: assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}" elif dim_range[0] < dim_range[1]: - assert ( - dim_range[0] <= t.dim() <= dim_range[1] - ), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + assert dim_range[0] <= t.dim() <= dim_range[1], ( + f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" + ) if data_range[0] < data_range[1]: assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}" @@ -89,13 +88,13 @@ def ssim( y: torch.Tensor, kernel_size: int = 11, kernel_sigma: float = 1.5, - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, reduction: str = "mean", full: bool = False, downsample: bool = True, k1: float = 0.01, k2: float = 0.03, -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: r"""Interface of Structural Similarity (SSIM) index. Inputs supposed to be in range ``[0, data_range]``. To match performance with skimage and tensorflow set ``'downsample' = True``. @@ -218,7 +217,7 @@ def __init__( k2: float = 0.03, downsample: bool = True, reduction: str = "mean", - data_range: Union[int, float] = 1.0, + data_range: int | float = 1.0, ) -> None: super().__init__() @@ -270,7 +269,7 @@ def _ssim_per_channel( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. Args: @@ -286,8 +285,7 @@ def _ssim_per_channel( """ if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. 
Kernel size: {kernel.size()}" ) c1 = k1**2 @@ -321,7 +319,7 @@ def _ssim_per_channel_complex( kernel: torch.Tensor, k1: float = 0.01, k2: float = 0.03, -) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. Args: @@ -338,8 +336,7 @@ def _ssim_per_channel_complex( n_channels = x.size(1) if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): raise ValueError( - f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " - f"Kernel size: {kernel.size()}" + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. Kernel size: {kernel.size()}" ) c1 = k1**2 diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 797151c254..c09c3f5aa2 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,17 +1,12 @@ -from typing import Dict - import numpy as np import torch from torch import nn -def numpy_to_torch(np_array, dtype, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def numpy_to_torch(np_array: np.ndarray, dtype: torch.dtype, device: str | torch.device = "cpu") -> torch.Tensor | None: if np_array is None: return None - tensor = torch.as_tensor(np_array, dtype=dtype, device=device) - return tensor + return torch.as_tensor(np_array, dtype=dtype, device=device) def compute_style_mel(style_wav, ap, cuda=False, device="cpu"): @@ -32,7 +27,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, -) -> Dict: +) -> dict: """Run a torch model for inference. It does not support batch inference. Args: @@ -76,18 +71,14 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(aux_id, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def id_to_torch(aux_id, device: str | torch.device = "cpu") -> torch.Tensor | None: if aux_id is not None: aux_id = np.asarray(aux_id) aux_id = torch.from_numpy(aux_id).to(device) return aux_id -def embedding_to_torch(d_vector, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def embedding_to_torch(d_vector, device: str | torch.device = "cpu") -> torch.Tensor | None: if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py index cddcb00fd5..1537240380 100644 --- a/TTS/tts/utils/text/bangla/phonemizer.py +++ b/TTS/tts/utils/text/bangla/phonemizer.py @@ -45,7 +45,7 @@ def tag_text(text: str): # create start and end text = "start" + text + "end" # tag text - parts = re.split("[\u0600-\u06FF]+", text) + parts = re.split("[\u0600-\u06ff]+", text) # remove non chars parts = [p for p in parts if p.strip()] # unique parts diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 4bf9bf6bd5..f8beaef036 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -1,6 +1,5 @@ import logging from dataclasses import replace -from typing import Dict from TTS.tts.configs.shared_configs import CharactersConfig @@ -47,7 +46,7 @@ class BaseVocabulary: vocab (Dict): A dictionary of characters and their corresponding indices. 
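
A brief sketch of the reworked synthesis helpers, which now take a `device` argument instead of a `cuda` flag (illustration only, not part of the patch):

```python
import numpy as np
import torch

from TTS.tts.utils.synthesis import embedding_to_torch, numpy_to_torch

device = "cuda" if torch.cuda.is_available() else "cpu"
mel = numpy_to_torch(np.zeros((80, 100), dtype=np.float32), torch.float32, device=device)
d_vector = embedding_to_torch(np.random.rand(512).astype(np.float32), device=device)
```
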
""" - def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): + def __init__(self, vocab: dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None): self.vocab = vocab self.pad = pad self.blank = blank @@ -290,9 +289,9 @@ def _create_vocab(self): self.vocab = _vocab + list(self._punctuations) if self.is_unique: duplicates = {x for x in self.vocab if self.vocab.count(x) > 1} - assert ( - len(self.vocab) == len(self._char_to_id) == len(self._id_to_char) - ), f" [!] There are duplicate characters in the character set. {duplicates}" + assert len(self.vocab) == len(self._char_to_id) == len(self._id_to_char), ( + f" [!] There are duplicate characters in the character set. {duplicates}" + ) def char_to_id(self, char: str) -> int: try: diff --git a/TTS/tts/utils/text/chinese_mandarin/numbers.py b/TTS/tts/utils/text/chinese_mandarin/numbers.py index 4787ea6100..3e6a043918 100644 --- a/TTS/tts/utils/text/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/text/chinese_mandarin/numbers.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Licensed under WTFPL or the Unlicense or CC0. # This uses Python 3, but it's easy to port to Python 2 by changing diff --git a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py index e9d62e9d06..4dccdd5778 100644 --- a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py +++ b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py @@ -1,5 +1,3 @@ -from typing import List - try: import jieba import pypinyin @@ -9,7 +7,7 @@ from .pinyinToPhonemes import PINYIN_DICT -def _chinese_character_to_pinyin(text: str) -> List[str]: +def _chinese_character_to_pinyin(text: str) -> list[str]: pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True) pinyins_flat_list = [item for sublist in pinyins for item in sublist] return pinyins_flat_list @@ -25,9 +23,9 @@ def _chinese_pinyin_to_phoneme(pinyin: str) -> str: def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: tokenized_text = jieba.cut(text, HMM=False) tokenized_text = " ".join(tokenized_text) - pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + pinyined_text: list[str] = _chinese_character_to_pinyin(tokenized_text) - results: List[str] = [] + results: list[str] = [] for token in pinyined_text: if token[-1] in "12345": # TODO transform to is_pinyin() diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index f496b9f0dd..795ab246d2 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,7 +1,6 @@ """Set of default text cleaners""" import re -from typing import Optional from unicodedata import normalize from anyascii import anyascii @@ -47,7 +46,7 @@ def remove_aux_symbols(text: str) -> str: return text -def replace_symbols(text: str, lang: Optional[str] = "en") -> str: +def replace_symbols(text: str, lang: str | None = "en") -> str: """Replace symbols based on the language tag. 
Args: diff --git a/TTS/tts/utils/text/cmudict.py b/TTS/tts/utils/text/cmudict.py index f206fb043b..9c0df06196 100644 --- a/TTS/tts/utils/text/cmudict.py +++ b/TTS/tts/utils/text/cmudict.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import re VALID_SYMBOLS = [ @@ -121,7 +119,7 @@ def get_arpabet(word, cmudict, punctuation_symbols): word = word[:-1] arpabet = cmudict.lookup(word) if arpabet is not None: - return first_symbol + "{%s}" % arpabet[0] + last_symbol + return first_symbol + "{%s}" % arpabet[0] + last_symbol # noqa: UP031 return first_symbol + word + last_symbol diff --git a/TTS/tts/utils/text/english/abbreviations.py b/TTS/tts/utils/text/english/abbreviations.py index cd93c13c8e..20042b255b 100644 --- a/TTS/tts/utils/text/english/abbreviations.py +++ b/TTS/tts/utils/text/english/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in english: abbreviations_en = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), diff --git a/TTS/tts/utils/text/english/number_norm.py b/TTS/tts/utils/text/english/number_norm.py index e8377ede87..be2a4b3084 100644 --- a/TTS/tts/utils/text/english/number_norm.py +++ b/TTS/tts/utils/text/english/number_norm.py @@ -1,7 +1,6 @@ -""" from https://github.com/keithito/tacotron """ +"""from https://github.com/keithito/tacotron""" import re -from typing import Dict import inflect @@ -21,7 +20,7 @@ def _expand_decimal_point(m): return m.group(1).replace(".", " point ") -def __expand_currency(value: str, inflection: Dict[float, str]) -> str: +def __expand_currency(value: str, inflection: dict[float, str]) -> str: parts = value.replace(",", "").split(".") if len(parts) > 2: return f"{value} {inflection[2]}" # Unexpected format @@ -85,7 +84,11 @@ def _expand_number(m): if num % 100 == 0: return _inflect.number_to_words(num // 100) + " hundred" return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") - return _inflect.number_to_words(num, andword="") + try: + text = _inflect.number_to_words(num, andword="") + except inflect.NumOutOfRangeError: + text = _inflect.number_to_words(num, group=1).replace(", ", " ") + return text def normalize_numbers(text): diff --git a/TTS/tts/utils/text/french/abbreviations.py b/TTS/tts/utils/text/french/abbreviations.py index f580dfed7b..e317bbbf3a 100644 --- a/TTS/tts/utils/text/french/abbreviations.py +++ b/TTS/tts/utils/text/french/abbreviations.py @@ -2,7 +2,7 @@ # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + (re.compile(f"\\b{x[0]}\\.", re.IGNORECASE), x[1]) for x in [ ("M", "monsieur"), ("Mlle", "mademoiselle"), @@ -38,7 +38,7 @@ ("boul", "boulevard"), ] ] + [ - (re.compile("\\b%s" % x[0]), x[1]) + (re.compile(f"\\b{x[0]}"), x[1]) for x in [ ("Mlle", "mademoiselle"), ("Mlles", "mesdemoiselles"), diff --git a/TTS/tts/utils/text/korean/ko_dictionary.py b/TTS/tts/utils/text/korean/ko_dictionary.py index 9b739339c6..706f9f5daf 100644 --- a/TTS/tts/utils/text/korean/ko_dictionary.py +++ b/TTS/tts/utils/text/korean/ko_dictionary.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Add the word you want to the dictionary. 
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} diff --git a/TTS/tts/utils/text/korean/korean.py b/TTS/tts/utils/text/korean/korean.py index 423aeed377..1b1e0ca0fb 100644 --- a/TTS/tts/utils/text/korean/korean.py +++ b/TTS/tts/utils/text/korean/korean.py @@ -1,4 +1,3 @@ -# coding: utf-8 # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py import re diff --git a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py index 3c4a35bbfa..3be7354636 100644 --- a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_bn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"bn": "Bangla"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 5e701df458..6cc6ec0b37 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -1,6 +1,5 @@ import abc import logging -from typing import List, Tuple from TTS.tts.utils.text.punctuation import Punctuation @@ -37,7 +36,7 @@ class BasePhonemizer(abc.ABC): def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False): # ensure the backend is installed on the system if not self.is_available(): - raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover + raise RuntimeError(f"{self.name()} not installed on your system") # pragma: nocover # ensure the backend support the requested language self._language = self._init_language(language) @@ -53,7 +52,7 @@ def _init_language(self, language): """ if not self.is_supported_language(language): - raise RuntimeError(f'language "{language}" is not supported by the ' f"{self.name()} backend") + raise RuntimeError(f'language "{language}" is not supported by the {self.name()} backend') return language @property @@ -93,7 +92,7 @@ def is_supported_language(self, language): def _phonemize(self, text, separator): """The main phonemization method""" - def _phonemize_preprocess(self, text) -> Tuple[List[str], List]: + def _phonemize_preprocess(self, text) -> tuple[list[str], list]: """Preprocess the text before phonemization 1. 
remove spaces diff --git a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py index e5fcab6e09..fa4a515d1a 100644 --- a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -34,7 +32,7 @@ def _phonemize(self, text, separator): return self.phonemize_be(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"be": "Belarusian"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index a15df716e7..dbcb8994a7 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -5,7 +5,6 @@ import subprocess import tempfile from pathlib import Path -from typing import Optional from packaging.version import Version @@ -104,7 +103,7 @@ class ESpeak(BasePhonemizer): def __init__( self, language: str, - backend: Optional[str] = None, + backend: str | None = None, punctuations: str = Punctuation.default_puncs(), keep_puncs: bool = True, ): @@ -184,7 +183,7 @@ def phonemize_espeak(self, text: str, separator: str = "|", *, tie: bool = False else: args.append("--ipa=1") if tie: - args.append("--tie=%s" % tie) + args.append(f"--tie={tie}") tmp = tempfile.NamedTemporaryFile(mode="w+t", delete=False, encoding="utf8") tmp.write(text) diff --git a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py index f3e9c9abd4..836fccf5b8 100644 --- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py @@ -1,5 +1,4 @@ import importlib -from typing import List import gruut from gruut_ipa import IPA @@ -114,7 +113,7 @@ def is_supported_language(self, language): return gruut.is_language_supported(language) @staticmethod - def supported_languages() -> List: + def supported_languages() -> list: """Get a dictionary of supported languages. 
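
For context, a usage sketch of the ESpeak wrapper touched above; the constructor API is unchanged and `espeak` or `espeak-ng` must be installed on the system (illustration only, not part of the patch):

```python
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak

phonemizer = ESpeak(language="en-us")  # raises at construction if espeak is missing
print(phonemizer.phonemize("hello world", separator="|"))
```
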
Returns: diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 878e5e5296..b3b3ba4db7 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -51,7 +49,7 @@ def phonemize(self, text: str, separator="|", language=None) -> str: return self._phonemize(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ja-jp": "Japanese (Japan)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py index 0bdba2137b..93930d064e 100644 --- a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -44,7 +42,7 @@ def phonemize(self, text: str, separator: str = "", character: str = "hangeul", return self._phonemize(text, separator, character) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"ko-kr": "hangeul(korean)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index 1a9e98b091..87fb940f6b 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name @@ -19,7 +18,7 @@ class MultiPhonemizer: lang_to_phonemizer = {} - def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value + def __init__(self, lang_to_phonemizer_name: dict = {}) -> None: # pylint: disable=dangerous-default-value for k, v in lang_to_phonemizer_name.items(): if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys(): lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k] @@ -29,7 +28,7 @@ def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disab self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @staticmethod - def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: + def init_phonemizers(lang_to_phonemizer_name: dict) -> dict: lang_to_phonemizer = {} for k, v in lang_to_phonemizer_name.items(): lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k) @@ -44,7 +43,7 @@ def phonemize(self, text, separator="|", language=""): raise ValueError("Language must be set for multi-phonemizer to phonemize.") return self.lang_to_phonemizer[language].phonemize(text, separator) - def supported_languages(self) -> List: + def supported_languages(self) -> list: return list(self.lang_to_phonemizer.keys()) def print_logs(self, level: int = 0): diff --git a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py index 41480c4173..9e70b03a0c 100644 --- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py @@ -1,5 +1,3 @@ -from typing import Dict - from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes 
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer @@ -41,7 +39,7 @@ def _phonemize(self, text, separator): return self.phonemize_zh_cn(text, separator) @staticmethod - def supported_languages() -> Dict: + def supported_languages() -> dict: return {"zh-cn": "Chinese (China)"} def version(self) -> str: diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f653cdf13f..07a8753884 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -1,5 +1,6 @@ import logging -from typing import Callable, Dict, List, Union +from collections.abc import Callable +from typing import Union from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes @@ -43,7 +44,7 @@ def __init__( use_phonemes=False, text_cleaner: Callable = None, characters: "BaseCharacters" = None, - phonemizer: Union["Phonemizer", Dict] = None, + phonemizer: Union["Phonemizer", dict] = None, add_blank: bool = False, use_eos_bos=False, ): @@ -65,7 +66,7 @@ def characters(self, new_characters): self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None - def encode(self, text: str) -> List[int]: + def encode(self, text: str) -> list[int]: """Encodes a string of text as a sequence of IDs.""" token_ids = [] for char in text: @@ -80,14 +81,14 @@ def encode(self, text: str) -> List[int]: logger.warning("Character %s not found in the vocabulary. Discarding it.", repr(char)) return token_ids - def decode(self, token_ids: List[int]) -> str: + def decode(self, token_ids: list[int]) -> str: """Decodes a sequence of IDs to a string of text.""" text = "" for token_id in token_ids: text += self.characters.id_to_char(token_id) return text - def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument + def text_to_ids(self, text: str, language: str = None) -> list[int]: # pylint: disable=unused-argument """Converts a string of text to a sequence of token IDs. Args: @@ -121,15 +122,15 @@ def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: text = self.pad_with_bos_eos(text) return text - def ids_to_text(self, id_sequence: List[int]) -> str: + def ids_to_text(self, id_sequence: list[int]) -> str: """Converts a sequence of token IDs to a string of text.""" return self.decode(id_sequence) - def pad_with_bos_eos(self, char_sequence: List[str]): + def pad_with_bos_eos(self, char_sequence: list[str]): """Pads a sequence with the special BOS and EOS characters.""" return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id] - def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False): + def intersperse_blank_char(self, char_sequence: list[str], use_blank_char: bool = False): """Intersperses the blank character between characters in a sequence. Use the ```blank``` character if defined else use the ```pad``` character. 
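
A round-trip sketch of the retyped `encode`/`decode` pair above, using the default grapheme character set (illustration only, not part of the patch):

```python
from TTS.tts.utils.text.characters import Graphemes
from TTS.tts.utils.text.tokenizer import TTSTokenizer

tokenizer = TTSTokenizer(characters=Graphemes())
token_ids = tokenizer.encode("hello")          # list[int]
assert tokenizer.decode(token_ids) == "hello"
```
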
@@ -163,7 +164,7 @@ def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None): """ # init cleaners text_cleaner = None - if isinstance(config.text_cleaner, (str, list)): + if isinstance(config.text_cleaner, str | list): text_cleaner = getattr(cleaners, config.text_cleaner) # init characters diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 203091ea88..7fd4259178 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,6 +1,7 @@ import logging +import os from io import BytesIO -from typing import Optional +from typing import Any import librosa import numpy as np @@ -20,7 +21,7 @@ def build_mel_basis( fft_size: int, num_mels: int, mel_fmin: int, - mel_fmax: Optional[int] = None, + mel_fmax: int | None = None, **kwargs, ) -> np.ndarray: """Build melspectrogram basis. @@ -59,7 +60,7 @@ def _exp(x, base): return np.exp(x) -def amp_to_db(*, x: np.ndarray, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray: +def amp_to_db(*, x: np.ndarray, gain: float = 1, base: float = 10, **kwargs) -> np.ndarray: """Convert amplitude values to decibels. Args: @@ -176,8 +177,8 @@ def stft( *, y: np.ndarray, fft_size: int, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, pad_mode: str = "reflect", window: str = "hann", center: bool = True, @@ -204,8 +205,8 @@ def stft( def istft( *, y: np.ndarray, - hop_length: Optional[int] = None, - win_length: Optional[int] = None, + hop_length: int | None = None, + win_length: int | None = None, window: str = "hann", center: bool = True, **kwargs, @@ -247,8 +248,8 @@ def compute_stft_paddings(*, x: np.ndarray, hop_length: int, pad_two_sides: bool def compute_f0( *, x: np.ndarray, - pitch_fmax: Optional[float] = None, - pitch_fmin: Optional[float] = None, + pitch_fmax: float | None = None, + pitch_fmin: float | None = None, hop_length: int, win_length: int, sample_rate: int, @@ -406,7 +407,9 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n return rms_norm(wav=x, db_level=db_level) -def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool = False, **kwargs) -> np.ndarray: +def load_wav( + *, filename: str | os.PathLike[Any], sample_rate: int | None = None, resample: bool = False, **kwargs +) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -434,7 +437,7 @@ def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool def save_wav( *, wav: np.ndarray, - path: str, + path: str | os.PathLike[Any], sample_rate: int, pipe_out=None, do_rms_norm: bool = False, diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 1d8fed8e39..55b8575aa4 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,5 +1,6 @@ import logging -from typing import Optional +import os +from typing import Any import librosa import numpy as np @@ -221,9 +222,9 @@ def __init__( self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert ( - self.win_length <= self.fft_size - ), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + assert self.win_length <= self.fft_size, ( + f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + ) members = vars(self) logger.info("Setting up Audio Processor...") for key, value in members.items(): @@ -282,7 +283,9 @@ def normalize(self, S: np.ndarray) -> np.ndarray: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm if self.clip_norm: S_norm = np.clip( - S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_norm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) return S_norm S_norm = self.max_norm * S_norm @@ -317,7 +320,9 @@ def denormalize(self, S: np.ndarray) -> np.ndarray: if self.symmetric_norm: if self.clip_norm: S_denorm = np.clip( - S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + S_denorm, + -self.max_norm, # pylint: disable=invalid-unary-operand-type + self.max_norm, ) S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db return S_denorm + self.ref_level_db @@ -350,9 +355,9 @@ def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np. if key in skip_parameters: continue if key not in ["sample_rate", "trim_db"]: - assert ( - stats_config[key] == self.__dict__[key] - ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + assert stats_config[key] == self.__dict__[key], ( + f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + ) return mel_mean, mel_std, linear_mean, linear_std, stats_config # pylint: disable=attribute-defined-outside-init @@ -548,7 +553,7 @@ def sound_norm(x: np.ndarray) -> np.ndarray: return volume_norm(x=x) ### save and load ### - def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray: + def load_wav(self, filename: str | os.PathLike[Any], sr: int | None = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -575,7 +580,7 @@ def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray: x = rms_volume_norm(x=x, db_level=self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: Optional[int] = None, pipe_out=None) -> None: + def save_wav(self, wav: np.ndarray, path: str | os.PathLike[Any], sr: int | None = None, pipe_out=None) -> None: """Save a waveform to a file using Scipy. 
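
A schematic sketch of the path-like audio I/O above; the wav path is a placeholder (illustration only, not part of the patch):

```python
from pathlib import Path

from TTS.utils.audio.processor import AudioProcessor

ap = AudioProcessor(sample_rate=22050)
wav = ap.load_wav(Path("clips/sample.wav"))           # placeholder file
ap.save_wav(wav, Path("sample_copy.wav"), sr=22050)
```
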
Args: diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py index 632969c51a..59bb23cc4f 100644 --- a/TTS/utils/audio/torch_transforms.py +++ b/TTS/utils/audio/torch_transforms.py @@ -1,7 +1,113 @@ +import logging + import librosa import torch from torch import nn +logger = logging.getLogger(__name__) + + +hann_window = {} +mel_basis = {} + + +def amp_to_db(x: torch.Tensor, *, spec_gain: float = 1.0, clip_val: float = 1e-5) -> torch.Tensor: + """Spectral normalization / dynamic range compression.""" + return torch.log(torch.clamp(x, min=clip_val) * spec_gain) + + +def db_to_amp(x: torch.Tensor, *, spec_gain: float = 1.0) -> torch.Tensor: + """Spectral denormalization / dynamic range decompression.""" + return torch.exp(x) / spec_gain + + +def wav_to_spec(y: torch.Tensor, n_fft: int, hop_length: int, win_length: int, *, center: bool = False) -> torch.Tensor: + """ + Args Shapes: + - y : :math:`[B, 1, T]` + + Return Shapes: + - spec : :math:`[B,C,T]` + """ + y = y.squeeze(1) + + if torch.min(y) < -1.0: + logger.info("min value is %.3f", torch.min(y)) + if torch.max(y) > 1.0: + logger.info("max value is %.3f", torch.max(y)) + + global hann_window + wnsize_dtype_device = f"{win_length}_{y.dtype}_{y.device}" + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + return torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + +def spec_to_mel( + spec: torch.Tensor, n_fft: int, num_mels: int, sample_rate: int, fmin: float, fmax: float +) -> torch.Tensor: + """ + Args Shapes: + - spec : :math:`[B,C,T]` + + Return Shapes: + - mel : :math:`[B,C,T]` + """ + global mel_basis + fmax_dtype_device = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}" + if fmax_dtype_device not in mel_basis: + # TODO: switch librosa to torchaudio + mel = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + mel = torch.matmul(mel_basis[fmax_dtype_device], spec) + return amp_to_db(mel) + + +def wav_to_mel( + y: torch.Tensor, + n_fft: int, + num_mels: int, + sample_rate: int, + hop_length: int, + win_length: int, + fmin: float, + fmax: float, + *, + center: bool = False, +) -> torch.Tensor: + """ + Args Shapes: + - y : :math:`[B, 1, T]` + + Return Shapes: + - spec : :math:`[B,C,T]` + """ + spec = wav_to_spec(y, n_fft, hop_length, win_length, center=center) + return spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax) + class TorchSTFT(nn.Module): # pylint: disable=abstract-method """Some of the audio processing funtions using Torch for faster batch processing. 
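
A minimal sketch of the new functional mel helpers added above, with signatures taken from the hunk and illustrative parameter values (illustration only, not part of the patch):

```python
import torch

from TTS.utils.audio.torch_transforms import spec_to_mel, wav_to_mel, wav_to_spec

y = torch.rand(2, 1, 22050) * 2 - 1  # (B, 1, T) waveform batch in [-1, 1]
spec = wav_to_spec(y, n_fft=1024, hop_length=256, win_length=1024, center=False)
mel = spec_to_mel(spec, n_fft=1024, num_mels=80, sample_rate=22050, fmin=0, fmax=8000)
# Equivalent one-step call:
mel_direct = wav_to_mel(
    y, n_fft=1024, num_mels=80, sample_rate=22050,
    hop_length=256, win_length=1024, fmin=0, fmax=8000, center=False,
)
```
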
@@ -157,11 +263,3 @@ def _build_mel_basis(self): norm=self.mel_norm, ) self.mel_basis = torch.from_numpy(mel_basis).float() - - @staticmethod - def _amp_to_db(x, spec_gain=1.0): - return torch.log(torch.clamp(x, min=1e-5) * spec_gain) - - @staticmethod - def _db_to_amp(x, spec_gain=1.0): - return torch.exp(x) / spec_gain diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py deleted file mode 100644 index 511d215c65..0000000000 --- a/TTS/utils/callbacks.py +++ /dev/null @@ -1,105 +0,0 @@ -class TrainerCallback: - @staticmethod - def on_init_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_start"): - trainer.model.module.on_init_start(trainer) - else: - if hasattr(trainer.model, "on_init_start"): - trainer.model.on_init_start(trainer) - - if hasattr(trainer.criterion, "on_init_start"): - trainer.criterion.on_init_start(trainer) - - if hasattr(trainer.optimizer, "on_init_start"): - trainer.optimizer.on_init_start(trainer) - - @staticmethod - def on_init_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_init_end"): - trainer.model.module.on_init_end(trainer) - else: - if hasattr(trainer.model, "on_init_end"): - trainer.model.on_init_end(trainer) - - if hasattr(trainer.criterion, "on_init_end"): - trainer.criterion.on_init_end(trainer) - - if hasattr(trainer.optimizer, "on_init_end"): - trainer.optimizer.on_init_end(trainer) - - @staticmethod - def on_epoch_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_start"): - trainer.model.module.on_epoch_start(trainer) - else: - if hasattr(trainer.model, "on_epoch_start"): - trainer.model.on_epoch_start(trainer) - - if hasattr(trainer.criterion, "on_epoch_start"): - trainer.criterion.on_epoch_start(trainer) - - if hasattr(trainer.optimizer, "on_epoch_start"): - trainer.optimizer.on_epoch_start(trainer) - - @staticmethod - def on_epoch_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_epoch_end"): - trainer.model.module.on_epoch_end(trainer) - else: - if hasattr(trainer.model, "on_epoch_end"): - trainer.model.on_epoch_end(trainer) - - if hasattr(trainer.criterion, "on_epoch_end"): - trainer.criterion.on_epoch_end(trainer) - - if hasattr(trainer.optimizer, "on_epoch_end"): - trainer.optimizer.on_epoch_end(trainer) - - @staticmethod - def on_train_step_start(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_start"): - trainer.model.module.on_train_step_start(trainer) - else: - if hasattr(trainer.model, "on_train_step_start"): - trainer.model.on_train_step_start(trainer) - - if hasattr(trainer.criterion, "on_train_step_start"): - trainer.criterion.on_train_step_start(trainer) - - if hasattr(trainer.optimizer, "on_train_step_start"): - trainer.optimizer.on_train_step_start(trainer) - - @staticmethod - def on_train_step_end(trainer) -> None: - if hasattr(trainer.model, "module"): - if hasattr(trainer.model.module, "on_train_step_end"): - trainer.model.module.on_train_step_end(trainer) - else: - if hasattr(trainer.model, "on_train_step_end"): - trainer.model.on_train_step_end(trainer) - - if hasattr(trainer.criterion, "on_train_step_end"): - trainer.criterion.on_train_step_end(trainer) - - if hasattr(trainer.optimizer, "on_train_step_end"): - trainer.optimizer.on_train_step_end(trainer) - - @staticmethod - def on_keyboard_interrupt(trainer) -> None: - if hasattr(trainer.model, 
"module"): - if hasattr(trainer.model.module, "on_keyboard_interrupt"): - trainer.model.module.on_keyboard_interrupt(trainer) - else: - if hasattr(trainer.model, "on_keyboard_interrupt"): - trainer.model.on_keyboard_interrupt(trainer) - - if hasattr(trainer.criterion, "on_keyboard_interrupt"): - trainer.criterion.on_keyboard_interrupt(trainer) - - if hasattr(trainer.optimizer, "on_keyboard_interrupt"): - trainer.optimizer.on_keyboard_interrupt(trainer) diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py index 7206ffd508..01f303f98d 100644 --- a/TTS/utils/capacitron_optimizer.py +++ b/TTS/utils/capacitron_optimizer.py @@ -1,4 +1,4 @@ -from typing import Generator +from collections.abc import Generator from trainer.trainer_utils import get_optimizer diff --git a/TTS/utils/download.py b/TTS/utils/download.py index e94b1d68c8..75ef9164f6 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -7,8 +7,9 @@ import urllib import urllib.request import zipfile +from collections.abc import Iterable from os.path import expanduser -from typing import Any, Iterable, List, Optional +from typing import Any from torch.utils.model_zoo import tqdm @@ -16,7 +17,7 @@ def stream_url( - url: str, start_byte: Optional[int] = None, block_size: int = 32 * 1024, progress_bar: bool = True + url: str, start_byte: int | None = None, block_size: int = 32 * 1024, progress_bar: bool = True ) -> Iterable: """Stream url by chunk @@ -36,7 +37,7 @@ def stream_url( req = urllib.request.Request(url) if start_byte: - req.headers["Range"] = "bytes={}-".format(start_byte) + req.headers["Range"] = f"bytes={start_byte}-" with ( urllib.request.urlopen(req) as upointer, @@ -61,8 +62,8 @@ def stream_url( def download_url( url: str, download_folder: str, - filename: Optional[str] = None, - hash_value: Optional[str] = None, + filename: str | None = None, + hash_value: str | None = None, hash_type: str = "sha256", progress_bar: bool = True, resume: bool = False, @@ -88,10 +89,10 @@ def download_url( filepath = os.path.join(download_folder, filename) if resume and os.path.exists(filepath): mode = "ab" - local_size: Optional[int] = os.path.getsize(filepath) + local_size: int | None = os.path.getsize(filepath) elif not resume and os.path.exists(filepath): - raise RuntimeError("{} already exists. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"{filepath} already exists. Delete the file manually and retry.") else: mode = "wb" local_size = None @@ -100,7 +101,7 @@ def download_url( with open(filepath, "rb") as file_obj: if validate_file(file_obj, hash_value, hash_type): return - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. Delete the file manually and retry.") with open(filepath, mode) as fpointer: for chunk in stream_url(url, start_byte=local_size, progress_bar=progress_bar): @@ -108,7 +109,7 @@ def download_url( with open(filepath, "rb") as file_obj: if hash_value and not validate_file(file_obj, hash_value, hash_type): - raise RuntimeError("The hash of {} does not match. Delete the file manually and retry.".format(filepath)) + raise RuntimeError(f"The hash of {filepath} does not match. 
Delete the file manually and retry.") def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> bool: @@ -140,7 +141,7 @@ def validate_file(file_obj: Any, hash_value: str, hash_type: str = "sha256") -> return hash_func.hexdigest() == hash_value -def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bool = False) -> List[str]: +def extract_archive(from_path: str, to_path: str | None = None, overwrite: bool = False) -> list[str]: """Extract archive. Args: from_path (str): the path of the archive. diff --git a/TTS/utils/downloaders.py b/TTS/utils/downloaders.py index 8705873982..c06c2649ad 100644 --- a/TTS/utils/downloaders.py +++ b/TTS/utils/downloaders.py @@ -1,6 +1,5 @@ import logging import os -from typing import Optional from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive @@ -21,7 +20,7 @@ def download_ljspeech(path: str): extract_archive(archive) -def download_vctk(path: str, use_kaggle: Optional[bool] = False): +def download_vctk(path: str, use_kaggle: bool | None = False): """Download and extract VCTK dataset. Args: @@ -49,7 +48,7 @@ def download_tweb(path: str): download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path) -def download_libri_tts(path: str, subset: Optional[str] = "all"): +def download_libri_tts(path: str, subset: str | None = "all"): """Download and extract libri tts dataset. Args: diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 3ee285232f..e1df6f6ed4 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -1,22 +1,37 @@ -# -*- coding: utf-8 -*- import datetime import importlib import logging +import os import re +from collections.abc import Callable from pathlib import Path -from typing import Dict, Optional +from typing import Any, TextIO, TypeVar import torch from packaging.version import Version +from typing_extensions import TypeIs logger = logging.getLogger(__name__) +_T = TypeVar("_T") + + +def exists(val: _T | None) -> TypeIs[_T]: + return val is not None + + +def default(val: _T | None, d: _T | Callable[[], _T]) -> _T: + if exists(val): + return val + return d() if callable(d) else d + def to_camel(text): text = text.capitalize() text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) text = text.replace("Tts", "TTS") text = text.replace("vc", "VC") + text = text.replace("Knn", "KNN") return text @@ -54,26 +69,7 @@ def get_import_path(obj: object) -> str: return ".".join([type(obj).__module__, type(obj).__name__]) -def set_init_dict(model_dict, checkpoint_state, c): - # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint_state.items(): - if k not in model_dict: - logger.warning("Layer missing in the model finition %s", k) - # 1. filter out unnecessary keys - pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} - # 2. filter out different size layers - pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()} - # 3. skip reinit layers - if c.has("reinit_layers") and c.reinit_layers is not None: - for reinit_layer_name in c.reinit_layers: - pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} - # 4. 
overwrite entries in the existing state dict - model_dict.update(pretrained_dict) - logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) - return model_dict - - -def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: +def format_aux_input(def_args: dict, kwargs: dict) -> dict: """Format kwargs to hande auxilary inputs to models. Args: @@ -84,9 +80,9 @@ def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: Dict: arguments with formatted auxilary inputs. """ kwargs = kwargs.copy() - for name in def_args: + for name, arg in def_args.items(): if name not in kwargs or kwargs[name] is None: - kwargs[name] = def_args[name] + kwargs[name] = arg return kwargs @@ -112,26 +108,35 @@ def setup_logger( logger_name: str, level: int = logging.INFO, *, - formatter: Optional[logging.Formatter] = None, - screen: bool = False, - tofile: bool = False, - log_dir: str = "logs", + formatter: logging.Formatter | None = None, + stream: TextIO | None = None, + log_dir: str | os.PathLike[Any] | None = None, log_name: str = "log", ) -> None: + """Set up a logger. + + Args: + logger_name: Name of the logger to set up + level: Logging level + formatter: Formatter for the logger + stream: Add a StreamHandler for the given stream, e.g. sys.stderr or sys.stdout + log_dir: Folder to write the log file (no file created if None) + log_name: Prefix of the log file name + """ lg = logging.getLogger(logger_name) if formatter is None: formatter = logging.Formatter( "%(asctime)s.%(msecs)03d - %(levelname)-8s - %(name)s: %(message)s", datefmt="%y-%m-%d %H:%M:%S" ) lg.setLevel(level) - if tofile: + if log_dir is not None: Path(log_dir).mkdir(exist_ok=True, parents=True) log_file = Path(log_dir) / f"{log_name}_{get_timestamp()}.log" fh = logging.FileHandler(log_file, mode="w") fh.setFormatter(formatter) lg.addHandler(fh) - if screen: - sh = logging.StreamHandler() + if stream is not None: + sh = logging.StreamHandler(stream) sh.setFormatter(formatter) lg.addHandler(sh) @@ -139,3 +144,8 @@ def setup_logger( def is_pytorch_at_least_2_4() -> bool: """Check if the installed Pytorch version is 2.4 or higher.""" return Version(torch.__version__) >= Version("2.4") + + +def optional_to_str(x: Any | None) -> str: + """Convert input to string, using empty string if input is None.""" + return "" if x is None else str(x) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index bd445b3a2f..20d6ab226b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -6,17 +6,36 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Dict, Tuple +from typing import Any, TypedDict import fsspec import requests from tqdm import tqdm from trainer.io import get_user_data_dir +from typing_extensions import Required from TTS.config import load_config, read_json_with_comments +from TTS.vc.configs.knnvc_config import KNNVCConfig logger = logging.getLogger(__name__) + +class ModelItem(TypedDict, total=False): + model_name: Required[str] + model_type: Required[str] + description: str + license: str + author: str + contact: str + commit: str | None + model_hash: str + tos_required: bool + default_vocoder: str | None + model_url: str | list[str] + github_rls_url: str | list[str] + hf_url: list[str] + + LICENSE_URLS = { "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", @@ -30,7 +49,7 @@ } -class ModelManager(object): +class ModelManager: tqdm_progress = None """Manage TTS models defined in .models.json. 
It provides an interface to list and download @@ -40,19 +59,24 @@ class ModelManager(object): home path. Args: - models_file (str): path to .model.json file. Defaults to None. - output_prefix (str): prefix to `tts` to download models. Defaults to None + models_file (str or Path): path to .model.json file. Defaults to None. + output_prefix (str or Path): prefix to `tts` to download models. Defaults to None progress_bar (bool): print a progress bar when donwloading a file. Defaults to False. """ - def __init__(self, models_file=None, output_prefix=None, progress_bar=False): + def __init__( + self, + models_file: str | os.PathLike[Any] | None = None, + output_prefix: str | os.PathLike[Any] | None = None, + progress_bar: bool = False, + ) -> None: super().__init__() self.progress_bar = progress_bar if output_prefix is None: self.output_prefix = get_user_data_dir("tts") else: - self.output_prefix = os.path.join(output_prefix, "tts") - self.models_dict = None + self.output_prefix = Path(output_prefix) / "tts" + self.models_dict = {} if models_file is not None: self.read_models_file(models_file) else: @@ -60,7 +84,7 @@ def __init__(self, models_file=None, output_prefix=None, progress_bar=False): path = Path(__file__).parent / "../.models.json" self.read_models_file(path) - def read_models_file(self, file_path): + def read_models_file(self, file_path: str | os.PathLike[Any]) -> None: """Read .models.json as a dict Args: @@ -68,7 +92,7 @@ def read_models_file(self, file_path): """ self.models_dict = read_json_with_comments(file_path) - def _list_models(self, model_type, model_count=0): + def _list_models(self, model_type: str, model_count: int = 0) -> list[str]: logger.info("") logger.info("Name format: type/language/dataset/model") model_list = [] @@ -83,21 +107,23 @@ def _list_models(self, model_type, model_count=0): model_count += 1 return model_list - def _list_for_model_type(self, model_type): + def _list_for_model_type(self, model_type: str) -> list[str]: models_name_list = [] model_count = 1 models_name_list.extend(self._list_models(model_type, model_count)) return models_name_list - def list_models(self): + def list_models(self) -> list[str]: models_name_list = [] model_count = 1 for model_type in self.models_dict: model_list = self._list_models(model_type, model_count) models_name_list.extend(model_list) + logger.info("") + logger.info("Path to downloaded models: %s", self.output_prefix) return models_name_list - def log_model_details(self, model_type, lang, dataset, model): + def log_model_details(self, model_type: str, lang: str, dataset: str, model: str) -> None: logger.info("Model type: %s", model_type) logger.info("Language supported: %s", lang) logger.info("Dataset used: %s", dataset) @@ -112,7 +138,7 @@ def log_model_details(self, model_type, lang, dataset, model): self.models_dict[model_type][lang][dataset][model]["default_vocoder"], ) - def model_info_by_idx(self, model_query): + def model_info_by_idx(self, model_query: str) -> None: """Print the description of the model from .models.json file using model_query_idx Args: @@ -144,7 +170,7 @@ def model_info_by_idx(self, model_query): model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/") self.log_model_details(model_type, lang, dataset, model) - def model_info_by_full_name(self, model_query_name): + def model_info_by_full_name(self, model_query_name: str) -> None: """Print the description of the model from .models.json file using model_full_name Args: @@ -165,35 +191,35 @@ def 
model_info_by_full_name(self, model_query_name): return self.log_model_details(model_type, lang, dataset, model) - def list_tts_models(self): + def list_tts_models(self) -> list[str]: """Print all `TTS` models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("tts_models") - def list_vocoder_models(self): + def list_vocoder_models(self) -> list[str]: """Print all the `vocoder` models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("vocoder_models") - def list_vc_models(self): + def list_vc_models(self) -> list[str]: """Print all the voice conversion models and return a list of model names Format is `language/dataset/model` """ return self._list_for_model_type("voice_conversion_models") - def list_langs(self): + def list_langs(self) -> None: """Print all the available languages""" logger.info("Name format: type/language") for model_type in self.models_dict: for lang in self.models_dict[model_type]: logger.info(" %s/%s", model_type, lang) - def list_datasets(self): + def list_datasets(self) -> None: """Print all the datasets""" logger.info("Name format: type/language/dataset") for model_type in self.models_dict: @@ -202,7 +228,7 @@ def list_datasets(self): logger.info(" %s/%s/%s", model_type, lang, dataset) @staticmethod - def print_model_license(model_item: Dict): + def print_model_license(model_item: ModelItem) -> None: """Print the license of a model Args: @@ -217,49 +243,49 @@ def print_model_license(model_item: Dict): else: logger.info("Model's license - No license information available") - def _download_github_model(self, model_item: Dict, output_path: str): + def _download_github_model(self, model_item: ModelItem, output_path: Path) -> None: if isinstance(model_item["github_rls_url"], list): self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar) else: self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar) - def _download_hf_model(self, model_item: Dict, output_path: str): + def _download_hf_model(self, model_item: ModelItem, output_path: Path) -> None: if isinstance(model_item["hf_url"], list): self._download_model_files(model_item["hf_url"], output_path, self.progress_bar) else: self._download_zip_file(model_item["hf_url"], output_path, self.progress_bar) - def download_fairseq_model(self, model_name, output_path): + def download_fairseq_model(self, model_name: str, output_path: Path) -> None: URI_PREFIX = "https://dl.fbaipublicfiles.com/mms/tts/" _, lang, _, _ = model_name.split("/") model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz") self._download_tar_file(model_download_uri, output_path, self.progress_bar) @staticmethod - def set_model_url(model_item: Dict): - model_item["model_url"] = None + def set_model_url(model_item: ModelItem) -> ModelItem: + model_item["model_url"] = "" if "github_rls_url" in model_item: model_item["model_url"] = model_item["github_rls_url"] elif "hf_url" in model_item: model_item["model_url"] = model_item["hf_url"] - elif "fairseq" in model_item["model_name"]: + elif "fairseq" in model_item.get("model_name", ""): model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/" - elif "xtts" in model_item["model_name"]: + elif "xtts" in model_item.get("model_name", ""): model_item["model_url"] = "https://huggingface.co/coqui/" return model_item - def _set_model_item(self, model_name): + def _set_model_item(self, model_name: str) -> tuple[ModelItem, str, str, str | 
None]: # fetch model info from the dict if "fairseq" in model_name: model_type, lang, dataset, model = model_name.split("/") - model_item = { + model_item: ModelItem = { + "model_name": model_name, "model_type": "tts_models", "license": "CC BY-NC 4.0", "default_vocoder": None, "author": "fairseq", "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.", } - model_item["model_name"] = model_name elif "xtts" in model_name and len(model_name.split("/")) != 4: # loading xtts models with only model name (e.g. xtts_v2.0.2) # check model name has the version number with regex @@ -273,6 +299,8 @@ def _set_model_item(self, model_name): dataset = "multi-dataset" model = model_name model_item = { + "model_name": model_name, + "model_type": model_type, "default_vocoder": None, "license": "CPML", "contact": "info@coqui.ai", @@ -297,9 +325,9 @@ def _set_model_item(self, model_name): return model_item, model_full_name, model, md5hash @staticmethod - def ask_tos(model_full_path): + def ask_tos(model_full_path: Path) -> bool: """Ask the user to agree to the terms of service""" - tos_path = os.path.join(model_full_path, "tos_agreed.txt") + tos_path = model_full_path / "tos_agreed.txt" print(" > You must confirm the following:") print(' | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"') print(' | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]') @@ -311,7 +339,7 @@ def ask_tos(model_full_path): return False @staticmethod - def tos_agreed(model_item, model_full_path): + def tos_agreed(model_item: ModelItem, model_full_path: Path) -> bool: """Check if the user has agreed to the terms of service""" if "tos_required" in model_item and model_item["tos_required"]: tos_path = os.path.join(model_full_path, "tos_agreed.txt") @@ -320,12 +348,12 @@ def tos_agreed(model_item, model_full_path): return False return True - def create_dir_and_download_model(self, model_name, model_item, output_path): - os.makedirs(output_path, exist_ok=True) + def create_dir_and_download_model(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: + output_path.mkdir(exist_ok=True, parents=True) # handle TOS if not self.tos_agreed(model_item, output_path): if not self.ask_tos(output_path): - os.rmdir(output_path) + output_path.rmdir() raise Exception(" [!] You must agree to the terms of service to use this model.") logger.info("Downloading model to %s", output_path) try: @@ -340,9 +368,12 @@ def create_dir_and_download_model(self, model_name, model_item, output_path): logger.exception("Failed to download the model file to %s", output_path) rmtree(output_path) raise e + checkpoints = list(Path(output_path).glob("*.pt*")) + if len(checkpoints) == 1: + checkpoints[0].rename(checkpoints[0].parent / "model.pth") self.print_model_license(model_item=model_item) - def check_if_configs_are_equal(self, model_name, model_item, output_path): + def check_if_configs_are_equal(self, model_name: str, model_item: ModelItem, output_path: Path) -> None: with fsspec.open(self._find_files(output_path)[1], "r", encoding="utf-8") as f: config_local = json.load(f) remote_url = None @@ -358,7 +389,7 @@ def check_if_configs_are_equal(self, model_name, model_item, output_path): logger.info("%s is already downloaded however it has been changed. 
Redownloading it...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) - def download_model(self, model_name): + def download_model(self, model_name: str) -> tuple[Path, Path | None, ModelItem]: """Download model files given the full model name. Model name is in the format 'type/language/dataset/model' @@ -374,12 +405,12 @@ def download_model(self, model_name): """ model_item, model_full_name, model, md5sum = self._set_model_item(model_name) # set the model specific output path - output_path = os.path.join(self.output_prefix, model_full_name) - if os.path.exists(output_path): + output_path = Path(self.output_prefix) / model_full_name + if output_path.is_dir(): if md5sum is not None: - md5sum_file = os.path.join(output_path, "hash.md5") - if os.path.isfile(md5sum_file): - with open(md5sum_file, mode="r") as f: + md5sum_file = output_path / "hash.md5" + if md5sum_file.is_file(): + with md5sum_file.open() as f: if not f.read() == md5sum: logger.info("%s has been updated, clearing model cache...", model_name) self.create_dir_and_download_model(model_name, model_item, output_path) @@ -404,15 +435,20 @@ def download_model(self, model_name): output_model_path = output_path output_config_path = None if ( - model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name and "xtts" not in model_name + model not in ["tortoise-v2", "bark", "knnvc"] and "fairseq" not in model_name and "xtts" not in model_name ): # TODO:This is stupid but don't care for now. output_model_path, output_config_path = self._find_files(output_path) + else: + output_config_path = output_model_path / "config.json" + if model == "knnvc" and not output_config_path.exists(): + knnvc_config = KNNVCConfig() + knnvc_config.save_json(output_config_path) # update paths in the config.json self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item @staticmethod - def _find_files(output_path: str) -> Tuple[str, str]: + def _find_files(output_path: Path) -> tuple[Path, Path]: """Find the model and config files in the output path Args: @@ -423,11 +459,11 @@ def _find_files(output_path: str) -> Tuple[str, str]: """ model_file = None config_file = None - for file_name in os.listdir(output_path): - if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]: - model_file = os.path.join(output_path, file_name) - elif file_name == "config.json": - config_file = os.path.join(output_path, file_name) + for f in output_path.iterdir(): + if f.name in ["model_file.pth", "model_file.pth.tar", "model.pth", "checkpoint.pth"]: + model_file = f + elif f.name == "config.json": + config_file = f if model_file is None: raise ValueError(" [!] 
Model file not found in the output path") if config_file is None: @@ -435,7 +471,7 @@ def _find_files(output_path: str) -> Tuple[str, str]: return model_file, config_file @staticmethod - def _find_speaker_encoder(output_path: str) -> str: + def _find_speaker_encoder(output_path: Path) -> Path | None: """Find the speaker encoder file in the output path Args: @@ -445,24 +481,24 @@ def _find_speaker_encoder(output_path: str) -> str: str: path to the speaker encoder file """ speaker_encoder_file = None - for file_name in os.listdir(output_path): - if file_name in ["model_se.pth", "model_se.pth.tar"]: - speaker_encoder_file = os.path.join(output_path, file_name) + for f in output_path.iterdir(): + if f.name in ["model_se.pth", "model_se.pth.tar"]: + speaker_encoder_file = f return speaker_encoder_file - def _update_paths(self, output_path: str, config_path: str) -> None: + def _update_paths(self, output_path: Path, config_path: Path) -> None: """Update paths for certain files in config.json after download. Args: output_path (str): local path the model is downloaded to. config_path (str): local config.json path. """ - output_stats_path = os.path.join(output_path, "scale_stats.npy") - output_d_vector_file_path = os.path.join(output_path, "speakers.json") - output_d_vector_file_pth_path = os.path.join(output_path, "speakers.pth") - output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") - output_speaker_ids_file_pth_path = os.path.join(output_path, "speaker_ids.pth") - speaker_encoder_config_path = os.path.join(output_path, "config_se.json") + output_stats_path = output_path / "scale_stats.npy" + output_d_vector_file_path = output_path / "speakers.json" + output_d_vector_file_pth_path = output_path / "speakers.pth" + output_speaker_ids_file_path = output_path / "speaker_ids.json" + output_speaker_ids_file_pth_path = output_path / "speaker_ids.pth" + speaker_encoder_config_path = output_path / "config_se.json" speaker_encoder_model_path = self._find_speaker_encoder(output_path) # update the scale_path.npy file path in the model config.json @@ -487,10 +523,10 @@ def _update_paths(self, output_path: str, config_path: str) -> None: self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) @staticmethod - def _update_path(field_name, new_path, config_path): + def _update_path(field_name: str, new_path: Path | None, config_path: Path) -> None: """Update the path in the model config.json for the current environment after download""" - if new_path and os.path.exists(new_path): - config = load_config(config_path) + if new_path is not None and new_path.is_file(): + config = load_config(str(config_path)) field_names = field_name.split(".") if len(field_names) > 1: # field name points to a sub-level field @@ -515,7 +551,7 @@ def _update_path(field_name, new_path, config_path): config.save_json(config_path) @staticmethod - def _download_zip_file(file_url, output_folder, progress_bar): + def _download_zip_file(file_url: str, output_folder: Path, progress_bar: bool) -> None: """Download the github releases""" # download the file r = requests.get(file_url, stream=True) @@ -525,7 +561,7 @@ def _download_zip_file(file_url, output_folder, progress_bar): block_size = 1024 # 1 Kibibyte if progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) + temp_zip_name = output_folder / file_url.split("/")[-1] with open(temp_zip_name, "wb") as 
file: for data in r.iter_content(block_size): if progress_bar: @@ -533,24 +569,24 @@ def _download_zip_file(file_url, output_folder, progress_bar): file.write(data) with zipfile.ZipFile(temp_zip_name) as z: z.extractall(output_folder) - os.remove(temp_zip_name) # delete zip after extract + temp_zip_name.unlink() # delete zip after extract except zipfile.BadZipFile: logger.exception("Bad zip file - %s", file_url) raise zipfile.BadZipFile # pylint: disable=raise-missing-from # move the files to the outer path for file_path in z.namelist(): - src_path = os.path.join(output_folder, file_path) - if os.path.isfile(src_path): - dst_path = os.path.join(output_folder, os.path.basename(file_path)) + src_path = output_folder / file_path + if src_path.is_file(): + dst_path = output_folder / os.path.basename(file_path) if src_path != dst_path: copyfile(src_path, dst_path) # remove redundant (hidden or not) folders for file_path in z.namelist(): - if os.path.isdir(os.path.join(output_folder, file_path)): - rmtree(os.path.join(output_folder, file_path)) + if (output_folder / file_path).is_dir(): + rmtree(output_folder / file_path) @staticmethod - def _download_tar_file(file_url, output_folder, progress_bar): + def _download_tar_file(file_url: str, output_folder: Path, progress_bar: bool) -> None: """Download the github releases""" # download the file r = requests.get(file_url, stream=True) @@ -560,7 +596,7 @@ def _download_tar_file(file_url, output_folder, progress_bar): block_size = 1024 # 1 Kibibyte if progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1]) + temp_tar_name = output_folder / file_url.split("/")[-1] with open(temp_tar_name, "wb") as file: for data in r.iter_content(block_size): if progress_bar: @@ -569,43 +605,35 @@ def _download_tar_file(file_url, output_folder, progress_bar): with tarfile.open(temp_tar_name) as t: t.extractall(output_folder) tar_names = t.getnames() - os.remove(temp_tar_name) # delete tar after extract + temp_tar_name.unlink() # delete tar after extract except tarfile.ReadError: logger.exception("Bad tar file - %s", file_url) raise tarfile.ReadError # pylint: disable=raise-missing-from # move the files to the outer path - for file_path in os.listdir(os.path.join(output_folder, tar_names[0])): - src_path = os.path.join(output_folder, tar_names[0], file_path) - dst_path = os.path.join(output_folder, os.path.basename(file_path)) + for file_path in (output_folder / tar_names[0]).iterdir(): + src_path = file_path + dst_path = output_folder / file_path.name if src_path != dst_path: copyfile(src_path, dst_path) # remove the extracted folder - rmtree(os.path.join(output_folder, tar_names[0])) + rmtree(output_folder / tar_names[0]) @staticmethod - def _download_model_files(file_urls, output_folder, progress_bar): + def _download_model_files(file_urls: list[str], output_folder: str | os.PathLike[Any], progress_bar: bool) -> None: """Download the github releases""" + output_folder = Path(output_folder) for file_url in file_urls: # download the file r = requests.get(file_url, stream=True) # extract the file - bease_filename = file_url.split("/")[-1] - temp_zip_name = os.path.join(output_folder, bease_filename) + base_filename = file_url.split("/")[-1] + file_path = output_folder / base_filename total_size_in_bytes = int(r.headers.get("content-length", 0)) block_size = 1024 # 1 Kibibyte - with open(temp_zip_name, "wb") as file: + with open(file_path, "wb") as f: if 
progress_bar: ModelManager.tqdm_progress = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) for data in r.iter_content(block_size): if progress_bar: ModelManager.tqdm_progress.update(len(data)) - file.write(data) - - @staticmethod - def _check_dict_key(my_dict, key): - if key in my_dict.keys() and my_dict[key] is not None: - if not isinstance(key, str): - return True - if isinstance(key, str) and len(my_dict[key]) > 0: - return True - return False + f.write(data) diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index cbd14990f3..b893d115c9 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -9,16 +9,16 @@ class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): if lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) + raise ValueError(f"Invalid learning rate: {lr}") if eps < 0.0: - raise ValueError("Invalid epsilon value: {}".format(eps)) + raise ValueError(f"Invalid epsilon value: {eps}") if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + if isinstance(params, list | tuple) and len(params) > 0 and isinstance(params[0], dict): for param in params: if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): param["buffer"] = [[None, None, None] for _ in range(10)] diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py index b08a763a33..d24733977a 100644 --- a/TTS/utils/samplers.py +++ b/TTS/utils/samplers.py @@ -1,6 +1,6 @@ import math import random -from typing import Callable, List, Union +from collections.abc import Callable from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler @@ -49,9 +49,9 @@ def __init__( label_key="class_name", ): super().__init__(dataset_items) - assert ( - batch_size % (num_classes_in_batch * num_gpus) == 0 - ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." + assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( + "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." 
+ ) label_indices = {} for idx, item in enumerate(dataset_items): @@ -176,7 +176,7 @@ def __init__( data, batch_size, drop_last, - sort_key: Union[Callable, List] = identity, + sort_key: Callable | list = identity, bucket_size_multiplier=100, ): super().__init__(sampler, batch_size, drop_last) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 90af4f48f9..cebb094a48 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -1,7 +1,8 @@ import logging import os import time -from typing import List +from pathlib import Path +from typing import Any import numpy as np import pysbd @@ -15,7 +16,10 @@ from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import save_wav +from TTS.utils.generic_utils import optional_to_str +from TTS.vc.configs.openvoice_config import OpenVoiceConfig from TTS.vc.models import setup_model as setup_vc_model +from TTS.vc.models.openvoice import OpenVoice from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input @@ -25,18 +29,19 @@ class Synthesizer(nn.Module): def __init__( self, - tts_checkpoint: str = "", - tts_config_path: str = "", - tts_speakers_file: str = "", - tts_languages_file: str = "", - vocoder_checkpoint: str = "", - vocoder_config: str = "", - encoder_checkpoint: str = "", - encoder_config: str = "", - vc_checkpoint: str = "", - vc_config: str = "", - model_dir: str = "", - voice_dir: str = None, + *, + tts_checkpoint: str | os.PathLike[Any] | None = None, + tts_config_path: str | os.PathLike[Any] | None = None, + tts_speakers_file: str | os.PathLike[Any] | None = None, + tts_languages_file: str | os.PathLike[Any] | None = None, + vocoder_checkpoint: str | os.PathLike[Any] | None = None, + vocoder_config: str | os.PathLike[Any] | None = None, + encoder_checkpoint: str | os.PathLike[Any] | None = None, + encoder_config: str | os.PathLike[Any] | None = None, + vc_checkpoint: str | os.PathLike[Any] | None = None, + vc_config: str | os.PathLike[Any] | None = None, + model_dir: str | os.PathLike[Any] | None = None, + voice_dir: str | os.PathLike[Any] | None = None, use_cuda: bool = False, ) -> None: """General 🐸 TTS interface for inference. It takes a tts and a vocoder @@ -62,16 +67,17 @@ def __init__( use_cuda (bool, optional): enable/disable cuda. Defaults to False. 
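A minimal usage sketch of the reworked constructor above, assuming locally available checkpoint files (every path below is a placeholder, not a real release artifact): arguments are now keyword-only, accept str, pathlib.Path, or None, and are normalized to strings internally via optional_to_str.

from pathlib import Path

from TTS.utils.synthesizer import Synthesizer

model_dir = Path("~/models/my_tts_model").expanduser()  # hypothetical local model folder
synth = Synthesizer(
    tts_checkpoint=model_dir / "model.pth",     # Path objects are accepted and converted to str internally
    tts_config_path=model_dir / "config.json",
    vocoder_checkpoint=None,                    # unused components can simply be left as None
    use_cuda=False,
)
wav = synth.tts("Hello world.")
synth.save_wav(wav, "output.wav")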
""" super().__init__() - self.tts_checkpoint = tts_checkpoint - self.tts_config_path = tts_config_path - self.tts_speakers_file = tts_speakers_file - self.tts_languages_file = tts_languages_file - self.vocoder_checkpoint = vocoder_checkpoint - self.vocoder_config = vocoder_config - self.encoder_checkpoint = encoder_checkpoint - self.encoder_config = encoder_config - self.vc_checkpoint = vc_checkpoint - self.vc_config = vc_config + self.tts_checkpoint = optional_to_str(tts_checkpoint) + self.tts_config_path = optional_to_str(tts_config_path) + self.tts_speakers_file = optional_to_str(tts_speakers_file) + self.tts_languages_file = optional_to_str(tts_languages_file) + self.vocoder_checkpoint = optional_to_str(vocoder_checkpoint) + self.vocoder_config = optional_to_str(vocoder_config) + self.encoder_checkpoint = optional_to_str(encoder_checkpoint) + self.encoder_config = optional_to_str(encoder_config) + self.vc_checkpoint = optional_to_str(vc_checkpoint) + self.vc_config = optional_to_str(vc_config) + model_dir = optional_to_str(model_dir) self.use_cuda = use_cuda self.tts_model = None @@ -90,24 +96,21 @@ def __init__( assert torch.cuda.is_available(), "CUDA is not availabe on this machine." if tts_checkpoint: - self._load_tts(tts_checkpoint, tts_config_path, use_cuda) - self.output_sample_rate = self.tts_config.audio["sample_rate"] + self._load_tts(self.tts_checkpoint, self.tts_config_path, use_cuda) - if vocoder_checkpoint: - self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) - self.output_sample_rate = self.vocoder_config.audio["sample_rate"] + if vc_checkpoint and model_dir == "": + self._load_vc(self.vc_checkpoint, self.vc_config, use_cuda) - if vc_checkpoint: - self._load_vc(vc_checkpoint, vc_config, use_cuda) - self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + if vocoder_checkpoint: + self._load_vocoder(self.vocoder_checkpoint, self.vocoder_config, use_cuda) if model_dir: if "fairseq" in model_dir: self._load_fairseq_from_dir(model_dir, use_cuda) - self.output_sample_rate = self.tts_config.audio["sample_rate"] + elif "openvoice" in model_dir: + self._load_openvoice_from_dir(Path(model_dir), use_cuda) else: self._load_tts_from_dir(model_dir, use_cuda) - self.output_sample_rate = self.tts_config.audio["output_sample_rate"] @staticmethod def _get_segmenter(lang: str): @@ -136,6 +139,9 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N """ # pylint: disable=global-statement self.vc_config = load_config(vc_config_path) + self.output_sample_rate = self.vc_config.audio.get( + "output_sample_rate", self.vc_config.audio.get("sample_rate", None) + ) self.vc_model = setup_vc_model(config=self.vc_config) self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint) if use_cuda: @@ -150,9 +156,24 @@ def _load_fairseq_from_dir(self, model_dir: str, use_cuda: bool) -> None: self.tts_model = Vits.init_from_config(self.tts_config) self.tts_model.load_fairseq_checkpoint(self.tts_config, checkpoint_dir=model_dir, eval=True) self.tts_config = self.tts_model.config + self.output_sample_rate = self.tts_config.audio["sample_rate"] if use_cuda: self.tts_model.cuda() + def _load_openvoice_from_dir(self, checkpoint: Path, use_cuda: bool) -> None: + """Load the OpenVoice model from a directory. + + We assume the model knows how to load itself from the directory and + there is a config.json file in the directory. 
+ """ + self.vc_config = OpenVoiceConfig() + self.vc_model = OpenVoice.init_from_config(self.vc_config) + self.vc_model.load_checkpoint(self.vc_config, checkpoint, eval=True) + self.vc_config = self.vc_model.config + self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + if use_cuda: + self.vc_model.cuda() + def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None: """Load the TTS model from a directory. @@ -160,6 +181,7 @@ def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None: """ config = load_config(os.path.join(model_dir, "config.json")) self.tts_config = config + self.output_sample_rate = self.tts_config.audio["output_sample_rate"] self.tts_model = setup_tts_model(config) self.tts_model.load_checkpoint(config, checkpoint_dir=model_dir, eval=True) if use_cuda: @@ -181,6 +203,7 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) - """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) + self.output_sample_rate = self.tts_config.audio["sample_rate"] if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: raise ValueError("Phonemizer is not defined in the TTS config.") @@ -218,13 +241,14 @@ def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> N use_cuda (bool): enable/disable CUDA use. """ self.vocoder_config = load_config(model_config) + self.output_sample_rate = self.vocoder_config.audio["sample_rate"] self.vocoder_ap = AudioProcessor(**self.vocoder_config.audio) self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() - def split_into_sentences(self, text) -> List[str]: + def split_into_sentences(self, text) -> list[str]: """Split give text into sentences. Args: @@ -235,7 +259,7 @@ def split_into_sentences(self, text) -> List[str]: """ return self.seg.segment(text) - def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: + def save_wav(self, wav: list[int], path: str, pipe_out=None) -> None: """Save the waveform as a file. Args: @@ -250,9 +274,21 @@ def save_wav(self, wav: List[int], path: str, pipe_out=None) -> None: wav = np.array(wav) save_wav(wav=wav, path=path, sample_rate=self.output_sample_rate, pipe_out=pipe_out) - def voice_conversion(self, source_wav: str, target_wav: str) -> List[int]: - output_wav = self.vc_model.voice_conversion(source_wav, target_wav) - return output_wav + def voice_conversion(self, source_wav: str, target_wav: str | list[str], **kwargs) -> list[int]: + start_time = time.time() + + if not isinstance(target_wav, list): + target_wav = [target_wav] + output = self.vc_model.voice_conversion(source_wav, target_wav, **kwargs) + if self.vocoder_model is not None: + output = self.vocoder_model.inference(output) + + output = output.squeeze() + process_time = time.time() - start_time + audio_time = len(output) / self.output_sample_rate + logger.info("Processing time: %.3f", process_time) + logger.info("Real-time factor: %.3f", process_time / audio_time) + return output def tts( self, @@ -266,7 +302,7 @@ def tts( reference_speaker_name=None, split_sentences: bool = True, **kwargs, - ) -> List[int]: + ) -> list[int]: """🐸 TTS magic. Run all the models and generate speech. 
Args: diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index 207181b303..37f8048b7f 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List, Optional from coqpit import Coqpit @@ -47,7 +46,7 @@ class FreeVCAudioConfig(Coqpit): win_length: int = field(default=1280) n_mel_channels: int = field(default=80) mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) + mel_fmax: float | None = field(default=None) @dataclass @@ -122,11 +121,11 @@ class FreeVCArgs(Coqpit): kernel_size: int = field(default=3) p_dropout: float = field(default=0.1) resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [10, 8, 2, 2]) upsample_initial_channel: int = field(default=512) - upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) n_layers_q: int = field(default=3) use_spectral_norm: bool = field(default=False) gin_channels: int = field(default=256) @@ -229,7 +228,7 @@ class FreeVCConfig(BaseVCConfig): If true, language embedding is used. Defaults to `False`. Note: - Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. + Check :class:`TTS.tts.configs.shared_configs.BaseVCConfig` for the inherited parameters. Example: @@ -269,7 +268,7 @@ class FreeVCConfig(BaseVCConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: List[str] = None + d_vector_file: list[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/vc/configs/knnvc_config.py b/TTS/vc/configs/knnvc_config.py new file mode 100644 index 0000000000..7728ea0a9b --- /dev/null +++ b/TTS/vc/configs/knnvc_config.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass, field + +from coqpit import Coqpit + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class KNNVCAudioConfig(BaseAudioConfig): + """Audio configuration. + + Args: + sample_rate (int): + The sampling rate of the input waveform. + """ + + sample_rate: int = field(default=16000) + + +@dataclass +class KNNVCArgs(Coqpit): + """Model arguments. + + Args: + ssl_dim (int): + The dimension of the self-supervised learning embedding. + """ + + ssl_dim: int = field(default=1024) + + +@dataclass +class KNNVCConfig(BaseVCConfig): + """Parameters. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (KNNVCArgs): + Model architecture arguments. Defaults to `KNNVCArgs()`. + + audio (KNNVCAudioConfig): + Audio processing configuration. Defaults to `KNNVCAudioConfig()`. + + wavlm_layer (int): + WavLM layer to use for feature extraction. 
+ + topk (int): + k in the kNN -- the number of nearest neighbors to average over + """ + + model: str = "knnvc" + model_args: KNNVCArgs = field(default_factory=KNNVCArgs) + audio: KNNVCAudioConfig = field(default_factory=KNNVCAudioConfig) + + wavlm_layer: int = 6 + topk: int = 4 diff --git a/TTS/vc/configs/openvoice_config.py b/TTS/vc/configs/openvoice_config.py new file mode 100644 index 0000000000..167a61ddb3 --- /dev/null +++ b/TTS/vc/configs/openvoice_config.py @@ -0,0 +1,200 @@ +from dataclasses import dataclass, field + +from coqpit import Coqpit + +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class OpenVoiceAudioConfig(Coqpit): + """Audio configuration + + Args: + input_sample_rate (int): + The sampling rate of the input waveform. + + output_sample_rate (int): + The sampling rate of the output waveform. + + fft_size (int): + The length of the filter. + + hop_length (int): + The hop length. + + win_length (int): + The window length. + """ + + input_sample_rate: int = field(default=22050) + output_sample_rate: int = field(default=22050) + fft_size: int = field(default=1024) + hop_length: int = field(default=256) + win_length: int = field(default=1024) + + +@dataclass +class OpenVoiceArgs(Coqpit): + """OpenVoice model arguments. + + zero_g (bool): + Whether to zero the gradients. + + inter_channels (int): + The number of channels in the intermediate layers. + + hidden_channels (int): + The number of channels in the hidden layers. + + filter_channels (int): + The number of channels in the filter layers. + + n_heads (int): + The number of attention heads. + + n_layers (int): + The number of layers. + + kernel_size (int): + The size of the kernel. + + p_dropout (float): + The dropout probability. + + resblock (str): + The type of residual block. + + resblock_kernel_sizes (List[int]): + The kernel sizes for the residual blocks. + + resblock_dilation_sizes (List[List[int]]): + The dilation sizes for the residual blocks. + + upsample_rates (List[int]): + The upsample rates. + + upsample_initial_channel (int): + The number of channels in the initial upsample layer. + + upsample_kernel_sizes (List[int]): + The kernel sizes for the upsample layers. + + n_layers_q (int): + The number of layers in the quantization network. + + use_spectral_norm (bool): + Whether to use spectral normalization. + + gin_channels (int): + The number of channels in the global conditioning vector. + + tau (float): + Tau parameter for the posterior encoder + """ + + zero_g: bool = field(default=True) + inter_channels: int = field(default=192) + hidden_channels: int = field(default=192) + filter_channels: int = field(default=768) + n_heads: int = field(default=2) + n_layers: int = field(default=6) + kernel_size: int = field(default=3) + p_dropout: float = field(default=0.1) + resblock: str = field(default="1") + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) + upsample_initial_channel: int = field(default=512) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + n_layers_q: int = field(default=3) + use_spectral_norm: bool = field(default=False) + gin_channels: int = field(default=256) + tau: float = field(default=0.3) + + +@dataclass +class OpenVoiceConfig(BaseVCConfig): + """Defines parameters for OpenVoice VC model. 
+ + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (OpenVoiceArgs): + Model architecture arguments. Defaults to `OpenVoiceArgs()`. + + audio (OpenVoiceAudioConfig): + Audio processing configuration. Defaults to `OpenVoiceAudioConfig()`. + + return_wav (bool): + If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. + + compute_linear_spec (bool): + If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. + + use_weighted_sampler (bool): + If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. + + weighted_sampler_attrs (dict): + Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities + by overweighting `root_path` by 2.0. Defaults to `{}`. + + weighted_sampler_multipliers (dict): + Weight each unique value of a key returned by the formatter for weighted sampling. + For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`. + It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. + + r (int): + Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. + + add_blank (bool): + If true, a blank token is added in between every character. Defaults to `True`. + + Note: + Check :class:`TTS.tts.configs.shared_configs.BaseVCConfig` for the inherited parameters. + + Example: + + >>> from TTS.vc.configs.openvoice_config import OpenVoiceConfig + >>> config = OpenVoiceConfig() + """ + + model: str = "openvoice" + # model specific params + model_args: OpenVoiceArgs = field(default_factory=OpenVoiceArgs) + audio: OpenVoiceAudioConfig = field(default_factory=OpenVoiceAudioConfig) + + # optimizer + # TODO with training support + + # loss params + # TODO with training support + + # data loader params + return_wav: bool = True + compute_linear_spec: bool = True + + # sampler params + use_weighted_sampler: bool = False # TODO: move it to the base config + weighted_sampler_attrs: dict = field(default_factory=lambda: {}) + weighted_sampler_multipliers: dict = field(default_factory=lambda: {}) + + # overrides + r: int = 1 # DO NOT CHANGE + add_blank: bool = True + + # multi-speaker settings + # use speaker embedding layer + num_speakers: int = 0 + speakers_file: str | None = None + speaker_embedding_channels: int = 256 + + # use d-vectors + use_d_vector_file: bool = False + d_vector_file: list[str] | None = None + d_vector_dim: int | None = None + + def __post_init__(self) -> None: + for key, val in self.model_args.items(): + if hasattr(self, key): + self[key] = val diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index b2fe63d29d..b84a97e487 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -1,12 +1,11 @@ from dataclasses import dataclass, field -from typing import List from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @dataclass class BaseVCConfig(BaseTrainingConfig): - """Shared parameters among all the tts models. + """Shared parameters among all the VC models. 
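A small sketch of the two new VC configs introduced above, treated as plain Coqpit dataclasses: defaults can be overridden at construction and serialized with save_json, which is also how ModelManager materializes a default KNNVCConfig when a downloaded kNN-VC model ships without a config.json (the output path below is a placeholder).

from TTS.vc.configs.knnvc_config import KNNVCConfig
from TTS.vc.configs.openvoice_config import OpenVoiceConfig

knn_cfg = KNNVCConfig(topk=8)                # average over 8 nearest WavLM frames instead of the default 4
knn_cfg.save_json("/tmp/knnvc_config.json")  # Coqpit configs serialize to JSON

ov_cfg = OpenVoiceConfig()
assert ov_cfg.audio.output_sample_rate == 22050  # matches the OpenVoiceAudioConfig defaults above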
Args: @@ -132,7 +131,7 @@ class BaseVCConfig(BaseTrainingConfig): shuffle: bool = False drop_last: bool = False # dataset - datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + datasets: list[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer optimizer: str = "radam" optimizer_params: dict = None @@ -140,7 +139,7 @@ class BaseVCConfig(BaseTrainingConfig): lr_scheduler: str = None lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda: []) + test_sentences: list[str] = field(default_factory=lambda: []) # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 diff --git a/TTS/vc/modules/__init__.py b/TTS/vc/layers/__init__.py similarity index 100% rename from TTS/vc/modules/__init__.py rename to TTS/vc/layers/__init__.py diff --git a/TTS/vc/modules/freevc/__init__.py b/TTS/vc/layers/freevc/__init__.py similarity index 100% rename from TTS/vc/modules/freevc/__init__.py rename to TTS/vc/layers/freevc/__init__.py diff --git a/TTS/vc/modules/freevc/commons.py b/TTS/vc/layers/freevc/commons.py similarity index 81% rename from TTS/vc/modules/freevc/commons.py rename to TTS/vc/layers/freevc/commons.py index feea7f34dc..49889e4816 100644 --- a/TTS/vc/modules/freevc/commons.py +++ b/TTS/vc/layers/freevc/commons.py @@ -3,7 +3,7 @@ import torch from torch.nn import functional as F -from TTS.tts.utils.helpers import convert_pad_shape, sequence_mask +from TTS.tts.utils.helpers import convert_pad_shape def init_weights(m: torch.nn.Module, mean: float = 0.0, std: float = 0.01) -> None: @@ -96,37 +96,11 @@ def subsequent_mask(length): return mask -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - def shift_1d(x): x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] return x -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2, 3) * mask - return path - - def clip_grad_value_(parameters, clip_value, norm_type=2): if isinstance(parameters, torch.Tensor): parameters = [parameters] diff --git a/TTS/vc/layers/freevc/mel_processing.py b/TTS/vc/layers/freevc/mel_processing.py new file mode 100644 index 0000000000..017d900284 --- /dev/null +++ b/TTS/vc/layers/freevc/mel_processing.py @@ -0,0 +1,58 @@ +import logging + +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn + +from TTS.utils.audio.torch_transforms import amp_to_db + +logger = logging.getLogger(__name__) + +MAX_WAV_VALUE = 32768.0 + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.0: + logger.info("Min value is: %.3f", torch.min(y)) + if torch.max(y) > 1.0: + logger.info("Max value is: %.3f", torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + 
fmax_dtype_device = str(fmax) + "_" + dtype_device + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = amp_to_db(spec) + + return spec diff --git a/TTS/vc/modules/freevc/modules.py b/TTS/vc/layers/freevc/modules.py similarity index 97% rename from TTS/vc/modules/freevc/modules.py rename to TTS/vc/layers/freevc/modules.py index 722444a303..92df39b5e0 100644 --- a/TTS/vc/modules/freevc/modules.py +++ b/TTS/vc/layers/freevc/modules.py @@ -5,9 +5,9 @@ from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations -import TTS.vc.modules.freevc.commons as commons from TTS.tts.layers.generic.normalization import LayerNorm2 -from TTS.vc.modules.freevc.commons import init_weights +from TTS.tts.layers.generic.wavenet import fused_add_tanh_sigmoid_multiply +from TTS.vc.layers.freevc.commons import init_weights from TTS.vocoder.models.hifigan_generator import get_padding LRELU_SLOPE = 0.1 @@ -48,7 +48,7 @@ def forward(self, x, x_mask): class WN(torch.nn.Module): def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() + super().__init__() assert kernel_size % 2 == 1 self.hidden_channels = hidden_channels self.kernel_size = (kernel_size,) @@ -99,7 +99,7 @@ def forward(self, x, x_mask, g=None, **kwargs): else: g_l = torch.zeros_like(x_in) - acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) acts = self.drop(acts) res_skip_acts = self.res_skip_layers[i](acts) @@ -122,7 +122,7 @@ def remove_weight_norm(self): class ResBlock1(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() + super().__init__() self.convs1 = nn.ModuleList( [ weight_norm( @@ -198,7 +198,7 @@ def remove_weight_norm(self): class ResBlock2(torch.nn.Module): def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() + super().__init__() self.convs = nn.ModuleList( [ weight_norm( diff --git a/TTS/vc/modules/freevc/speaker_encoder/__init__.py b/TTS/vc/layers/freevc/speaker_encoder/__init__.py similarity index 100% rename from TTS/vc/modules/freevc/speaker_encoder/__init__.py rename to TTS/vc/layers/freevc/speaker_encoder/__init__.py diff --git a/TTS/vc/modules/freevc/speaker_encoder/audio.py b/TTS/vc/layers/freevc/speaker_encoder/audio.py similarity index 92% rename from TTS/vc/modules/freevc/speaker_encoder/audio.py rename to TTS/vc/layers/freevc/speaker_encoder/audio.py index 5b23a4dbb6..5d14bf2f19 100644 --- 
a/TTS/vc/modules/freevc/speaker_encoder/audio.py +++ b/TTS/vc/layers/freevc/speaker_encoder/audio.py @@ -1,11 +1,10 @@ from pathlib import Path -from typing import Optional, Union # import webrtcvad import librosa import numpy as np -from TTS.vc.modules.freevc.speaker_encoder.hparams import ( +from TTS.vc.layers.freevc.speaker_encoder.hparams import ( audio_norm_target_dBFS, mel_n_channels, mel_window_length, @@ -16,7 +15,7 @@ int16_max = (2**15) - 1 -def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None): +def preprocess_wav(fpath_or_wav: str | Path | np.ndarray, source_sr: int | None = None): """ Applies the preprocessing operations used in training the Speaker Encoder to a waveform either on disk or in memory. The waveform will be resampled to match the data hyperparameters. diff --git a/TTS/vc/modules/freevc/speaker_encoder/hparams.py b/TTS/vc/layers/freevc/speaker_encoder/hparams.py similarity index 100% rename from TTS/vc/modules/freevc/speaker_encoder/hparams.py rename to TTS/vc/layers/freevc/speaker_encoder/hparams.py diff --git a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py similarity index 81% rename from TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py rename to TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py index 294bf322cb..d2f4ffe394 100644 --- a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py @@ -1,14 +1,13 @@ import logging from time import perf_counter as timer -from typing import List, Union import numpy as np import torch from torch import nn from trainer.io import load_fsspec -from TTS.vc.modules.freevc.speaker_encoder import audio -from TTS.vc.modules.freevc.speaker_encoder.hparams import ( +from TTS.vc.layers.freevc.speaker_encoder import audio +from TTS.vc.layers.freevc.speaker_encoder.hparams import ( mel_n_channels, mel_window_step, model_embedding_size, @@ -22,12 +21,8 @@ class SpeakerEncoder(nn.Module): - def __init__(self, weights_fpath, device: Union[str, torch.device] = None): - """ - :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). - If None, defaults to cuda if it is available on your machine, otherwise the model will - run on cpu. Outputs are always returned on the cpu, as numpy arrays. 
- """ + def __init__(self, weights_fpath): + """FreeVC speaker encoder.""" super().__init__() # Define the network @@ -35,13 +30,6 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() - # Get the target device - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - elif isinstance(device, str): - device = torch.device(device) - self.device = device - # Load the pretrained model'speaker weights # weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt") # if not weights_fpath.exists(): @@ -52,8 +40,11 @@ def __init__(self, weights_fpath, device: Union[str, torch.device] = None): checkpoint = load_fsspec(weights_fpath, map_location="cpu") self.load_state_dict(checkpoint["model_state"], strict=False) - self.to(device) - logger.info("Loaded the voice encoder model on %s in %.2f seconds.", device.type, timer() - start) + logger.info("Loaded the voice encoder model in %.2f seconds.", timer() - start) + + @property + def device(self): + return next(self.parameters()).device def forward(self, mels: torch.FloatTensor): """ @@ -97,7 +88,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): assert 0 < min_coverage <= 1 # Compute how many frames separate two partial utterances - samples_per_frame = int((sampling_rate * mel_window_step / 1000)) + samples_per_frame = int(sampling_rate * mel_window_step / 1000) n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) frame_step = int(np.round((sampling_rate / rate) / samples_per_frame)) assert 0 < frame_step, "The rate is too high" @@ -123,7 +114,7 @@ def compute_partial_slices(n_samples: int, rate, min_coverage): return wav_slices, mel_slices - def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75): + def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75) -> torch.Tensor: """ Computes an embedding for a single utterance. The utterance is divided in partial utterances and an embedding is computed for each. The complete utterance embedding is the @@ -143,8 +134,8 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ then the last partial utterance will be considered by zero-padding the audio. Otherwise, it will be discarded. If there aren't enough frames for one partial utterance, this parameter is ignored so that the function always returns at least one slice. - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If - is True, the partial utterances as a numpy array of float32 of shape + :return: the embedding as a float tensor of shape (model_embedding_size,). If + is True, the partial utterances as a float tensor of shape (n_partials, model_embedding_size) and the wav partials as a list of slices will also be returned. 
""" @@ -160,24 +151,26 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_ mels = np.array([mel[s] for s in mel_slices]) with torch.no_grad(): mels = torch.from_numpy(mels).to(self.device) - partial_embeds = self(mels).cpu().numpy() + partial_embeds = self(mels) # Compute the utterance embedding from the partial embeddings - raw_embed = np.mean(partial_embeds, axis=0) - embed = raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = partial_embeds.mean(dim=0) + embed = raw_embed / torch.norm(raw_embed, p=2) if return_partials: return embed, partial_embeds, wav_slices return embed - def embed_speaker(self, wavs: List[np.ndarray], **kwargs): + def embed_speaker(self, wavs: list[np.ndarray], **kwargs): """ Compute the embedding of a collection of wavs (presumably from the same speaker) by averaging their embedding and L2-normalizing it. :param wavs: list of wavs a numpy arrays of float32. :param kwargs: extra arguments to embed_utterance() - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). + :return: the embedding as a float tensor of shape (model_embedding_size,). """ - raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs], axis=0) - return raw_embed / np.linalg.norm(raw_embed, 2) + raw_embed = torch.mean( + torch.stack([self.embed_utterance(wav, return_partials=False, **kwargs) for wav in wavs]), dim=0 + ) + return raw_embed / torch.norm(raw_embed, p=2) diff --git a/TTS/vc/modules/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py similarity index 91% rename from TTS/vc/modules/freevc/wavlm/__init__.py rename to TTS/vc/layers/freevc/wavlm/__init__.py index 4046e137f5..d9c3858f89 100644 --- a/TTS/vc/modules/freevc/wavlm/__init__.py +++ b/TTS/vc/layers/freevc/wavlm/__init__.py @@ -6,14 +6,14 @@ from trainer.io import get_user_data_dir from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig +from TTS.vc.layers.freevc.wavlm.wavlm import WavLM, WavLMConfig logger = logging.getLogger(__name__) model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" -def get_wavlm(device="cpu"): +def get_wavlm(device="cpu") -> WavLM: """Download the model and return the model object.""" output_path = get_user_data_dir("tts") diff --git a/TTS/vc/modules/freevc/wavlm/config.json b/TTS/vc/layers/freevc/wavlm/config.json similarity index 100% rename from TTS/vc/modules/freevc/wavlm/config.json rename to TTS/vc/layers/freevc/wavlm/config.json diff --git a/TTS/vc/modules/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py similarity index 96% rename from TTS/vc/modules/freevc/wavlm/modules.py rename to TTS/vc/layers/freevc/wavlm/modules.py index 37c1a6e877..cf31a866de 100644 --- a/TTS/vc/modules/freevc/wavlm/modules.py +++ b/TTS/vc/layers/freevc/wavlm/modules.py @@ -9,7 +9,6 @@ import math import warnings -from typing import Dict, Optional, Tuple import torch import torch.nn.functional as F @@ -89,7 +88,7 @@ class Swish(nn.Module): def __init__(self): """Construct an MultiHeadedAttention object.""" - super(Swish, self).__init__() + super().__init__() self.act = torch.nn.Sigmoid() def forward(self, x): @@ -98,7 +97,7 @@ def forward(self, x): class GLU_Linear(nn.Module): def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True): - super(GLU_Linear, self).__init__() + super().__init__() self.glu_type = glu_type self.output_dim = output_dim @@ -158,7 +157,7 @@ 
def get_activation_fn(activation: str): elif activation == "glu": return lambda x: x else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) + raise RuntimeError(f"--activation-fn {activation} not supported") def init_bert_params(module): @@ -219,7 +218,7 @@ def quant_noise(module, p, block_size): return module # supported modules - assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) + assert isinstance(module, nn.Linear | nn.Embedding | nn.Conv2d) # test whether module.weight has the right sizes wrt block_size is_conv = module.weight.ndim == 4 @@ -331,7 +330,7 @@ def __init__( self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" + "Self-attention requires query, key and value to be of the same size" ) k_bias = True @@ -424,17 +423,17 @@ def compute_bias(self, query_length, key_length): def forward( self, query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + key: Tensor | None, + value: Tensor | None, + key_padding_mask: Tensor | None = None, + incremental_state: dict[str, dict[str, Tensor | None]] | None = None, need_weights: bool = True, static_kv: bool = False, - attn_mask: Optional[Tensor] = None, + attn_mask: Tensor | None = None, before_softmax: bool = False, need_head_weights: bool = False, - position_bias: Optional[Tensor] = None, - ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + position_bias: Tensor | None = None, + ) -> tuple[Tensor, Tensor | None, Tensor | None]: """Input shape: Time x Batch x Channel Args: @@ -605,7 +604,7 @@ def forward( else: assert v is not None v = torch.cat([prev_value, v], dim=1) - prev_key_padding_mask: Optional[Tensor] = None + prev_key_padding_mask: Tensor | None = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None @@ -700,7 +699,7 @@ def forward( assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) - attn_weights: Optional[Tensor] = None + attn_weights: Tensor | None = None if need_weights: attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0) if not need_head_weights: @@ -711,12 +710,12 @@ def forward( @staticmethod def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], + key_padding_mask: Tensor | None, + prev_key_padding_mask: Tensor | None, batch_size: int, src_len: int, static_kv: bool, - ) -> Optional[Tensor]: + ) -> Tensor | None: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask @@ -748,19 +747,19 @@ def _append_prev_key_padding_mask( return new_key_padding_mask def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] - ) -> Dict[str, Optional[Tensor]]: + self, incremental_state: dict[str, dict[str, Tensor | None]] | None + ) -> dict[str, Tensor | None]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: return result else: - empty_result: Dict[str, Optional[Tensor]] = {} + empty_result: dict[str, Tensor | None] = {} return empty_result def 
_set_input_buffer( self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], + incremental_state: dict[str, dict[str, Tensor | None]], + buffer: dict[str, Tensor | None], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py similarity index 96% rename from TTS/vc/modules/freevc/wavlm/wavlm.py rename to TTS/vc/layers/freevc/wavlm/wavlm.py index 10dd09ed0c..6358662e18 100644 --- a/TTS/vc/modules/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -9,7 +9,7 @@ import logging import math -from typing import List, Optional, Tuple +from typing import Any import numpy as np import torch @@ -17,7 +17,7 @@ import torch.nn.functional as F from torch.nn import LayerNorm -from TTS.vc.modules.freevc.wavlm.modules import ( +from TTS.vc.layers.freevc.wavlm.modules import ( Fp32GroupNorm, Fp32LayerNorm, GLU_Linear, @@ -33,8 +33,8 @@ def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: Optional[torch.Tensor], + shape: tuple[int, int], + padding_mask: torch.Tensor | None, mask_prob: float, mask_length: int, mask_type: str = "static", @@ -68,8 +68,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() + mask_prob * all_sz / float(mask_length) + np.random.rand() ) all_num_mask = max(min_masks, all_num_mask) @@ -80,8 +79,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() + mask_prob * sz / float(mask_length) + np.random.rand() ) num_mask = max(min_masks, num_mask) else: @@ -155,9 +153,7 @@ def arrange(s, e, length, keep_length): class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = ( - "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) - ) + self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) self.encoder_layers: int = 12 # num encoder layers in the transformer self.encoder_embed_dim: int = 768 # encoder embedding dimension @@ -166,9 +162,7 @@ def __init__(self, cfg=None): self.activation_fn: str = "gelu" # activation function to use self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = ( - "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] - ) + self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] 
self.conv_bias: bool = False # include bias in conv encoder self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this @@ -225,7 +219,7 @@ def __init__( cfg: WavLMConfig, ) -> None: super().__init__() - logger.info(f"WavLM Config: {cfg.__dict__}") + logger.info("WavLM Config: %s", cfg.__dict__) self.cfg = cfg feature_enc_layers = eval(cfg.conv_feature_layers) @@ -317,12 +311,12 @@ def forward_padding_mask( def extract_features( self, source: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, + padding_mask: torch.Tensor | None = None, mask: bool = False, ret_conv: bool = False, - output_layer: Optional[int] = None, + output_layer: int | None = None, ret_layer_results: bool = False, - ): + ) -> tuple[torch.Tensor, dict[str, Any]]: if self.feature_grad_mult > 0: features = self.feature_extractor(source) if self.feature_grad_mult != 1.0: @@ -367,7 +361,7 @@ def extract_features( class ConvFeatureExtractionModel(nn.Module): def __init__( self, - conv_layers: List[Tuple[int, int, int]], + conv_layers: list[tuple[int, int, int]], dropout: float = 0.0, mode: str = "default", conv_bias: bool = False, diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index a498b292b7..859eaeb2a7 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -1,20 +1,21 @@ import importlib import logging import re -from typing import Dict, List, Union - -logger = logging.getLogger(__name__) +from TTS.vc.configs.shared_configs import BaseVCConfig +from TTS.vc.models.base_vc import BaseVC -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) +logger = logging.getLogger(__name__) -def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": +def setup_model(config: BaseVCConfig) -> BaseVC: logger.info("Using model: %s", config.model) # fetch the right model implementation. - if "model" in config and config["model"].lower() == "freevc": + if config["model"].lower() == "freevc": MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC - model = MyModel.init_from_config(config, samples) - return model + elif config["model"].lower() == "knnvc": + MyModel = importlib.import_module("TTS.vc.models.knnvc").KNNVC + else: + msg = f"Model {config.model} does not exist!" + raise ValueError(msg) + return MyModel.init_from_config(config) diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 22ffd0095c..a953b901e8 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -1,7 +1,7 @@ import logging import os import random -from typing import Any, Optional, Union +from typing import Any import torch import torch.distributed as dist @@ -37,9 +37,9 @@ class BaseVC(BaseTrainerModel): def __init__( self, config: Coqpit, - ap: AudioProcessor, - speaker_manager: Optional[SpeakerManager] = None, - language_manager: Optional[LanguageManager] = None, + ap: AudioProcessor | None = None, + speaker_manager: SpeakerManager | None = None, + language_manager: LanguageManager | None = None, ) -> None: super().__init__() self.config = config @@ -51,7 +51,7 @@ def __init__( def _set_model_args(self, config: Coqpit) -> None: """Setup model args based on the config type (`ModelConfig` or `ModelArgs`). - `ModelArgs` has all the fields reuqired to initialize the model architecture. + `ModelArgs` has all the fields required to initialize the model architecture. `ModelConfig` has all the fields required for training, inference and containes `ModelArgs`. 
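As a side note on the rewritten `setup_model` in `TTS/vc/models/__init__.py` above, a minimal dispatch sketch (config defaults only, no checkpoint loaded, so purely illustrative):

    from TTS.vc.configs.knnvc_config import KNNVCConfig
    from TTS.vc.models import setup_model

    model = setup_model(KNNVCConfig())  # config.model == "knnvc" -> KNNVC.init_from_config(config)
    # any other config.model value now raises ValueError("Model ... does not exist!")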
@@ -69,7 +69,7 @@ def _set_model_args(self, config: Coqpit) -> None: else: raise ValueError("config must be either a *Config or *Args") - def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining `in_channels` size of the connected layers. @@ -106,7 +106,7 @@ def get_aux_input(self, **kwargs: Any) -> dict[str, Any]: """Prepare and return `aux_input` used by `forward()`""" return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} - def get_aux_input_from_test_sentences(self, sentence_info: Union[str, list[str]]) -> dict[str, Any]: + def get_aux_input_from_test_sentences(self, sentence_info: str | list[str]) -> dict[str, Any]: if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -199,9 +199,9 @@ def format_batch(self, batch: dict[str, Any]) -> dict[str, Any]: extra_frames = dur.sum() - mel_lengths[idx] largest_idxs = torch.argsort(-dur)[:extra_frames] dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + assert dur.sum() == mel_lengths[idx], ( + f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + ) durations[idx, : text_lengths[idx]] = dur # set stop targets wrt reduction factor @@ -275,10 +275,10 @@ def get_data_loader( config: Coqpit, assets: dict, is_eval: bool, - samples: Union[list[dict], list[list]], + samples: list[dict] | list[list], verbose: bool, num_gpus: int, - rank: Optional[int] = None, + rank: int | None = None, ) -> "DataLoader": if is_eval and not config.run_eval: loader = None @@ -402,13 +402,11 @@ def test_run(self, assets: dict) -> tuple[dict, dict]: use_griffin_lim=True, do_trim_silence=False, ) - test_audios["{}-audio".format(idx)] = outputs_dict["wav"] - test_figures["{}-prediction".format(idx)] = plot_spectrogram( + test_audios[f"{idx}-audio"] = outputs_dict["wav"] + test_figures[f"{idx}-prediction"] = plot_spectrogram( outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False ) - test_figures["{}-alignment".format(idx)] = plot_alignment( - outputs_dict["outputs"]["alignments"], output_fig=False - ) + test_figures[f"{idx}-alignment"] = plot_alignment(outputs_dict["outputs"]["alignments"], output_fig=False) return test_figures, test_audios def on_init_start(self, trainer: Trainer) -> None: diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index e5cfdc1e61..59af40a836 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -1,29 +1,27 @@ import logging -from typing import Dict, List, Optional, Tuple, Union import librosa import numpy as np import torch from coqpit import Coqpit from torch import nn -from torch.nn import Conv1d, Conv2d, ConvTranspose1d +from torch.nn import Conv1d, ConvTranspose1d from torch.nn import functional as F -from torch.nn.utils import spectral_norm from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations from trainer.io import load_fsspec -import TTS.vc.modules.freevc.commons as commons -import TTS.vc.modules.freevc.modules as modules +import TTS.vc.layers.freevc.modules as modules +from TTS.tts.layers.vits.discriminator import DiscriminatorS from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers import SpeakerManager from 
TTS.vc.configs.freevc_config import FreeVCConfig +from TTS.vc.layers.freevc.commons import init_weights, rand_slice_segments +from TTS.vc.layers.freevc.mel_processing import mel_spectrogram_torch +from TTS.vc.layers.freevc.speaker_encoder.speaker_encoder import SpeakerEncoder as SpeakerEncoderEx +from TTS.vc.layers.freevc.wavlm import get_wavlm from TTS.vc.models.base_vc import BaseVC -from TTS.vc.modules.freevc.commons import init_weights -from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch -from TTS.vc.modules.freevc.speaker_encoder.speaker_encoder import SpeakerEncoder as SpeakerEncoderEx -from TTS.vc.modules.freevc.wavlm import get_wavlm -from TTS.vocoder.models.hifigan_generator import get_padding +from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP logger = logging.getLogger(__name__) @@ -103,7 +101,7 @@ def __init__( upsample_kernel_sizes, gin_channels=0, ): - super(Generator, self).__init__() + super().__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) @@ -164,78 +162,9 @@ def remove_weight_norm(self): remove_parametrizations(l, "weight") -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), - ] - ) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ] - ) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): - 
super(MultiPeriodDiscriminator, self).__init__() + super().__init__() periods = [2, 3, 5, 7, 11] discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] @@ -260,7 +189,7 @@ def forward(self, y, y_hat): class SpeakerEncoder(torch.nn.Module): def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256): - super(SpeakerEncoder, self).__init__() + super().__init__() self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True) self.linear = nn.Linear(model_hidden_size, model_embedding_size) self.relu = nn.ReLU() @@ -303,7 +232,7 @@ def embed_utterance(self, mel, partial_frames=128, partial_hop=64): class FreeVC(BaseVC): """ - Papaer:: + Paper:: https://arxiv.org/abs/2210.15418# Paper Abstract:: @@ -376,15 +305,11 @@ def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): self.wavlm = get_wavlm() - @property - def device(self): - return next(self.parameters()).device - def load_pretrained_speaker_encoder(self): """Load pretrained speaker encoder model as mentioned in the paper.""" logger.info("Loading pretrained speaker encoder model ...") self.enc_spk_ex = SpeakerEncoderEx( - "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt", device=self.device + "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/speaker_encoder.pt" ) def init_multispeaker(self, config: Coqpit): @@ -405,15 +330,15 @@ def forward( self, c: torch.Tensor, spec: torch.Tensor, - g: Optional[torch.Tensor] = None, - mel: Optional[torch.Tensor] = None, - c_lengths: Optional[torch.Tensor] = None, - spec_lengths: Optional[torch.Tensor] = None, - ) -> Tuple[ + g: torch.Tensor | None = None, + mel: torch.Tensor | None = None, + c_lengths: torch.Tensor | None = None, + spec_lengths: torch.Tensor | None = None, + ) -> tuple[ torch.Tensor, torch.Tensor, torch.Tensor, - Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], ]: """ Forward pass of the model. @@ -454,13 +379,13 @@ def forward( z_p = self.flow(z, spec_mask, g=g) # Randomly slice z and compute o using dec - z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size) + z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - @torch.no_grad() - def inference(self, c, g=None, mel=None, c_lengths=None): + @torch.inference_mode() + def inference(self, c, g=None, c_lengths=None): """ Inference pass of the model @@ -475,9 +400,6 @@ def inference(self, c, g=None, mel=None, c_lengths=None): """ if c_lengths is None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) - if not self.use_spk: - g = self.enc_spk.embed_utterance(mel) - g = g.unsqueeze(-1) z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths) z = self.flow(z_p, c_mask, g=g, reverse=True) o = self.dec(z * c_mask, g=g) @@ -508,51 +430,52 @@ def load_audio(self, wav): return wav.float() @torch.inference_mode() - def voice_conversion(self, src, tgt): + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]): """ Voice conversion pass of the model. Args: src (str or torch.Tensor): Source utterance. - tgt (str or torch.Tensor): Target utterance. + tgt (list of str or torch.Tensor): Target utterances. Returns: torch.Tensor: Output tensor. 
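An illustrative call for the updated `voice_conversion` signature (audio paths are placeholders, and a real run additionally needs trained weights via `load_checkpoint`); speaker embeddings of all target utterances are averaged before decoding:

    from TTS.vc.configs.freevc_config import FreeVCConfig
    from TTS.vc.models.freevc import FreeVC

    model = FreeVC.init_from_config(FreeVCConfig())  # untrained; shown for the call convention only
    wav_out = model.voice_conversion("source.wav", ["target_a.wav", "target_b.wav"])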
""" - wav_tgt = self.load_audio(tgt).cpu().numpy() - wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) - - if self.config.model_args.use_spk: - g_tgt = self.enc_spk_ex.embed_utterance(wav_tgt) - g_tgt = torch.from_numpy(g_tgt)[None, :, None].to(self.device) - else: - wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) - mel_tgt = mel_spectrogram_torch( - wav_tgt, - self.config.audio.filter_length, - self.config.audio.n_mel_channels, - self.config.audio.input_sample_rate, - self.config.audio.hop_length, - self.config.audio.win_length, - self.config.audio.mel_fmin, - self.config.audio.mel_fmax, - ) # src wav_src = self.load_audio(src) c = self.extract_wavlm_features(wav_src[None, :]) - if self.config.model_args.use_spk: - audio = self.inference(c, g=g_tgt) - else: - audio = self.inference(c, mel=mel_tgt.transpose(1, 2)) - audio = audio[0][0].data.cpu().float().numpy() - return audio + # tgt + g_tgts = [] + for tg in tgt: + wav_tgt = self.load_audio(tg).cpu().numpy() + wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20) + + if self.config.model_args.use_spk: + g_tgts.append(self.enc_spk_ex.embed_utterance(wav_tgt)[None, :, None]) + else: + wav_tgt = torch.from_numpy(wav_tgt).unsqueeze(0).to(self.device) + mel_tgt = mel_spectrogram_torch( + wav_tgt, + self.config.audio.filter_length, + self.config.audio.n_mel_channels, + self.config.audio.input_sample_rate, + self.config.audio.hop_length, + self.config.audio.win_length, + self.config.audio.mel_fmin, + self.config.audio.mel_fmax, + ) + g_tgts.append(self.enc_spk.embed_utterance(mel_tgt.transpose(1, 2)).unsqueeze(-1)) + + g_tgt = torch.stack(g_tgts).mean(dim=0) + audio = self.inference(c, g=g_tgt) + return audio[0][0].data.cpu().float().numpy() def eval_step(): ... @staticmethod - def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None): + def init_from_config(config: FreeVCConfig) -> "FreeVC": model = FreeVC(config) return model diff --git a/TTS/vc/models/knnvc.py b/TTS/vc/models/knnvc.py new file mode 100644 index 0000000000..c31f52e749 --- /dev/null +++ b/TTS/vc/models/knnvc.py @@ -0,0 +1,181 @@ +import logging +import os +from typing import Any, TypeAlias + +import torch +import torch.nn.functional as F +import torchaudio +from coqpit import Coqpit + +from TTS.vc.configs.knnvc_config import KNNVCConfig +from TTS.vc.layers.freevc.wavlm import get_wavlm +from TTS.vc.models.base_vc import BaseVC + +logger = logging.getLogger(__name__) + +PathOrTensor: TypeAlias = str | os.PathLike[Any] | torch.Tensor + + +class KNNVC(BaseVC): + """ + Paper:: + https://arxiv.org/abs/2305.18975 + + Paper Abstract:: + Any-to-any voice conversion aims to transform source speech + into a target voice with just a few examples of the target speaker as a + reference. Recent methods produce convincing conversions, but at the cost of + increased complexity -- making results difficult to reproduce and build on. + Instead, we keep it simple. We propose k-nearest neighbors voice conversion + (kNN-VC): a straightforward yet effective method for any-to-any conversion. + First, we extract self-supervised representations of the source and reference + speech. To convert to the target speaker, we replace each frame of the source + representation with its nearest neighbor in the reference. Finally, a pretrained + vocoder synthesizes audio from the converted representation. Objective and + subjective evaluations show that kNN-VC improves speaker similarity with similar + intelligibility scores to existing methods. 
+ + Samples:: + https://bshall.github.io/knn-vc + + Original code:: + https://github.com/bshall/knn-vc + + Examples: + >>> from TTS.vc.configs.knnvc_config import KNNVCConfig + >>> from TTS.vc.models.knnvc import KNNVC + >>> config = KNNVCConfig() + >>> model = KNNVC(config) + """ + + def __init__(self, config: Coqpit): + super().__init__(config) + self.ssl_dim = self.args.ssl_dim + self.wavlm = get_wavlm() + + @staticmethod + def init_from_config(config: KNNVCConfig) -> "KNNVC": + return KNNVC(config) + + @torch.inference_mode() + def get_features(self, audio: PathOrTensor, vad_trigger_level=0) -> torch.Tensor: + """Return features for the given waveform with output shape (seq_len, dim). + + Optionally perform VAD trimming on start/end with `vad_trigger_level`. + """ + # load audio + if isinstance(audio, torch.Tensor): + x: torch.Tensor = audio + sr = self.config.audio.sample_rate + if x.dim() == 1: + x = x[None] + else: + x, sr = torchaudio.load(audio, normalize=True) + + if not sr == self.config.audio.sample_rate: + logger.info("Resampling %d to %d in %s", sr, self.config.audio.sample_rate, audio) + x = torchaudio.functional.resample(x, orig_freq=sr, new_freq=self.config.audio.sample_rate) + sr = self.config.audio.sample_rate + + # trim silence from front and back + if vad_trigger_level > 1e-3: + transform = torchaudio.transforms.Vad(sample_rate=sr, trigger_level=vad_trigger_level) + x_front_trim = transform(x) + waveform_reversed = torch.flip(x_front_trim, (-1,)) + waveform_reversed_front_trim = transform(waveform_reversed) + x = torch.flip(waveform_reversed_front_trim, (-1,)) + + # extract the representation of each layer + wav_input_16khz = x.to(self.device) + features = self.wavlm.extract_features( + wav_input_16khz, output_layer=self.config.wavlm_layer, ret_layer_results=False + )[0] + return features.squeeze(0) + + def get_matching_set(self, wavs: list[PathOrTensor], vad_trigger_level=7) -> torch.Tensor: + """Get concatenated wavlm features for the matching set using all waveforms in `wavs`. + + Wavs are specified as either a list of paths or list of loaded waveform tensors of + shape (channels, T), assumed to be of 16kHz sample rate. + """ + feats = [] + for p in wavs: + feats.append(self.get_features(p, vad_trigger_level=vad_trigger_level)) + + feats = torch.concat(feats, dim=0).cpu() + return feats + + @staticmethod + def fast_cosine_dist(source_feats: torch.Tensor, matching_pool: torch.Tensor) -> torch.Tensor: + """Like torch.cdist, but fixed dim=-1 and for cosine distance.""" + source_norms = torch.norm(source_feats, p=2, dim=-1) + matching_norms = torch.norm(matching_pool, p=2, dim=-1) + dotprod = ( + -(torch.cdist(source_feats[None], matching_pool[None], p=2)[0] ** 2) + + source_norms[:, None] ** 2 + + matching_norms[None] ** 2 + ) + dotprod /= 2 + + dists = 1 - (dotprod / (source_norms[:, None] * matching_norms[None])) + return dists + + @torch.inference_mode() + def match( + self, + query_seq: torch.Tensor, + matching_set: torch.Tensor, + synth_set: torch.Tensor | None = None, + topk: int | None = None, + target_duration: float | None = None, + ) -> torch.Tensor: + """Given `query_seq`, `matching_set`, and `synth_set` tensors of shape (N, dim), perform kNN regression matching + with k=`topk`. + + Args: + `query_seq`: Tensor (N1, dim) of the input/source query features. + `matching_set`: Tensor (N2, dim) of the matching set used as the 'training set' for the kNN algorithm. + `synth_set`: optional Tensor (N2, dim) corresponding to the matching set. 
We use the matching set to assign + each query vector to a vector in the matching set, and then use the corresponding vector from + the synth set during HiFiGAN synthesis. + By default, and for best performance, this should be identical to the matching set. + `topk`: k in the kNN -- the number of nearest neighbors to average over. + `target_duration`: if set to a float, interpolate waveform duration to be equal to this value in seconds. + + Returns: + - converted features (1, N, dim) + """ + if topk is None: + topk = self.config.topk + synth_set = matching_set.to(self.device) if synth_set is None else synth_set.to(self.device) + matching_set = matching_set.to(self.device) + query_seq = query_seq.to(self.device) + + if target_duration is not None: + target_samples = int(target_duration * self.config.audio.sample_rate) + scale_factor = (target_samples / self.hop_length) / query_seq.shape[0] # n_targ_feats / n_input_feats + query_seq = F.interpolate(query_seq.T[None], scale_factor=scale_factor, mode="linear")[0].T + + dists = self.fast_cosine_dist(query_seq, matching_set) + best = dists.topk(k=topk, largest=False, dim=-1) + out_feats = synth_set[best.indices].mean(dim=1) + return out_feats.unsqueeze(0) + + def load_checkpoint(self, vc_config: KNNVCConfig, _vc_checkpoint: str | os.PathLike[Any]) -> None: + """kNN-VC does not use checkpoints.""" + + def forward(self) -> None: ... + def inference(self) -> None: ... + + @torch.inference_mode() + def voice_conversion( + self, + source: PathOrTensor, + target: list[PathOrTensor], + topk: int | None = None, + ) -> torch.Tensor: + if not isinstance(target, list): + target = [target] + source_features = self.get_features(source) + matching_set = self.get_matching_set(target) + return self.match(source_features, matching_set, topk=topk) diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py new file mode 100644 index 0000000000..1049a580c7 --- /dev/null +++ b/TTS/vc/models/openvoice.py @@ -0,0 +1,320 @@ +import json +import logging +import os +from collections.abc import Mapping +from pathlib import Path +from typing import Any + +import librosa +import numpy as np +import numpy.typing as npt +import torch +from coqpit import Coqpit +from torch import nn +from torch.nn import functional as F +from trainer.io import load_fsspec + +from TTS.tts.layers.vits.networks import PosteriorEncoder +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio.torch_transforms import wav_to_spec +from TTS.vc.configs.openvoice_config import OpenVoiceConfig +from TTS.vc.models.base_vc import BaseVC +from TTS.vc.models.freevc import Generator, ResidualCouplingBlock + +logger = logging.getLogger(__name__) + + +class ReferenceEncoder(nn.Module): + """NN module creating a fixed size prosody embedding from a spectrogram. 
+ + inputs: mel spectrograms [batch_size, num_spec_frames, num_mel] + outputs: [batch_size, embedding_dim] + """ + + def __init__(self, spec_channels: int, embedding_dim: int = 0, layernorm: bool = True) -> None: + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + torch.nn.utils.parametrizations.weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, embedding_dim) + self.layernorm = nn.LayerNorm(self.spec_channels) if layernorm else None + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + N = inputs.size(0) + + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + if self.layernorm is not None: + out = self.layernorm(out) + + for conv in self.convs: + out = conv(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + _memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)) + + def calculate_channels(self, L: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int: + for _ in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class OpenVoice(BaseVC): + """ + OpenVoice voice conversion model (inference only). + + Source: https://github.com/myshell-ai/OpenVoice + Paper: https://arxiv.org/abs/2312.01479 + + Paper abstract: + We introduce OpenVoice, a versatile voice cloning approach that requires + only a short audio clip from the reference speaker to replicate their voice and + generate speech in multiple languages. OpenVoice represents a significant + advancement in addressing the following open challenges in the field: 1) + Flexible Voice Style Control. OpenVoice enables granular control over voice + styles, including emotion, accent, rhythm, pauses, and intonation, in addition + to replicating the tone color of the reference speaker. The voice styles are not + directly copied from and constrained by the style of the reference speaker. + Previous approaches lacked the ability to flexibly manipulate voice styles after + cloning. 2) Zero-Shot Cross-Lingual Voice Cloning. OpenVoice achieves zero-shot + cross-lingual voice cloning for languages not included in the massive-speaker + training set. Unlike previous approaches, which typically require extensive + massive-speaker multi-lingual (MSML) dataset for all languages, OpenVoice can + clone voices into a new language without any massive-speaker training data for + that language. OpenVoice is also computationally efficient, costing tens of + times less than commercially available APIs that offer even inferior + performance. To foster further research in the field, we have made the source + code and trained model publicly accessible. We also provide qualitative results + in our demo website. 
Prior to its public release, our internal version of + OpenVoice was used tens of millions of times by users worldwide between May and + October 2023, serving as the backend of MyShell. + """ + + def __init__(self, config: Coqpit, speaker_manager: SpeakerManager | None = None) -> None: + super().__init__(config, None, speaker_manager, None) + + self.init_multispeaker(config) + + self.zero_g = self.args.zero_g + self.inter_channels = self.args.inter_channels + self.hidden_channels = self.args.hidden_channels + self.filter_channels = self.args.filter_channels + self.n_heads = self.args.n_heads + self.n_layers = self.args.n_layers + self.kernel_size = self.args.kernel_size + self.p_dropout = self.args.p_dropout + self.resblock = self.args.resblock + self.resblock_kernel_sizes = self.args.resblock_kernel_sizes + self.resblock_dilation_sizes = self.args.resblock_dilation_sizes + self.upsample_rates = self.args.upsample_rates + self.upsample_initial_channel = self.args.upsample_initial_channel + self.upsample_kernel_sizes = self.args.upsample_kernel_sizes + self.n_layers_q = self.args.n_layers_q + self.use_spectral_norm = self.args.use_spectral_norm + self.gin_channels = self.args.gin_channels + self.tau = self.args.tau + + self.spec_channels = config.audio.fft_size // 2 + 1 + + self.dec = Generator( + self.inter_channels, + self.resblock, + self.resblock_kernel_sizes, + self.resblock_dilation_sizes, + self.upsample_rates, + self.upsample_initial_channel, + self.upsample_kernel_sizes, + gin_channels=self.gin_channels, + ) + self.enc_q = PosteriorEncoder( + self.spec_channels, + self.inter_channels, + self.hidden_channels, + kernel_size=5, + dilation_rate=1, + num_layers=16, + cond_channels=self.gin_channels, + ) + + self.flow = ResidualCouplingBlock( + self.inter_channels, + self.hidden_channels, + kernel_size=5, + dilation_rate=1, + n_layers=4, + gin_channels=self.gin_channels, + ) + + self.ref_enc = ReferenceEncoder(self.spec_channels, self.gin_channels) + + @staticmethod + def init_from_config(config: OpenVoiceConfig) -> "OpenVoice": + return OpenVoice(config) + + def init_multispeaker(self, config: Coqpit, data: list[Any] | None = None) -> None: + """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + You must provide a `speaker_manager` at initialization to set up the multi-speaker modules. + + Args: + config (Coqpit): Model configuration. + data (list, optional): Dataset items to infer number of speakers. Defaults to None. 
+ """ + self.num_spks = config.num_speakers + if self.speaker_manager: + self.num_spks = self.speaker_manager.num_speakers + + def load_checkpoint( + self, + config: OpenVoiceConfig, + checkpoint_path: str | os.PathLike[Any], + eval: bool = False, + strict: bool = True, + cache: bool = False, + ) -> None: + """Map from OpenVoice's config structure.""" + config_path = Path(checkpoint_path).parent / "config.json" + with open(config_path, encoding="utf-8") as f: + config_org = json.load(f) + self.config.audio.input_sample_rate = config_org["data"]["sampling_rate"] + self.config.audio.output_sample_rate = config_org["data"]["sampling_rate"] + self.config.audio.fft_size = config_org["data"]["filter_length"] + self.config.audio.hop_length = config_org["data"]["hop_length"] + self.config.audio.win_length = config_org["data"]["win_length"] + state = load_fsspec(str(checkpoint_path), map_location=torch.device("cpu"), cache=cache) + self.load_state_dict(state["model"], strict=strict) + if eval: + self.eval() + + def forward(self) -> None: ... + def train_step(self) -> None: ... + def eval_step(self) -> None: ... + + @staticmethod + def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, torch.Tensor | None]) -> torch.Tensor: + if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: + return aux_input["x_lengths"] + return torch.tensor(x.shape[-1:]).to(x.device) + + @torch.inference_mode() + def inference( + self, + x: torch.Tensor, + aux_input: Mapping[str, torch.Tensor | None] = {"x_lengths": None, "g_src": None, "g_tgt": None}, + ) -> dict[str, torch.Tensor]: + """ + Inference pass of the model + + Args: + x (torch.Tensor): Input tensor. Shape: (batch_size, c_seq_len). + x_lengths (torch.Tensor): Lengths of the input tensor. Shape: (batch_size,). + g_src (torch.Tensor): Source speaker embedding tensor. Shape: (batch_size, spk_emb_dim). + g_tgt (torch.Tensor): Target speaker embedding tensor. Shape: (batch_size, spk_emb_dim). + + Returns: + o_hat: Output spectrogram tensor. Shape: (batch_size, spec_seq_len, spec_dim). + x_mask: Spectrogram mask. Shape: (batch_size, spec_seq_len). + (z, z_p, z_hat): A tuple of latent variables. 
+ """ + x_lengths = self._set_x_lengths(x, aux_input) + if "g_src" in aux_input and aux_input["g_src"] is not None: + g_src = aux_input["g_src"] + else: + raise ValueError("aux_input must define g_src") + if "g_tgt" in aux_input and aux_input["g_tgt"] is not None: + g_tgt = aux_input["g_tgt"] + else: + raise ValueError("aux_input must define g_tgt") + z, _m_q, _logs_q, y_mask = self.enc_q( + x, x_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=self.tau + ) + z_p = self.flow(z, y_mask, g=g_src) + z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) + o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt)) + return { + "model_outputs": o_hat, + "y_mask": y_mask, + "z": z, + "z_p": z_p, + "z_hat": z_hat, + } + + def load_audio(self, wav: str | npt.NDArray[np.float32] | torch.Tensor | list[float]) -> torch.Tensor: + """Read and format the input audio.""" + if isinstance(wav, str): + out = torch.from_numpy(librosa.load(wav, sr=self.config.audio.input_sample_rate)[0]) + elif isinstance(wav, np.ndarray): + out = torch.from_numpy(wav) + elif isinstance(wav, list): + out = torch.from_numpy(np.array(wav)) + else: + out = wav + return out.to(self.device).float() + + def extract_se(self, audio: str | torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + y = self.load_audio(audio) + y = y.to(self.device) + y = y.unsqueeze(0) + spec = wav_to_spec( + y, + n_fft=self.config.audio.fft_size, + hop_length=self.config.audio.hop_length, + win_length=self.config.audio.win_length, + center=False, + ).to(self.device) + with torch.no_grad(): + g = self.ref_enc(spec.transpose(1, 2)).unsqueeze(-1) + + return g, spec + + @torch.inference_mode() + def voice_conversion(self, src: str | torch.Tensor, tgt: list[str | torch.Tensor]) -> npt.NDArray[np.float32]: + """ + Voice conversion pass of the model. + + Args: + src (str or torch.Tensor): Source utterance. + tgt (list of str or torch.Tensor): Target utterance. + + Returns: + Output numpy array. 
+ """ + src_se, src_spec = self.extract_se(src) + tgt_ses = [] + for tg in tgt: + tgt_se, _ = self.extract_se(tg) + tgt_ses.append(tgt_se) + tgt_se = torch.stack(tgt_ses).mean(dim=0) + + aux_input = {"g_src": src_se, "g_tgt": tgt_se} + audio = self.inference(src_spec, aux_input) + return audio["model_outputs"][0, 0].data.cpu().float().numpy() diff --git a/TTS/vc/modules/freevc/mel_processing.py b/TTS/vc/modules/freevc/mel_processing.py deleted file mode 100644 index a3e251891a..0000000000 --- a/TTS/vc/modules/freevc/mel_processing.py +++ /dev/null @@ -1,133 +0,0 @@ -import logging - -import torch -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn - -logger = logging.getLogger(__name__) - -MAX_WAV_VALUE = 32768.0 - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def spectral_normalize_torch(magnitudes): - output = dynamic_range_compression_torch(magnitudes) - return output - - -def spectral_de_normalize_torch(magnitudes): - output = dynamic_range_decompression_torch(magnitudes) - return output - - -mel_basis = {} -hann_window = {} - - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - if torch.min(y) < -1.0: - logger.info("Min value is: %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("Max value is: %.3f", torch.max(y)) - - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - return spec - - -def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): - if torch.min(y) < -1.0: - logger.info("Min value is: %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("Max value is: %.3f", torch.max(y)) - - global mel_basis, hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, 
device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - - return spec diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py index 9a102f0c89..60dde496b2 100644 --- a/TTS/vocoder/configs/hifigan_config.py +++ b/TTS/vocoder/configs/hifigan_config.py @@ -5,7 +5,7 @@ @dataclass class HifiganConfig(BaseGANVocoderConfig): - """Defines parameters for FullBand MelGAN vocoder. + """Defines parameters for HifiGAN vocoder. Example: diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py index 763113537f..2139f47b0e 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.py +++ b/TTS/vocoder/configs/multiband_melgan_config.py @@ -121,7 +121,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig): pad_short: int = 2000 use_noise_augment: bool = False use_cache: bool = True - steps_to_start_discriminator: bool = 200000 + steps_to_start_discriminator: int = 200000 # LOSS PARAMETERS - overrides use_stft_loss: bool = True diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py index a558cfcabb..548505a54d 100644 --- a/TTS/vocoder/configs/shared_configs.py +++ b/TTS/vocoder/configs/shared_configs.py @@ -168,7 +168,7 @@ class BaseGANVocoderConfig(BaseVocoderConfig): target_loss: str = "loss_0" # loss value to pick the best model to save after each epoch # optimizer - grad_clip: float = field(default_factory=lambda: [5, 5]) + grad_clip: float | list[float] = field(default_factory=lambda: [5, 5]) lr_gen: float = 0.0002 # Initial learning rate. lr_disc: float = 0.0002 # Initial learning rate. lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html @@ -178,5 +178,5 @@ class BaseGANVocoderConfig(BaseVocoderConfig): scheduler_after_epoch: bool = True use_pqmf: bool = False # enable/disable using pqmf for multi-band training. (Multi-band MelGAN) - steps_to_start_discriminator = 0 # start training the discriminator after this number of steps. + steps_to_start_discriminator: int = 0 # start training the discriminator after this number of steps. diff_samples_for_G_and_D: bool = False # use different samples for G and D training steps. 
diff --git a/TTS/vocoder/configs/univnet_config.py b/TTS/vocoder/configs/univnet_config.py index 67f324cfce..85662831ee 100644 --- a/TTS/vocoder/configs/univnet_config.py +++ b/TTS/vocoder/configs/univnet_config.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @@ -96,7 +95,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # model specific params discriminator_model: str = "univnet_discriminator" generator_model: str = "univnet_generator" - generator_model_params: Dict = field( + generator_model_params: dict = field( default_factory=lambda: { "in_channels": 64, "out_channels": 1, @@ -121,7 +120,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # loss weights - overrides stft_loss_weight: float = 2.5 - stft_loss_params: Dict = field( + stft_loss_params: dict = field( default_factory=lambda: { "n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], @@ -133,7 +132,7 @@ class UnivnetConfig(BaseGANVocoderConfig): hinge_G_loss_weight: float = 0 feat_match_loss_weight: float = 0 l1_spec_loss_weight: float = 0 - l1_spec_loss_params: Dict = field( + l1_spec_loss_params: dict = field( default_factory=lambda: { "use_mel": True, "sample_rate": 22050, @@ -153,7 +152,7 @@ class UnivnetConfig(BaseGANVocoderConfig): # lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html # lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) - optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) steps_to_start_discriminator: int = 200000 def __post_init__(self): diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index 04462817a8..cef6a50b05 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -1,5 +1,3 @@ -from typing import List - from coqpit import Coqpit from torch.utils.data import Dataset @@ -10,7 +8,7 @@ from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List) -> Dataset: +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: list) -> Dataset: if config.model.lower() in "gan": dataset = GANDataset( ap=ap, diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py index 0806c0d496..076545f8a2 100644 --- a/TTS/vocoder/datasets/gan_dataset.py +++ b/TTS/vocoder/datasets/gan_dataset.py @@ -32,7 +32,7 @@ def __init__( super().__init__() self.ap = ap self.item_list = items - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.seq_len = seq_len self.hop_len = hop_len self.pad_short = pad_short @@ -128,9 +128,9 @@ def load_item(self, idx): # correct the audio length wrt padding applied in stft audio = np.pad(audio, (0, self.hop_len), mode="edge") audio = audio[: mel.shape[-1] * self.hop_len] - assert ( - mel.shape[-1] * self.hop_len == audio.shape[-1] - ), f" [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + assert mel.shape[-1] * self.hop_len == audio.shape[-1], ( + f" [!] 
{mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}" + ) audio = torch.from_numpy(audio).float().unsqueeze(0) mel = torch.from_numpy(mel).float().squeeze(0) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 6f34bccb7c..435330bebe 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -2,7 +2,6 @@ import os import random from multiprocessing import Manager -from typing import List, Tuple import numpy as np import torch @@ -65,7 +64,7 @@ def __getitem__(self, idx): item = self.load_item(idx) return item - def load_test_samples(self, num_samples: int) -> List[Tuple]: + def load_test_samples(self, num_samples: int) -> list[tuple]: """Return test samples. Args: @@ -103,9 +102,9 @@ def load_item(self, idx): audio = np.pad( audio, (0, self.seq_len + self.pad_short - len(audio)), mode="constant", constant_values=0.0 ) - assert ( - audio.shape[-1] >= self.seq_len + self.pad_short - ), f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + assert audio.shape[-1] >= self.seq_len + self.pad_short, ( + f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + ) # correct the audio length wrt hop length p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1] diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 4c4f5c48df..ffb71177c5 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -18,7 +18,7 @@ class WaveRNNDataset(Dataset): def __init__(self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, return_segments=True): super().__init__() self.ap = ap - self.compute_feat = not isinstance(items[0], (tuple, list)) + self.compute_feat = not isinstance(items[0], tuple | list) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 8d4dd725ef..81a1f30884 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,5 +1,3 @@ -from typing import Dict, Union - import torch from torch import nn from torch.nn import functional as F @@ -226,9 +224,9 @@ class GeneratorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_stft_loss = C.use_stft_loss if "use_stft_loss" in C else False self.use_subband_stft_loss = C.use_subband_stft_loss if "use_subband_stft_loss" in C else False @@ -313,9 +311,9 @@ class DiscriminatorLoss(nn.Module): def __init__(self, C): super().__init__() - assert not ( - C.use_mse_gan_loss and C.use_hinge_gan_loss - ), " [!] Cannot use HingeGANLoss and MSEGANLoss together." + assert not (C.use_mse_gan_loss and C.use_hinge_gan_loss), ( + " [!] Cannot use HingeGANLoss and MSEGANLoss together." + ) self.use_mse_gan_loss = C.use_mse_gan_loss self.use_hinge_gan_loss = C.use_hinge_gan_loss @@ -352,7 +350,7 @@ def forward(self, scores_fake, scores_real): class WaveRNNLoss(nn.Module): - def __init__(self, wave_rnn_mode: Union[str, int]): + def __init__(self, wave_rnn_mode: str | int): super().__init__() if wave_rnn_mode == "mold": self.loss_func = discretized_mix_logistic_loss @@ -363,6 +361,6 @@ def __init__(self, wave_rnn_mode: Union[str, int]): else: raise ValueError(" [!] 
Unknown mode for Wavernn.") - def forward(self, y_hat, y) -> Dict: + def forward(self, y_hat, y) -> dict: loss = self.loss_func(y_hat, y) return {"loss": loss} diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py index 8913a1132e..ab1a56e7fc 100644 --- a/TTS/vocoder/layers/lvc_block.py +++ b/TTS/vocoder/layers/lvc_block.py @@ -175,9 +175,9 @@ def location_variable_convolution(x, kernel, bias, dilation, hop_size): batch, _, in_length = x.shape batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == ( - kernel_length * hop_size - ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + assert in_length == (kernel_length * hop_size), ( + f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + ) padding = dilation * int((kernel_size - 1) / 2) x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 9f1512c6d4..187e7062e2 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -74,7 +74,7 @@ def shif_and_scale(x, scale, shift): class UBlock(nn.Module): def __init__(self, input_size, hidden_size, factor, dilation): super().__init__() - assert isinstance(dilation, (list, tuple)) + assert isinstance(dilation, list | tuple) assert len(dilation) == 4 self.factor = factor diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index 7a1716f16d..481d234a54 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -4,15 +4,14 @@ from coqpit import Coqpit -logger = logging.getLogger(__name__) - +from TTS.utils.generic_utils import to_camel +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig, BaseVocoderConfig +from TTS.vocoder.models.base_vocoder import BaseVocoder -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) +logger = logging.getLogger(__name__) -def setup_model(config: Coqpit): +def setup_model(config: BaseVocoderConfig) -> BaseVocoder: """Load models directly from configuration.""" if "discriminator_model" in config and "generator_model" in config: MyModel = importlib.import_module("TTS.vocoder.models.gan") @@ -29,19 +28,20 @@ def setup_model(config: Coqpit): try: MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: - raise ValueError(f"Model {config.model} not exist!") from e + raise ValueError(f"Model {config.model} does not exist!") from e logger.info("Vocoder model: %s", config.model) return MyModel.init_from_config(config) -def setup_generator(c): +def setup_generator(c: BaseGANVocoderConfig): """TODO: use config object as arguments""" logger.info("Generator model: %s", c.generator_model) MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) # this is to preserve the Wavernn class name (instead of Wavernn) if c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + c.generator_model_params["in_channels"] = c.generator_model_params.get("in_channels", c.audio["num_mels"]) + model = MyModel(out_channels=1, **c.generator_model_params) elif c.generator_model.lower() in "melgan_generator": model = MyModel( in_channels=c.audio["num_mels"], @@ -97,8 +97,8 @@ def setup_generator(c): return model -def setup_discriminator(c): - """TODO: use config objekt as arguments""" +def setup_discriminator(c: BaseGANVocoderConfig): + """TODO: use config object as arguments""" logger.info("Discriminator model: %s", c.discriminator_model) if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") @@ -107,7 +107,7 @@ def setup_discriminator(c): MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) if c.discriminator_model in "hifigan_discriminator": model = MyModel() - if c.discriminator_model in "random_window_discriminator": + elif c.discriminator_model in "random_window_discriminator": model = MyModel( cond_channels=c.audio["num_mels"], hop_length=c.audio["hop_length"], @@ -116,7 +116,7 @@ def setup_discriminator(c): cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], window_sizes=c.discriminator_model_params["window_sizes"], ) - if c.discriminator_model in "melgan_multiscale_discriminator": + elif c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -125,7 +125,7 @@ def setup_discriminator(c): max_channels=c.discriminator_model_params["max_channels"], downsample_factors=c.discriminator_model_params["downsample_factors"], ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + elif c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -140,7 +140,7 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + elif c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, @@ -152,6 +152,8 @@ def setup_discriminator(c): nonlinear_activation_params={"negative_slope": 0.2}, bias=True, ) - if c.discriminator_model == "univnet_discriminator": + elif c.discriminator_model == "univnet_discriminator": model = MyModel() + else: + raise NotImplementedError(f"Model {c.discriminator_model} not implemented!") return model diff --git a/TTS/vocoder/models/fullband_melgan_generator.py b/TTS/vocoder/models/fullband_melgan_generator.py index ee25559af0..292d3323bb 100644 --- a/TTS/vocoder/models/fullband_melgan_generator.py +++ b/TTS/vocoder/models/fullband_melgan_generator.py @@ -24,7 +24,7 @@ def __init__( num_res_blocks=num_res_blocks, ) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 8792950a56..6abb2dc997 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -1,5 +1,4 @@ from inspect import signature -from typing import Dict, List, Tuple 
import numpy as np import torch @@ -65,7 +64,7 @@ def inference(self, x: torch.Tensor) -> torch.Tensor: """ return self.model_g.inference(x) - def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict, optimizer_idx: int) -> tuple[dict, dict]: """Compute model outputs and the loss values. `optimizer_idx` selects the generator or the discriminator for network on the current pass. @@ -185,7 +184,7 @@ def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[ outputs = {"model_outputs": self.y_hat_g} return outputs, loss_dict - def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + def _log(self, name: str, ap: AudioProcessor, batch: dict, outputs: dict) -> tuple[dict, dict]: """Logging shared by the training and evaluation. Args: @@ -205,22 +204,32 @@ def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tup return figures, audios def train_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for training.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> tuple[dict, dict]: """Call `train_step()` with `no_grad()`""" self.train_disc = True # Avoid a bug in the Training with the missing discriminator loss return self.train_step(batch, criterion, optimizer_idx) def eval_log( - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: """Call `_log()` for evaluation.""" figures, audios = self._log("eval", self.ap, batch, outputs) logger.eval_figures(steps, figures) @@ -259,7 +268,7 @@ def on_train_step_start(self, trainer) -> None: """ self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator - def get_optimizer(self) -> List: + def get_optimizer(self) -> list: """Initiate and return the GAN optimizers based on the config parameters. It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. @@ -275,7 +284,7 @@ def get_optimizer(self) -> List: ) return [optimizer2, optimizer1] - def get_lr(self) -> List: + def get_lr(self) -> list: """Set the initial learning rates for each optimizer. Returns: @@ -283,7 +292,7 @@ def get_lr(self) -> List: """ return [self.config.lr_disc, self.config.lr_gen] - def get_scheduler(self, optimizer) -> List: + def get_scheduler(self, optimizer) -> list: """Set the schedulers for each optimizer. Args: @@ -297,7 +306,7 @@ def get_scheduler(self, optimizer) -> List: return [scheduler2, scheduler1] @staticmethod - def format_batch(batch: List) -> Dict: + def format_batch(batch: list) -> dict: """Format the batch for training. 
Args: @@ -316,12 +325,12 @@ def format_batch(batch: List) -> Dict: def get_data_loader( # pylint: disable=no-self-use, unused-argument self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, - rank: int = None, # pylint: disable=unused-argument + rank: int | None = None, # pylint: disable=unused-argument ): """Initiate and return the GAN dataloader. diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index afdd59a859..308b12ab56 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -178,6 +178,8 @@ def __init__( conv_pre_weight_norm=True, conv_post_weight_norm=True, conv_post_bias=True, + cond_in_each_up_layer=False, + pre_linear=None, ): r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) @@ -197,12 +199,17 @@ def __init__( for each consecutive upsampling layer. upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. + pre_linear (int): If not None, add nn.Linear(pre_linear, in_channels) before the convolutions. """ super().__init__() self.inference_padding = inference_padding self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_factors) + self.cond_in_each_up_layer = cond_in_each_up_layer + # initial upsampling layers + if pre_linear is not None: + self.lin_pre = nn.Linear(pre_linear, in_channels) self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) resblock = ResBlock1 if resblock_type == "1" else ResBlock2 # upsampling layers @@ -236,6 +243,12 @@ def __init__( if not conv_post_weight_norm: remove_parametrizations(self.conv_post, "weight") + if self.cond_in_each_up_layer: + self.conds = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + self.conds.append(nn.Conv1d(cond_channels, ch, 1)) + def forward(self, x, g=None): """ Args: @@ -249,12 +262,19 @@ def forward(self, x, g=None): x: [B, C, T] Tensor: [B, 1, T] """ + if hasattr(self, "lin_pre"): + x = self.lin_pre(x) + x = x.permute(0, 2, 1) o = self.conv_pre(x) if hasattr(self, "cond_layer"): o = o + self.cond_layer(g) for i in range(self.num_upsamples): o = F.leaky_relu(o, LRELU_SLOPE) o = self.ups[i](o) + + if self.cond_in_each_up_layer: + o = o + self.conds[i](g) + z_sum = None for j in range(self.num_kernels): if z_sum is None: @@ -267,7 +287,7 @@ def forward(self, x, g=None): o = torch.tanh(o) return o - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """ Args: @@ -293,9 +313,7 @@ def remove_weight_norm(self): remove_parametrizations(self.conv_pre, "weight") remove_parametrizations(self.conv_post, "weight") - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/melgan_generator.py b/TTS/vocoder/models/melgan_generator.py index 03c971afa4..53ed700755 100644 --- a/TTS/vocoder/models/melgan_generator.py +++ b/TTS/vocoder/models/melgan_generator.py @@ -84,9 +84,7 @@ def remove_weight_norm(self): except ValueError: 
layer.remove_weight_norm() - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/multiband_melgan_generator.py b/TTS/vocoder/models/multiband_melgan_generator.py index 25d6590659..6eee712db3 100644 --- a/TTS/vocoder/models/multiband_melgan_generator.py +++ b/TTS/vocoder/models/multiband_melgan_generator.py @@ -32,7 +32,7 @@ def pqmf_analysis(self, x): def pqmf_synthesis(self, x): return self.pqmf_layer.synthesis(x) - @torch.no_grad() + @torch.inference_mode() def inference(self, cond_features): cond_features = cond_features.to(self.layers[1].weight.device) cond_features = torch.nn.functional.pad( diff --git a/TTS/vocoder/models/parallel_wavegan_discriminator.py b/TTS/vocoder/models/parallel_wavegan_discriminator.py index 211d45d91c..02ad60e0ff 100644 --- a/TTS/vocoder/models/parallel_wavegan_discriminator.py +++ b/TTS/vocoder/models/parallel_wavegan_discriminator.py @@ -71,7 +71,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) @@ -174,7 +174,7 @@ def forward(self, x): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) self.apply(_apply_weight_norm) diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index 6a4d4ca6e7..71b38d4c0d 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -12,6 +12,13 @@ logger = logging.getLogger(__name__) +def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): + assert layers % stacks == 0 + layers_per_cycle = layers // stacks + dilations = [dilation(i % layers_per_cycle) for i in range(layers)] + return (kernel_size - 1) * sum(dilations) + 1 + + class ParallelWaveganGenerator(torch.nn.Module): """PWGAN generator as in https://arxiv.org/pdf/1910.11480.pdf. It is similar to WaveNet with no causal convolution. @@ -101,9 +108,9 @@ def forward(self, c): # perform upsampling if c is not None and self.upsample_net is not None: c = self.upsample_net(c) - assert ( - c.shape[-1] == x.shape[-1] - ), f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" + assert c.shape[-1] == x.shape[-1], ( + f" [!] Upsampling scale does not match the expected output. 
{c.shape} vs {x.shape}" + ) # encode to hidden representation x = self.first_conv(x) @@ -120,7 +127,7 @@ def forward(self, c): return x - @torch.no_grad() + @torch.inference_mode() def inference(self, c): c = c.to(self.first_conv.weight.device) c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") @@ -138,26 +145,17 @@ def _remove_weight_norm(m): def apply_weight_norm(self): def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) self.apply(_apply_weight_norm) - @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): - assert layers % stacks == 0 - layers_per_cycle = layers // stacks - dilations = [dilation(i % layers_per_cycle) for i in range(layers)] - return (kernel_size - 1) * sum(dilations) + 1 - @property def receptive_field_size(self): - return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 72e57a9c39..d991941441 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -1,5 +1,4 @@ import logging -from typing import List import numpy as np import torch @@ -7,6 +6,7 @@ from torch.nn.utils import parametrize from TTS.vocoder.layers.lvc_block import LVCBlock +from TTS.vocoder.models.parallel_wavegan_generator import _get_receptive_field_size logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ def __init__( out_channels: int, hidden_channels: int, cond_channels: int, - upsample_factors: List[int], + upsample_factors: list[int], lvc_layers_each_block: int, lvc_kernel_size: int, kpnet_hidden_channels: int, @@ -127,25 +127,18 @@ def apply_weight_norm(self): """Apply weight normalization module from all of the layers.""" def _apply_weight_norm(m): - if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + if isinstance(m, torch.nn.Conv1d | torch.nn.Conv2d): torch.nn.utils.parametrizations.weight_norm(m) logger.info("Weight norm is applied to %s", m) self.apply(_apply_weight_norm) - @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): - assert layers % stacks == 0 - layers_per_cycle = layers // stacks - dilations = [dilation(i % layers_per_cycle) for i in range(layers)] - return (kernel_size - 1) * sum(dilations) + 1 - @property def receptive_field_size(self): """Return receptive field size.""" - return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - @torch.no_grad() + @torch.inference_mode() def inference(self, c): """Perform inference. 
Args: diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index c49abd2201..5aa8ce5bb9 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -25,10 +24,10 @@ class WavegradArgs(Coqpit): use_weight_norm: bool = False y_conv_channels: int = 32 x_conv_channels: int = 768 - dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) - ublock_out_channels: List[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) - upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) - upsample_dilations: List[List[int]] = field( + dblock_out_channels: list[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: list[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: list[list[int]] = field( default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] ) @@ -123,7 +122,7 @@ def load_noise_schedule(self, path): beta = np.load(path, allow_pickle=True).item()["beta"] # pylint: disable=unexpected-keyword-arg self.compute_noise_level(beta) - @torch.no_grad() + @torch.inference_mode() def inference(self, x, y_n=None): """ Shapes: @@ -218,9 +217,7 @@ def apply_weight_norm(self): self.out_conv = weight_norm(self.out_conv) self.y_conv = weight_norm(self.y_conv) - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: @@ -242,7 +239,7 @@ def load_checkpoint( ) self.compute_noise_level(betas) - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: # format data x = batch["input"] y = batch["waveform"] @@ -258,20 +255,30 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: return {"model_output": noise_hat}, {"loss": loss} def train_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: pass - @torch.no_grad() - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + @torch.inference_mode() + def eval_step(self, batch: dict, criterion: nn.Module) -> tuple[dict, dict]: return self.train_step(batch, criterion) def eval_log( # pylint: disable=no-self-use - self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument + self, + batch: dict, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument ) -> None: pass - def test(self, assets: Dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument + def test(self, assets: dict, test_loader: "DataLoader", outputs=None): # pylint: disable=unused-argument # setup noise schedule and inference ap = 
assets["audio_processor"] noise_schedule = self.config["test_noise_schedule"] @@ -302,13 +309,22 @@ def get_criterion(): return torch.nn.L1Loss() @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: # return a whole audio segment m, y = batch[0], batch[1] y = y.unsqueeze(1) return {"input": m, "waveform": y} - def get_data_loader(self, config: Coqpit, assets: Dict, is_eval: True, samples: List, verbose: bool, num_gpus: int): + def get_data_loader( + self, + config: Coqpit, + assets: dict, + is_eval: True, + samples: list, + verbose: bool, + num_gpus: int, + rank: int | None = None, + ): ap = assets["audio_processor"] dataset = WaveGradDataset( ap=ap, diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 723f18dde2..fb95d47589 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,7 +1,6 @@ import sys import time from dataclasses import dataclass, field -from typing import Dict, List, Tuple import numpy as np import torch @@ -17,6 +16,7 @@ from TTS.utils.audio.numpy_transforms import mulaw_decode from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.layers.losses import WaveRNNLoss +from TTS.vocoder.layers.upsample import Stretch2d from TTS.vocoder.models.base_vocoder import BaseVocoder from TTS.vocoder.utils.distribution import sample_from_discretized_mix_logistic, sample_from_gaussian @@ -66,19 +66,6 @@ def forward(self, x): return x -class Stretch2d(nn.Module): - def __init__(self, x_scale, y_scale): - super().__init__() - self.x_scale = x_scale - self.y_scale = y_scale - - def forward(self, x): - b, c, h, w = x.size() - x = x.unsqueeze(-1).unsqueeze(3) - x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) - return x.view(b, c, h * self.y_scale, w * self.x_scale) - - class UpsampleNetwork(nn.Module): def __init__( self, @@ -183,7 +170,7 @@ class WavernnArgs(Coqpit): num_res_blocks: int = 10 use_aux_net: bool = True use_upsample_net: bool = True - upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8]) + upsample_factors: list[int] = field(default_factory=lambda: [4, 8, 8]) mode: str = "mold" # mold [string], gauss [string], bits [int] mulaw: bool = True # apply mulaw if mode is bits pad: int = 2 @@ -238,9 +225,9 @@ class of models has however remained an elusive problem. With a focus on text-to self.aux_dims = self.args.res_out_dims // 4 if self.args.use_upsample_net: - assert ( - np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length - ), " [!] upsample scales needs to be equal to hop_length" + assert np.cumprod(self.args.upsample_factors)[-1] == config.audio.hop_length, ( + " [!] 
upsample scales needs to be equal to hop_length" + ) self.upsample = UpsampleNetwork( self.args.feat_dims, self.args.upsample_factors, @@ -319,7 +306,7 @@ def inference(self, mels, batched=None, target=None, overlap=None): rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) - with torch.no_grad(): + with torch.inference_mode(): if isinstance(mels, np.ndarray): mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device)) @@ -540,16 +527,14 @@ def xfade_and_unfold(y, target, overlap): return unfolded - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False, cache=False): # pylint: disable=unused-argument, redefined-builtin state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) self.load_state_dict(state["model"]) if eval: self.eval() assert not self.training - def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def train_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: mels = batch["input"] waveform = batch["waveform"] waveform_coarse = batch["waveform_coarse"] @@ -564,13 +549,16 @@ def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: loss_dict = criterion(y_hat, waveform_coarse) return {"model_output": y_hat}, loss_dict - def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + def eval_step(self, batch: dict, criterion: dict) -> tuple[dict, dict]: return self.train_step(batch, criterion) @torch.no_grad() def test( - self, assets: Dict, test_loader: "DataLoader", output: Dict # pylint: disable=unused-argument - ) -> Tuple[Dict, Dict]: + self, + assets: dict, + test_loader: "DataLoader", + output: dict, # pylint: disable=unused-argument + ) -> tuple[dict, dict]: ap = self.ap figures = {} audios = {} @@ -591,14 +579,18 @@ def test( return figures, audios def test_log( - self, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument - ) -> Tuple[Dict, np.ndarray]: + self, + outputs: dict, + logger: "Logger", + assets: dict, + steps: int, # pylint: disable=unused-argument + ) -> tuple[dict, np.ndarray]: figures, audios = outputs logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) @staticmethod - def format_batch(batch: Dict) -> Dict: + def format_batch(batch: dict) -> dict: waveform = batch[0] mels = batch[1] waveform_coarse = batch[2] @@ -607,11 +599,12 @@ def format_batch(batch: Dict) -> Dict: def get_data_loader( # pylint: disable=no-self-use self, config: Coqpit, - assets: Dict, + assets: dict, is_eval: True, - samples: List, + samples: list, verbose: bool, num_gpus: int, + rank: int | None = None, ): ap = self.ap dataset = WaveRNNDataset( diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index fe706ba9ff..bef68e5564 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -12,7 +12,7 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = -0.5 * (-math.log(2.0 * math.pi) - 2.0 * log_std - torch.pow(y - mean, 2) * torch.exp(-2.0 * log_std)) return log_probs.squeeze().mean() diff --git a/TTS/vocoder/utils/generic_utils.py 
b/TTS/vocoder/utils/generic_utils.py index ac797d97f7..2823d206a0 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,5 +1,4 @@ import logging -from typing import Dict import numpy as np import torch @@ -32,7 +31,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict: +def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> dict: """Plot the predicted and the real waveform and their spectrograms. Args: diff --git a/docs/source/conf.py b/docs/source/conf.py index e7d36c1f43..e878d0e8f9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -52,6 +52,7 @@ "sphinx_inline_tabs", ] +suppress_warnings = ["autosectionlabel.*"] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -67,6 +68,8 @@ "linkify", ] +myst_heading_anchors = 4 + # 'sphinxcontrib.katex', # 'sphinx.ext.autosectionlabel', diff --git a/docs/source/configuration.md b/docs/source/configuration.md index ada61e16db..220c96c363 100644 --- a/docs/source/configuration.md +++ b/docs/source/configuration.md @@ -1,6 +1,6 @@ # Configuration -We use 👩‍✈️[Coqpit] for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is how a simple configuration looks like with Coqpit. +We use 👩‍✈️[Coqpit](https://github.com/idiap/coqui-ai-coqpit) for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is how a simple configuration looks like with Coqpit. ```python from dataclasses import asdict, dataclass, field @@ -36,7 +36,7 @@ class SimpleConfig(Coqpit): check_argument("val_c", c, restricted=True) ``` -In TTS, each model must have a configuration class that exposes all the values necessary for its lifetime. +In Coqui, each model must have a configuration class that exposes all the values necessary for its lifetime. It defines model architecture, hyper-parameters, training, and inference settings. For our models, we merge all the fields in a single configuration class for ease. It may not look like a wise practice but enables easier bookkeeping and reproducible experiments. diff --git a/docs/source/formatting_your_dataset.md b/docs/source/datasets/formatting_your_dataset.md similarity index 95% rename from docs/source/formatting_your_dataset.md rename to docs/source/datasets/formatting_your_dataset.md index 23c497d0bf..e92263339e 100644 --- a/docs/source/formatting_your_dataset.md +++ b/docs/source/datasets/formatting_your_dataset.md @@ -1,7 +1,9 @@ (formatting_your_dataset)= -# Formatting Your Dataset +# Formatting your dataset -For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips and each clip needs transcription. +For training a TTS model, you need a dataset with speech recordings and +transcriptions. The speech must be divided into audio clips and each clip needs +a transcription. If you have a single audio file and you need to split it into clips, there are different open-source tools for you. We recommend Audacity. It is an open-source and free audio editing software. @@ -49,7 +51,7 @@ The format above is taken from widely-used the [LJSpeech](https://keithito.com/L Your dataset should have good coverage of the target language. 
It should cover the phonemic variety, exceptional sounds and syllables. This is extremely important for especially non-phonemic languages like English. -For more info about dataset qualities and properties check our [post](https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset). +For more info about dataset qualities and properties check [this page](what_makes_a_good_dataset.md). ## Using Your Dataset in 🐸TTS diff --git a/docs/source/datasets/index.md b/docs/source/datasets/index.md new file mode 100644 index 0000000000..6b040fc416 --- /dev/null +++ b/docs/source/datasets/index.md @@ -0,0 +1,12 @@ +# Datasets + +For training a TTS model, you need a dataset with speech recordings and +transcriptions. See the following pages for more information on: + +```{toctree} +:maxdepth: 1 + +formatting_your_dataset +what_makes_a_good_dataset +tts_datasets +``` diff --git a/docs/source/tts_datasets.md b/docs/source/datasets/tts_datasets.md similarity index 90% rename from docs/source/tts_datasets.md rename to docs/source/datasets/tts_datasets.md index 11da1b7688..df8d2f2ad9 100644 --- a/docs/source/tts_datasets.md +++ b/docs/source/datasets/tts_datasets.md @@ -1,6 +1,6 @@ -# TTS Datasets +# Public TTS datasets -Some of the known public datasets that we successfully applied 🐸TTS: +Some of the known public datasets that were successfully used for 🐸TTS: - [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/) - [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) diff --git a/docs/source/what_makes_a_good_dataset.md b/docs/source/datasets/what_makes_a_good_dataset.md similarity index 100% rename from docs/source/what_makes_a_good_dataset.md rename to docs/source/datasets/what_makes_a_good_dataset.md diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md index 58d961203e..ef98fe302e 100644 --- a/docs/source/docker_images.md +++ b/docs/source/docker_images.md @@ -1,20 +1,20 @@ (docker_images)= -## Docker images +# Docker images We provide docker images to be able to test TTS without having to setup your own environment. -### Using premade images +## Using premade images You can use premade images built automatically from the latest TTS version. -#### CPU version +### CPU version ```bash -docker pull ghcr.io/coqui-ai/tts-cpu +docker pull ghcr.io/idiap/coqui-tts-cpu ``` -#### GPU version +### GPU version ```bash -docker pull ghcr.io/coqui-ai/tts +docker pull ghcr.io/idiap/coqui-tts ``` -### Building your own image +## Building your own image ```bash docker build -t tts . ``` @@ -25,14 +25,14 @@ You can pass any tts argument after the image name. ### CPU version ```bash -docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav +docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav ``` ### GPU version For the GPU version, you need to have the latest NVIDIA drivers installed. With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8 ```bash -docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda +docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/idiap/coqui-tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda ``` ## Start a server @@ -41,14 +41,14 @@ Start the container and get a shell inside it. 
### CPU version ```bash -docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu +docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/idiap/coqui-tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits ``` ### GPU version ```bash -docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts +docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/idiap/coqui-tts python3 TTS/server/server.py --list_models #To get the list of available models python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda ``` diff --git a/docs/source/implementing_a_new_language_frontend.md b/docs/source/extension/implementing_a_new_language_frontend.md similarity index 88% rename from docs/source/implementing_a_new_language_frontend.md rename to docs/source/extension/implementing_a_new_language_frontend.md index 2041352d64..0b3ef59be0 100644 --- a/docs/source/implementing_a_new_language_frontend.md +++ b/docs/source/extension/implementing_a_new_language_frontend.md @@ -1,6 +1,6 @@ -# Implementing a New Language Frontend +# Implementing new language front ends -- Language frontends are located under `TTS.tts.utils.text` +- Language front ends are located under `TTS.tts.utils.text` - Each special language has a separate folder. - Each folder contains all the utilities for processing the text input. - `TTS.tts.utils.text.phonemizers` contains the main phonemizer for a language. This is the class that uses the utilities diff --git a/docs/source/implementing_a_new_model.md b/docs/source/extension/implementing_a_new_model.md similarity index 96% rename from docs/source/implementing_a_new_model.md rename to docs/source/extension/implementing_a_new_model.md index 1bf7a8822e..188f466c72 100644 --- a/docs/source/implementing_a_new_model.md +++ b/docs/source/extension/implementing_a_new_model.md @@ -1,4 +1,4 @@ -# Implementing a Model +# Implementing new models 1. Implement layers. @@ -36,7 +36,8 @@ There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you an infinite flexibility to add custom behaviours for your model and training routines. - For more details, see {ref}`BaseTTS ` and :obj:`TTS.utils.callbacks`. + For more details, see [BaseTTS](../main_classes/model_api.md#base-tts-model) + and [`trainer.callbacks`](https://github.com/idiap/coqui-ai-Trainer/blob/main/trainer/callbacks.py). 6. Optionally, define `MyModelArgs`. @@ -62,7 +63,7 @@ We love you more when you document your code. ❤️ -# Template 🐸TTS Model implementation +## Template 🐸TTS Model implementation You can start implementing your model by copying the following base class. diff --git a/docs/source/extension/index.md b/docs/source/extension/index.md new file mode 100644 index 0000000000..39c36b632c --- /dev/null +++ b/docs/source/extension/index.md @@ -0,0 +1,14 @@ +# Adding models or languages + +You can extend Coqui by implementing new model architectures or adding front +ends for new languages. See the pages below for more details. The [project +structure](../project_structure.md) and [contribution +guidelines](../contributing.md) may also be helpful. Please open a pull request +with your changes to share back the improvements with the community. 
+ +```{toctree} +:maxdepth: 1 + +implementing_a_new_model +implementing_a_new_language_frontend +``` diff --git a/docs/source/faq.md b/docs/source/faq.md index 1090aaa35c..4fbd149f00 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -1,28 +1,56 @@ -# Humble FAQ -We tried to collect common issues and questions we receive about 🐸TTS. It is worth checking before going deeper. +# FAQ +We tried to collect common issues and questions we receive about 🐸TTS. It is +worth checking before going deeper. -## Errors with a pre-trained model. How can I resolve this? -- Make sure you use the right commit version of 🐸TTS. Each pre-trained model has its corresponding version that needs to be used. It is defined on the model table. -- If it is still problematic, post your problem on [Discussions](https://github.com/idiap/coqui-ai-TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) -- If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny. +## Using Coqui -## What are the requirements of a good 🐸TTS dataset? -* {ref}`See this page ` +### Where does Coqui store downloaded models? -## How should I choose the right model? +The path to downloaded models is printed when running `tts --list_models`. +Default locations are: + +- **Linux:** `~/.local/share/tts` +- **Mac:** `~/Library/Application Support/tts` +- **Windows:** `C:\Users\\AppData\Local\tts` + +You can change the prefix of this `tts/` folder by setting the `XDG_DATA_HOME` +or `TTS_HOME` environment variables. + +### Errors with a pre-trained model. How can I resolve this? +- Make sure you use the latest version of 🐸TTS. Each pre-trained model is only + supported from a certain minimum version. +- If it is still problematic, post your problem on + [Discussions](https://github.com/idiap/coqui-ai-TTS/discussions). Please give + as many details as possible (error message, your TTS version, your TTS model + and config.json etc.) +- If you feel like it's a bug to be fixed, then prefer Github issues with the + same level of scrutiny. + +## Training Coqui models + +### What are the requirements of a good 🐸TTS dataset? +- [See this page](datasets/what_makes_a_good_dataset.md) + +### How should I choose the right model? - First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2. - Tacotron models produce the most natural voice if your dataset is not too noisy. - If both models do not perform well and especially the attention does not align, then try AlignTTS or GlowTTS. - If you need faster models, consider SpeedySpeech, GlowTTS or AlignTTS. Keep in mind that SpeedySpeech requires a pre-trained Tacotron or Tacotron2 model to compute text-to-speech alignments. -## How can I train my own `tts` model? +### How can I train my own `tts` model? + +```{note} XTTS has separate fine-tuning scripts, see [here](models/xtts.md#training). +``` + 0. Check your dataset with notebooks in [dataset_analysis](https://github.com/idiap/coqui-ai-TTS/tree/main/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/idiap/coqui-ai-TTS/blob/main/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. -1. Write your own dataset `formatter` in `datasets/formatters.py` or format your dataset as one of the supported datasets, like LJSpeech. +1. 
Write your own dataset `formatter` in `datasets/formatters.py` or [format](datasets/formatting_your_dataset) your dataset as one of the supported datasets, like LJSpeech. A `formatter` parses the metadata file and converts a list of training samples. 2. If you have a dataset with a different alphabet than English, you need to set your own character list in the ```config.json```. - - If you use phonemes for training and your language is supported [here](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. + - If you use phonemes for training and your language is supported by + [Espeak](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md) + or [Gruut](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. - You can use `TTS/bin/find_unique_chars.py` to get characters used in your dataset. 3. Write your own text cleaner in ```utils.text.cleaners```. It is not always necessary, except when you have a different alphabet or language-specific requirements. @@ -61,15 +89,16 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json``` - MultiGPU training: ```python3 -m trainer.distribute --gpus "0,1" --script TTS/bin/train_tts.py --config_path config.json``` -**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```. +**Note:** You can also train your model using pure 🐍 python. Check the +[tutorial](tutorial_for_nervous_beginners.md). -## How can I train in a different language? +### How can I train in a different language? - Check steps 2, 3, 4, 5 above. -## How can I train multi-GPUs? +### How can I train multi-GPUs? - Check step 5 above. -## How can I check model performance? +### How can I check model performance? - You can inspect model training and performance using ```tensorboard```. It will show you loss, attention alignment, model output. Go with the order below to measure the model performance. 1. Check ground truth spectrograms. If they do not look as they are supposed to, then check audio processing parameters in ```config.json```. 2. Check train and eval losses and make sure that they all decrease smoothly in time. @@ -84,7 +113,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - 'bidirectional_decoder' is your ultimate savior, but it trains 2x slower and demands 1.5x more GPU memory. - You can also try the other models like AlignTTS or GlowTTS. -## How do I know when to stop training? +### How do I know when to stop training? There is no single objective metric to decide the end of a training since the voice quality is a subjective matter. In our model trainings, we follow these steps; @@ -97,17 +126,17 @@ In our model trainings, we follow these steps; Keep in mind that the approach above only validates the model robustness. It is hard to estimate the voice quality without asking the actual people. The best approach is to pick a set of promising models and run a Mean-Opinion-Score study asking actual people to score the models. -## My model does not learn. How can I debug? +### My model does not learn. How can I debug? - Go over the steps under "How can I check model performance?" -## Attention does not align. How can I make it work? +### Attention does not align. How can I make it work? - Check the 4th step under "How can I check model performance?" 
-## How can I test a trained model? -- The best way is to use `tts` or `tts-server` commands. For details check {ref}`here `. +### How can I test a trained model? +- The best way is to use `tts` or `tts-server` commands. For details check [here](inference.md). - If you need to code your own ```TTS.utils.synthesizer.Synthesizer``` class. -## My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. +### My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. - In general, all of the above relates to the `stopnet`. It is the part of the model telling the `decoder` when to stop. - In general, a poor `stopnet` relates to something else that is broken in your model or dataset. Especially the attention module. - One common reason is the silent parts in the audio clips at the beginning and the ending. Check ```trim_db``` value in the config. You can find a better value for your dataset by using ```CheckSpectrogram``` notebook. If this value is too small, too much of the audio will be trimmed. If too big, then too much silence will remain. Both will curtail the `stopnet` performance. diff --git a/docs/source/index.md b/docs/source/index.md index 79993eec76..3a030b4f81 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,62 +1,63 @@ +--- +hide-toc: true +--- ```{include} ../../README.md :relative-images: +:end-before: ``` ----- - -# Documentation Content -```{eval-rst} -.. toctree:: - :maxdepth: 2 - :caption: Get started - - tutorial_for_nervous_beginners - installation - faq - contributing - -.. toctree:: - :maxdepth: 2 - :caption: Using 🐸TTS - - inference - docker_images - implementing_a_new_model - implementing_a_new_language_frontend - training_a_model - finetuning - configuration - formatting_your_dataset - what_makes_a_good_dataset - tts_datasets - marytts - -.. toctree:: - :maxdepth: 2 - :caption: Main Classes - - main_classes/trainer_api - main_classes/audio_processor - main_classes/model_api - main_classes/dataset - main_classes/gan - main_classes/speaker_manager - -.. toctree:: - :maxdepth: 2 - :caption: `tts` Models - - models/glow_tts.md - models/vits.md - models/forward_tts.md - models/tacotron1-2.md - models/overflow.md - models/tortoise.md - models/bark.md - models/xtts.md - -.. toctree:: - :maxdepth: 2 - :caption: `vocoder` Models +```{toctree} +:maxdepth: 1 +:caption: Get started +:hidden: + +tutorial_for_nervous_beginners +installation +docker_images +faq +project_structure +contributing +``` + +```{toctree} +:maxdepth: 1 +:caption: Using Coqui +:hidden: + +inference +training/index +extension/index +datasets/index +``` + + +```{toctree} +:maxdepth: 1 +:caption: Main Classes +:hidden: + +configuration +main_classes/trainer_api +main_classes/audio_processor +main_classes/model_api +main_classes/dataset +main_classes/gan +main_classes/speaker_manager +``` + + +```{toctree} +:maxdepth: 1 +:caption: TTS Models +:hidden: + +models/glow_tts.md +models/vits.md +models/forward_tts.md +models/tacotron1-2.md +models/overflow.md +models/tortoise.md +models/bark.md +models/xtts.md ``` diff --git a/docs/source/inference.md b/docs/source/inference.md index 4cb8f45a71..1bb844aee3 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -1,194 +1,22 @@ (synthesizing_speech)= -# Synthesizing Speech +# Synthesizing speech -First, you need to install TTS. We recommend using PyPi. 
You need to call the command below: +## Overview -```bash -$ pip install coqui-tts -``` - -After the installation, 2 terminal commands are available. - -1. TTS Command Line Interface (CLI). - `tts` -2. Local Demo Server. - `tts-server` -3. In 🐍Python. - `from TTS.api import TTS` - -## On the Commandline - `tts` -![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) - -After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. - -Listing released 🐸TTS models. - -```bash -tts --list_models -``` - -Run a TTS model, from the release models list, with its default vocoder. (Simply copy and paste the full model names from the list as arguments for the command below.) - -```bash -tts --text "Text for TTS" \ - --model_name "///" \ - --out_path folder/to/save/output.wav -``` - -Run a tts and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts --text "Text for TTS" \ - --model_name "tts_models///" \ - --vocoder_name "vocoder_models///" \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS model (Using Griffin-Lim Vocoder) - -```bash -tts --text "Text for TTS" \ - --model_path path/to/model.pth \ - --config_path path/to/config.json \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS and Vocoder models - -```bash -tts --text "Text for TTS" \ - --config_path path/to/config.json \ - --model_path path/to/model.pth \ - --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth \ - --vocoder_config_path path/to/vocoder_config.json -``` - -Run a multi-speaker TTS model from the released models list. - -```bash -tts --model_name "tts_models///" --list_speaker_idxs # list the possible speaker IDs. -tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "tts_models///" --speaker_idx "" -``` - -Run a released voice conversion model - -```bash -tts --model_name "voice_conversion///" - --source_wav "my/source/speaker/audio.wav" - --target_wav "my/target/speaker/audio.wav" - --out_path folder/to/save/output.wav -``` - -**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder. - -## On the Demo Server - `tts-server` - - -![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) - -You can boot up a demo 🐸TTS server to run an inference with your models (make -sure to install the additional dependencies with `pip install coqui-tts[server]`). -Note that the server is not optimized for performance but gives you an easy way -to interact with the models. +Coqui TTS provides three main methods for inference: -The demo server provides pretty much the same interface as the CLI command. +1. 🐍Python API +2. TTS command line interface (CLI) +3. [Local demo server](server.md) -```bash -tts-server -h # see the help -tts-server --list_models # list the available models. +```{include} ../../README.md +:start-after: ``` -Run a TTS model, from the release models list, with its default vocoder. -If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize -speech. - -```bash -tts-server --model_name "///" -``` - -Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. 
- -```bash -tts-server --model_name "///" \ - --vocoder_name "///" -``` - -## Python 🐸TTS API - -You can run a multi-speaker and multi-lingual model in Python as - -```python -import torch -from TTS.api import TTS - -# Get device -device = "cuda" if torch.cuda.is_available() else "cpu" - -# List available 🐸TTS models -print(TTS().list_models()) - -# Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) - -# Run TTS -# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language -# Text to speech list of amplitude values as output -wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en") -# Text to speech to a file -tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -``` - -#### Here is an example for a single speaker model. - -```python -# Init TTS with the target model name -tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False) -# Run TTS -tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) -``` - -#### Example voice cloning with YourTTS in English, French and Portuguese: - -```python -tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda") -tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") -tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="output.wav") -tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="output.wav") -``` - -#### Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav` - -```python -tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda") -tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") -``` - -#### Example voice cloning by a single speaker TTS model combining with the voice conversion model. - -This way, you can clone voices by using any model in 🐸TTS. - -```python -tts = TTS("tts_models/de/thorsten/tacotron2-DDC") -tts.tts_with_vc_to_file( - "Wie sage ich auf Italienisch, dass ich dich liebe?", - speaker_wav="target/speaker.wav", - file_path="ouptut.wav" -) -``` - -#### Example text to speech using **Fairseq models in ~1100 languages** 🤯. -For these models use the following name format: `tts_models//fairseq/vits`. - -You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms). - -```python -from TTS.api import TTS -api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda") -api.tts_to_file("This is a test.", file_path="output.wav") -# TTS with on the fly voice conversion -api = TTS("tts_models/deu/fairseq/vits") -api.tts_with_vc_to_file( - "Wie sage ich auf Italienisch, dass ich dich liebe?", - speaker_wav="target/speaker.wav", - file_path="ouptut.wav" -) +```{toctree} +:hidden: +vc +server +marytts ``` diff --git a/docs/source/installation.md b/docs/source/installation.md index 405c436643..1315395a59 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,40 +1,6 @@ # Installation -🐸TTS supports python >=3.9 <3.13.0 and was tested on Ubuntu 22.04. 
- -## Using `pip` - -`pip` is recommended if you want to use 🐸TTS only for inference. - -You can install from PyPI as follows: - -```bash -pip install coqui-tts # from PyPI -``` - -Or install from Github: - -```bash -pip install git+https://github.com/idiap/coqui-ai-TTS # from Github +```{include} ../../README.md +:start-after: +:end-before: ``` - -## Installing From Source - -This is recommended for development and more control over 🐸TTS. - -```bash -git clone https://github.com/idiap/coqui-ai-TTS -cd coqui-ai-TTS -make system-deps # only on Linux systems. - -# Install package and optional extras -make install - -# Same as above + dev dependencies and pre-commit -make install_dev -``` - -## On Windows -If you are on Windows, 👑@GuyPaddock wrote installation instructions -[here](https://stackoverflow.com/questions/66726331/) (note that these are out -of date, e.g. you need to have at least Python 3.9) diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md index 71b3d41640..bb7e9d1a1d 100644 --- a/docs/source/main_classes/model_api.md +++ b/docs/source/main_classes/model_api.md @@ -1,22 +1,22 @@ # Model API Model API provides you a set of functions that easily make your model compatible with the `Trainer`, -`Synthesizer` and `ModelZoo`. +`Synthesizer` and the Coqui Python API. -## Base TTS Model +## Base Trainer Model ```{eval-rst} .. autoclass:: TTS.model.BaseTrainerModel :members: ``` -## Base tts Model +## Base TTS Model ```{eval-rst} .. autoclass:: TTS.tts.models.base_tts.BaseTTS :members: ``` -## Base vocoder Model +## Base Vocoder Model ```{eval-rst} .. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index 335294aa4d..bdb6048e45 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,3 +1,3 @@ # Trainer API -We made the trainer a separate project on https://github.com/eginhard/coqui-trainer +We made the trainer a separate project: https://github.com/idiap/coqui-ai-Trainer diff --git a/docs/source/marytts.md b/docs/source/marytts.md index 9091ca330f..11cf4a2b9a 100644 --- a/docs/source/marytts.md +++ b/docs/source/marytts.md @@ -1,4 +1,4 @@ -# Mary-TTS API Support for Coqui-TTS +# Mary-TTS API support for Coqui TTS ## What is Mary-TTS? diff --git a/docs/source/models/bark.md b/docs/source/models/bark.md index a180afbb91..77f99c0d3a 100644 --- a/docs/source/models/bark.md +++ b/docs/source/models/bark.md @@ -37,7 +37,7 @@ from TTS.api import TTS # Load the model to GPU # Bark is really slow on CPU, so we recommend using GPU. -tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/bark").to("cuda") # Cloning a new speaker @@ -57,7 +57,7 @@ tts.tts_to_file(text="Hello, my name is Manmay , how are you?", # random speaker -tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/bark").to("cuda") tts.tts_to_file("hello world", file_path="out.wav") ``` diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index c07d879f7c..5f6c6ba44c 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -1,25 +1,25 @@ -# ⓍTTS -ⓍTTS is a super cool Text-to-Speech model that lets you clone voices in different languages by using just a quick 3-second audio clip. 
Built on the 🐢Tortoise, -ⓍTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. +# XTTS +XTTS is a super cool Text-to-Speech model that lets you clone voices in different languages by using just a quick 3-second audio clip. Built on the 🐢Tortoise, +XTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. There is no need for an excessive amount of training data that spans countless hours. -### Features +## Features - Voice cloning. - Cross-language voice cloning. - Multi-lingual speech generation. - 24khz sampling rate. -- Streaming inference with < 200ms latency. (See [Streaming inference](#streaming-inference)) +- Streaming inference with < 200ms latency. (See [Streaming inference](#streaming-manually)) - Fine-tuning support. (See [Training](#training)) -### Updates with v2 +## Updates with v2 - Improved voice cloning. - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime. - Across the board quality improvements. -### Code +## Code Current implementation only supports inference and GPT encoder training. -### Languages +## Languages XTTS-v2 supports 17 languages: - Arabic (ar) @@ -40,15 +40,15 @@ XTTS-v2 supports 17 languages: - Spanish (es) - Turkish (tr) -### License +## License This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml). -### Contact +## Contact Come and join in our 🐸Community. We're active on [Discord](https://discord.gg/fBC58unbKE) and [Github](https://github.com/idiap/coqui-ai-TTS/discussions). -### Inference +## Inference -#### 🐸TTS Command line +### 🐸TTS Command line You can check all supported languages with the following command: @@ -64,7 +64,7 @@ You can check all Coqui available speakers with the following command: --list_speaker_idx ``` -##### Coqui speakers +#### Coqui speakers You can do inference using one of the available speakers using the following command: ```console @@ -75,10 +75,10 @@ You can do inference using one of the available speakers using the following com --use_cuda ``` -##### Clone a voice +#### Clone a voice You can clone a speaker voice using a single or multiple references: -###### Single reference +##### Single reference ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ @@ -88,7 +88,7 @@ You can clone a speaker voice using a single or multiple references: --use_cuda ``` -###### Multiple references +##### Multiple references ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ --text "Bugün okula gitmek istemiyorum." \ @@ -106,19 +106,19 @@ or for all wav files in a directory you can use: --use_cuda ``` -#### 🐸TTS API +### 🐸TTS API -##### Clone a voice +#### Clone a voice You can clone a speaker voice using a single or multiple references: -###### Single reference +##### Single reference Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio. You can optionally disable sentence splitting for better coherence but more VRAM and possibly hitting models context length limit. 
```python from TTS.api import TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # generate speech by cloning a voice using default settings tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -129,7 +129,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t ) ``` -###### Multiple references +##### Multiple references You can pass multiple audio files to the `speaker_wav` argument for better voice cloning. @@ -137,15 +137,15 @@ You can pass multiple audio files to the `speaker_wav` argument for better voice from TTS.api import TTS # using the default version set in 🐸TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # using a specific version # 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main # ❗some versions might be incompatible with the API -tts = TTS("xtts_v2.0.2", gpu=True) +tts = TTS("xtts_v2.0.2").to("cuda") # getting the latest XTTS_v2 -tts = TTS("xtts", gpu=True) +tts = TTS("xtts").to("cuda") # generate speech by cloning a voice using default settings tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -154,37 +154,38 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t language="en") ``` -##### Coqui speakers +#### Coqui speakers You can do inference using one of the available speakers using the following code: ```python from TTS.api import TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") # generate speech by cloning a voice using default settings -tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - file_path="output.wav", - speaker="Ana Florence", - language="en", - split_sentences=True - ) +tts.tts_to_file( + text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + file_path="output.wav", + speaker="Ana Florence", + language="en", + split_sentences=True +) ``` -#### 🐸TTS Model API +### 🐸TTS Model API To use the model API, you need to download the model files and pass config and model file paths manually. -#### Manual Inference +### Manual Inference If you want to be able to `load_checkpoint` with `use_deepspeed=True` and **enjoy the speedup**, you need to install deepspeed first. ```console -pip install deepspeed==0.10.3 +pip install deepspeed ``` -##### inference parameters +#### Inference parameters - `text`: The text to be synthesized. - `language`: The language of the text to be synthesized. @@ -199,7 +200,7 @@ pip install deepspeed==0.10.3 - `enable_text_splitting`: Whether to split the text into sentences and generate audio for each sentence. It allows you to have infinite input length but might loose important context between sentences. Defaults to True. 
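To see how these parameters fit together, here is a minimal sketch of a call to `model.inference()`. It assumes `model` has been loaded via `load_checkpoint()` as described above and that the conditioning latents are available (computed from your reference audio, or taken from `model.speaker_manager.speakers` as shown below); specific values such as `temperature=0.7` are illustrative placeholders, not necessarily the library defaults.

```python
import torch
import torchaudio

# Sketch only: `model`, `gpt_cond_latent` and `speaker_embedding` are assumed
# to already exist, as in the full inference example below.
out = model.inference(
    text="It took me quite a long time to develop a voice.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    temperature=0.7,             # illustrative value, tune to taste
    enable_text_splitting=True,  # split long inputs into sentences
)
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```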
-##### Inference +#### Inference ```python @@ -230,8 +231,13 @@ out = model.inference( torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) ``` +You can also use the Coqui speakers: + +```python +gpt_cond_latent, speaker_embedding = model.speaker_manager.speakers["Ana Florence"].values() +``` -##### Streaming manually +#### Streaming manually Here the goal is to stream the audio as it is being generated. This is useful for real-time applications. Streaming inference is typically slower than regular inference, but it allows to get a first chunk of audio faster. @@ -275,9 +281,9 @@ torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000) ``` -### Training +## Training -#### Easy training +### Easy training To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio demo that implements the whole fine-tuning pipeline. The gradio demo enables the user to easily do the following steps: - Preprocessing of the uploaded audio or audio files in 🐸 TTS coqui formatter @@ -286,7 +292,7 @@ To make `XTTS_v2` GPT encoder training easier for beginner users we did a gradio The user can run this gradio demo locally or remotely using a Colab Notebook. -##### Run demo on Colab +#### Run demo on Colab To make the `XTTS_v2` fine-tuning more accessible for users that do not have good GPUs available we did a Google Colab Notebook. The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing). @@ -302,7 +308,7 @@ If you are not able to acess the video you need to follow the steps: 5. Soon the training is done you can go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. Then you can do the inference on the model by clicking on the button "Step 4 - Inference". -##### Run demo locally +#### Run demo locally To run the demo locally you need to do the following steps: 1. Install 🐸 TTS following the instructions available [here](https://coqui-tts.readthedocs.io/en/latest/installation.html). @@ -319,7 +325,7 @@ If you are not able to access the video, here is what you need to do: 4. Go to the third Tab (3 - Inference) and then click on the button "Step 3 - Load Fine-tuned XTTS model" and wait until the fine-tuned model is loaded. 5. Now you can run inference with the model by clicking on the button "Step 4 - Inference". -#### Advanced training +### Advanced training A recipe for `XTTS_v2` GPT encoder training using `LJSpeech` dataset is available at https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -393,6 +399,6 @@ torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) ## XTTS Model ```{eval-rst} -.. autoclass:: TTS.tts.models.xtts.XTTS +.. 
autoclass:: TTS.tts.models.xtts.Xtts :members: ``` diff --git a/docs/source/project_structure.md b/docs/source/project_structure.md new file mode 100644 index 0000000000..af3e472adc --- /dev/null +++ b/docs/source/project_structure.md @@ -0,0 +1,30 @@ +# Project structure + +## Directory structure + +A non-comprehensive overview of the Coqui source code: + +| Directory | Contents | +| - | - | +| **Core** | | +| **[`TTS/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS)** | Main source code | +| **[`- .models.json`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/.models.json)** | Pretrained model list | +| **[`- api.py`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/api.py)** | Python API | +| **[`- bin/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/bin)** | Executables and CLI | +| **[`- tts/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts)** | Text-to-speech models | +| **[`- configs/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/configs)** | Model configurations | +| **[`- layers/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/layers)** | Model layer definitions | +| **[`- models/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/tts/models)** | Model definitions | +| **[`- vc/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/vc)** | Voice conversion models | +| `- (same)` | | +| **[`- vocoder/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/vocoder)** | Vocoder models | +| `- (same)` | | +| **[`- encoder/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/TTS/encoder)** | Speaker encoder models | +| `- (same)` | | +| **Recipes/notebooks** | | +| **[`notebooks/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/notebooks)** | Jupyter Notebooks for model evaluation, parameter selection and data analysis | +| **[`recipes/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes)** | Training recipes | +| **Others** | | +| **[`pyproject.toml`](https://github.com/idiap/coqui-ai-TTS/tree/dev/pyproject.toml)** | Project metadata, configuration and dependencies | +| **[`docs/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/docs)** | Documentation | +| **[`tests/`](https://github.com/idiap/coqui-ai-TTS/tree/dev/tests)** | Unit and integration tests | diff --git a/docs/source/server.md b/docs/source/server.md new file mode 100644 index 0000000000..69bdace27b --- /dev/null +++ b/docs/source/server.md @@ -0,0 +1,30 @@ +# Demo server + +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) + +You can boot up a demo 🐸TTS server to run an inference with your models (make +sure to install the additional dependencies with `pip install coqui-tts[server]`). +Note that the server is not optimized for performance. + +The demo server provides pretty much the same interface as the CLI command. + +```bash +tts-server -h # see the help +tts-server --list_models # list the available models. +``` + +Run a TTS model, from the release models list, with its default vocoder. +If the model you choose is a multi-speaker or multilingual TTS model, you can +select different speakers and languages on the Web interface and synthesize +speech. + +```bash +tts-server --model_name "///" +``` + +Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. 
+ +```bash +tts-server --model_name "///" \ + --vocoder_name "///" +``` diff --git a/docs/source/finetuning.md b/docs/source/training/finetuning.md similarity index 91% rename from docs/source/finetuning.md rename to docs/source/training/finetuning.md index 548e385ec7..fa2ed34a54 100644 --- a/docs/source/finetuning.md +++ b/docs/source/training/finetuning.md @@ -1,4 +1,4 @@ -# Fine-tuning a 🐸 TTS model +# Fine-tuning a model ## Fine-tuning @@ -21,17 +21,21 @@ them and fine-tune it for your own dataset. This will help you in two main ways: Fine-tuning comes to the rescue in this case. You can take one of our pre-trained models and fine-tune it on your own speech dataset and achieve reasonable results with only a couple of hours of data. - However, note that, fine-tuning does not ensure great results. The model performance still depends on the - {ref}`dataset quality ` and the hyper-parameters you choose for fine-tuning. Therefore, + However, note that, fine-tuning does not ensure great results. The model + performance still depends on the [dataset quality](../datasets/what_makes_a_good_dataset.md) + and the hyper-parameters you choose for fine-tuning. Therefore, it still takes a bit of tinkering. ## Steps to fine-tune a 🐸 TTS model +```{note} XTTS has separate fine-tuning scripts, see [here](../models/xtts.md#training). +``` + 1. Setup your dataset. You need to format your target dataset in a certain way so that 🐸TTS data loader will be able to load it for the - training. Please see {ref}`this page ` for more information about formatting. + training. Please see [this page](../datasets/formatting_your_dataset.md) for more information about formatting. 2. Choose the model you want to fine-tune. @@ -47,7 +51,8 @@ them and fine-tune it for your own dataset. This will help you in two main ways: You should choose the model based on your requirements. Some models are fast and some are better in speech quality. One lazy way to test a model is running the model on the hardware you want to use and see how it works. For - simple testing, you can use the `tts` command on the terminal. For more info see {ref}`here `. + simple testing, you can use the `tts` command on the terminal. For more info + see [here](../inference.md). 3. Download the model. diff --git a/docs/source/training/index.md b/docs/source/training/index.md new file mode 100644 index 0000000000..b09f9cadcb --- /dev/null +++ b/docs/source/training/index.md @@ -0,0 +1,13 @@ +# Training and fine-tuning + +The following pages show you how to train and fine-tune Coqui models: + +```{toctree} +:maxdepth: 1 + +training_a_model +finetuning +``` + +Also see the [XTTS page](../models/xtts.md#training) if you want to fine-tune +that model. diff --git a/docs/source/training_a_model.md b/docs/source/training/training_a_model.md similarity index 92% rename from docs/source/training_a_model.md rename to docs/source/training/training_a_model.md index 989a57042a..22505ccb17 100644 --- a/docs/source/training_a_model.md +++ b/docs/source/training/training_a_model.md @@ -1,4 +1,4 @@ -# Training a Model +# Training a model 1. Decide the model you want to use. @@ -11,11 +11,10 @@ 3. Check the recipes. - Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good start point for - `Nervous Beginners`. + Recipes are located under `TTS/recipes/`. They do not promise perfect models but they provide a good start point. A recipe for `GlowTTS` using `LJSpeech` dataset looks like below. 
Let's be creative and call this `train_glowtts.py`. - ```{literalinclude} ../../recipes/ljspeech/glow_tts/train_glowtts.py + ```{literalinclude} ../../../recipes/ljspeech/glow_tts/train_glowtts.py ``` You need to change fields of the `BaseDatasetConfig` to match your dataset and then update `GlowTTSConfig` @@ -113,7 +112,7 @@ Note that different models have different metrics, visuals and outputs. - You should also check the [FAQ page](https://github.com/coqui-ai/TTS/wiki/FAQ) for common problems and solutions + You should also check the [FAQ page](../faq.md) for common problems and solutions that occur in a training. 7. Use your best model for inference. @@ -132,7 +131,7 @@ In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models. -# Multi-speaker Training +## Multi-speaker Training Training a multi-speaker model is mostly the same as training a single-speaker model. You need to specify a couple of configuration parameters, initiate a `SpeakerManager` instance and pass it to the model. @@ -142,5 +141,5 @@ d-vectors. For using d-vectors, you first need to compute the d-vectors using th The same Glow-TTS model above can be trained on a multi-speaker VCTK dataset with the script below. -```{literalinclude} ../../recipes/vctk/glow_tts/train_glow_tts.py +```{literalinclude} ../../../recipes/vctk/glow_tts/train_glow_tts.py ``` diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index b417c4c45a..5e5eac0e0a 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -1,24 +1,40 @@ -# Tutorial For Nervous Beginners +# Tutorial for nervous beginners -## Installation +First [install](installation.md) Coqui TTS. -User friendly installation. Recommended only for synthesizing voice. +## Synthesizing Speech + +You can run `tts` and synthesize speech directly on the terminal. ```bash -$ pip install coqui-tts +$ tts -h # see the help +$ tts --list_models # list the available models. ``` -Developer friendly installation. +![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) + + +You can call `tts-server` to start a local demo server that you can open on +your favorite web browser and 🗣️ (make sure to install the additional +dependencies with `pip install coqui-tts[server]`). ```bash -$ git clone https://github.com/idiap/coqui-ai-TTS -$ cd coqui-ai-TTS -$ pip install -e . +$ tts-server -h # see the help +$ tts-server --list_models # list the available models. ``` +![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) + +See [this page](inference.md) for more details on synthesizing speech with the +CLI, server or Python API. ## Training a `tts` Model -A breakdown of a simple script that trains a GlowTTS model on the LJspeech dataset. See the comments for more details. +```{note} XTTS has separate fine-tuning scripts, see [here](models/xtts.md#training). +``` + +A breakdown of a simple script that trains a GlowTTS model on the LJspeech +dataset. For a more in-depth guide to training and fine-tuning also see [this +page](training/index.md). ### Pure Python Way @@ -99,25 +115,3 @@ We still support running training from CLI like in the old days. The same traini ``` ❗️ Note that you can also use ```train_vocoder.py``` as the ```tts``` models above. - -## Synthesizing Speech - -You can run `tts` and synthesize speech directly on the terminal. 
- -```bash -$ tts -h # see the help -$ tts --list_models # list the available models. -``` - -![cli.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/tts_cli.gif) - - -You can call `tts-server` to start a local demo server that you can open on -your favorite web browser and 🗣️ (make sure to install the additional -dependencies with `pip install coqui-tts[server]`). - -```bash -$ tts-server -h # see the help -$ tts-server --list_models # list the available models. -``` -![server.gif](https://github.com/idiap/coqui-ai-TTS/raw/main/images/demo_server.gif) diff --git a/docs/source/vc.md b/docs/source/vc.md new file mode 100644 index 0000000000..8b45d9393a --- /dev/null +++ b/docs/source/vc.md @@ -0,0 +1,84 @@ +# Voice conversion + +## Overview + +Voice conversion (VC) converts the voice in a speech signal from one speaker to +that of another speaker while preserving the linguistic content. Coqui supports +both voice conversion on its own, as well as applying it after speech synthesis +to enable multi-speaker output with single-speaker TTS models. + +### Python API + +Converting the voice in `source_wav` to the voice of `target_wav` (the latter +can also be a list of files): + +```python +from TTS.api import TTS + +tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda") +tts.voice_conversion_to_file( + source_wav="my/source.wav", + target_wav="my/target.wav", + file_path="output.wav" +) +``` + +Voice cloning by combining TTS and VC. The FreeVC model is used for voice +conversion after synthesizing speech. + +```python + +tts = TTS("tts_models/de/thorsten/tacotron2-DDC") +tts.tts_with_vc_to_file( + "Wie sage ich auf Italienisch, dass ich dich liebe?", + speaker_wav=["target1.wav", "target2.wav"], + file_path="output.wav" +) +``` + +Some models, including [XTTS](models/xtts.md), support voice cloning directly +and a separate voice conversion step is not necessary: + +```python +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda") +wav = tts.tts( + text="Hello world!", + speaker_wav="my/cloning/audio.wav", + language="en" +) +``` + +### CLI + +```sh +tts --out_path output/path/speech.wav \ + --model_name "//" \ + --source_wav \ + --target_wav +``` + +## Pretrained models + +Coqui includes the following pretrained voice conversion models. Training is not +supported. + +### FreeVC + +- `voice_conversion_models/multilingual/vctk/freevc24` + +Adapted from: https://github.com/OlaWod/FreeVC + +### kNN-VC + +- `voice_conversion_models/multilingual/multi-dataset/knnvc` + +At least 1-5 minutes of target speaker data are recommended. 
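As a minimal sketch, kNN-VC is used through the same Python API as FreeVC above; the audio paths are placeholders, and a list of target files is passed to give the model more target speaker audio:

```python
from TTS.api import TTS

# kNN-VC benefits from more target audio; 1-5 minutes is recommended above.
tts = TTS("voice_conversion_models/multilingual/multi-dataset/knnvc").to("cuda")
tts.voice_conversion_to_file(
    source_wav="my/source.wav",
    target_wav=["target1.wav", "target2.wav", "target3.wav"],
    file_path="output.wav",
)
```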
+ +Adapted from: https://github.com/bshall/knn-vc + +### OpenVoice + +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +Adapted from: https://github.com/myshell-ai/OpenVoice diff --git a/hubconf.py b/hubconf.py index 6e10928265..b49c9d6bcc 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,4 +1,14 @@ -dependencies = ["torch", "gdown", "pysbd", "gruut", "anyascii", "pypinyin", "coqpit", "mecab-python3", "unidic-lite"] +dependencies = [ + "torch", + "gdown", + "pysbd", + "gruut", + "anyascii", + "pypinyin", + "coqpit-config", + "mecab-python3", + "unidic-lite", +] import torch from TTS.utils.manage import ModelManager @@ -39,5 +49,5 @@ def tts(model_name="tts_models/en/ljspeech/tacotron2-DCA", vocoder_name=None, us if __name__ == "__main__": - synthesizer = torch.hub.load("coqui-ai/TTS:dev", "tts", source="github") + synthesizer = torch.hub.load("idiap/coqui-ai-TTS:dev", "tts", source="github") synthesizer.tts("This is a test!") diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py index 4855886efd..44bf25c071 100644 --- a/notebooks/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -43,7 +43,7 @@ def process_meta_data(path): meta_data = {} # load meta data - with open(path, "r", encoding="utf-8") as f: + with open(path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") for row in data: frames = int(row[2]) @@ -58,7 +58,7 @@ def process_meta_data(path): "utt": utt, "frames": frames, "audio_len": audio_len, - "row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3]), + "row": f"{row[0]}|{row[1]}|{row[2]}|{row[3]}", } ) @@ -156,7 +156,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path): phonemes = {} - with open(train_path, "r", encoding="utf-8") as f: + with open(train_path, encoding="utf-8") as f: data = csv.reader(f, delimiter="|") phonemes["None"] = 0 for row in data: diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/plot_embeddings_umap.ipynb similarity index 56% rename from notebooks/PlotUmapLibriTTS.ipynb rename to notebooks/plot_embeddings_umap.ipynb index 1e29790b9e..b661f85673 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/plot_embeddings_umap.ipynb @@ -4,13 +4,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Overview\n", + "# Overview\n", "\n", "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", "\n", "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -19,63 +26,47 @@ "source": [ "import os\n", "import glob\n", + "import random\n", + "from collections import defaultdict\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", + "import torch\n", "import umap\n", "\n", - "from TTS.utils.audio import AudioProcessor\n", + "from TTS.bin.compute_embeddings import compute_embeddings\n", "from TTS.config import load_config\n", + "from TTS.config.shared_configs import BaseDatasetConfig\n", + "from TTS.tts.datasets import load_tts_samples\n", + "from TTS.utils.audio import AudioProcessor\n", "\n", "from bokeh.io import output_notebook, show\n", "from bokeh.plotting import figure\n", "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", "from bokeh.transform import factor_cmap\n", - "from bokeh.palettes import Category10" + "from bokeh.palettes import Category10\n", + "\n", + "output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", + "For larger sets of speakers, you can use `Category20`, but you need to change it in the `pal` variable too\n", "\n", - "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", + "List of Bokeh palettes here: https://docs.bokeh.org/en/latest/docs/reference/palettes.html\n", "\n", "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You should also adjust all the path constants to point at the relevant locations for you locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", - "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", - "\n", - "# My single speaker locations\n", - "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", - "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", + "## Config\n", "\n", - "# My multi speaker locations\n", - "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", - "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" + "You should adjust all the paths to point at the relevant locations for you locally." 
] }, { @@ -84,7 +75,16 @@ "metadata": {}, "outputs": [], "source": [ - "!ls -1 $MODEL_RUN_PATH" + "# Dataset\n", + "formatter_name = \"ljspeech\"\n", + "dataset_name = \"ljspeech\"\n", + "dataset_path = \"path/to/LJSpeech-1.1\"\n", + "meta_file_train = \"metadata.csv\"\n", + "\n", + "# Speaker encoder\n", + "se_model_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\"\n", + "se_config_path = \"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\"\n", + "embedding_path = \"speakers.pth\"" ] }, { @@ -93,15 +93,25 @@ "metadata": {}, "outputs": [], "source": [ - "CONFIG = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**CONFIG['audio'])" + "dataset_config = BaseDatasetConfig()\n", + "dataset_config.formatter = formatter_name\n", + "dataset_config.dataset_name = dataset_name\n", + "dataset_config.path = dataset_path\n", + "dataset_config.meta_file_train = meta_file_train\n", + "\n", + "meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=False)\n", + "utt_to_wav = {\n", + " item[\"audio_unique_name\"]: str(Path(item[\"audio_file\"]).relative_to(dataset_path)) for item in meta_data_train\n", + "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bring in the embeddings created by **compute_embeddings.py**" + "## Compute embeddings\n", + "\n", + "You can skip this if you have already computed embeddings with `TTS/bin/compute_embeddings.py`" ] }, { @@ -110,33 +120,38 @@ "metadata": {}, "outputs": [], "source": [ - "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", - "print(f'Embeddings found: {len(embed_files)}')" + "compute_embeddings(\n", + " model_path=se_model_path,\n", + " config_path=se_config_path,\n", + " output_path=embedding_path,\n", + " formatter_name=formatter_name,\n", + " dataset_name=dataset_name,\n", + " dataset_path=dataset_path,\n", + " meta_file_train=meta_file_train,\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Check that we did indeed find an embedding" + "## Plot Umap" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "embed_files[0]" + "Bring in the embeddings created by `TTS/bin/compute_embeddings.py`" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Process the speakers\n", - "\n", - "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" + "embeddings = torch.load(embedding_path, weights_only=True)" ] }, { @@ -145,15 +160,13 @@ "metadata": {}, "outputs": [], "source": [ - "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", - "speaker_to_utter = {}\n", - "for embed_file in embed_files:\n", - " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", - " try:\n", - " speaker_to_utter[speaker_path].append(embed_file)\n", - " except:\n", - " speaker_to_utter[speaker_path]=[embed_file]\n", - "print(f'Speaker count: {len(speaker_paths)}')" + "speakers = set()\n", + "speaker_to_utter = defaultdict(list)\n", + "for idx, embedding in embeddings.items():\n", + " speaker = embedding[\"name\"]\n", + " speakers.add(speaker)\n", + " speaker_to_utter[speaker].append(idx)\n", + "print(f\"Speaker count: {len(speakers)}\")" ] }, 
{ @@ -175,35 +188,32 @@ "labels = []\n", "locations = []\n", "\n", - "# single speaker \n", - "#num_speakers = 1\n", - "#num_utters = 1000\n", + "# single speaker\n", + "num_speakers = 1\n", + "num_utters = 1000\n", "\n", "# multi speaker\n", - "num_speakers = 10\n", - "num_utters = 20\n", + "# num_speakers = 10\n", + "# num_utters = 20\n", "\n", - "\n", - "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", + "speaker_idxs = random.sample(list(speakers), num_speakers)\n", "\n", "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", - " speaker_path = speaker_paths[speaker_idx]\n", - " speakers_utter = speaker_to_utter[speaker_path]\n", - " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", + " speakers_utter = speaker_to_utter[speaker_idx]\n", + " utter_idxs = random.sample(speakers_utter, num_utters)\n", " for utter_idx in utter_idxs:\n", - " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", - " embed = np.load(embed_path)\n", - " embeds.append(embed)\n", - " labels.append(str(speaker_num))\n", - " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", - "embeds = np.concatenate(embeds)" + " embed = np.array(embeddings[utter_idx][\"embedding\"])\n", + " embeds.append(embed)\n", + " labels.append(speaker_idx)\n", + " locations.append(utt_to_wav[utter_idx])\n", + "embeds = np.stack(embeds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Load embeddings with UMAP" + "### Load embeddings with UMAP" ] }, { @@ -222,9 +232,7 @@ "source": [ "### Interactively charting the data in Bokeh\n", "\n", - "Set up various details for Bokeh to plot the data\n", - "\n", - "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n", + "You can use the regular Bokeh [tools](https://docs.bokeh.org/en/latest/docs/user_guide/interaction/tools.html) to explore the data, with reset setting it back to normal\n", "\n", "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", "\n", @@ -238,22 +246,17 @@ "outputs": [], "source": [ "source_wav_stems = ColumnDataSource(\n", - " data=dict(\n", - " x = projection.T[0].tolist(),\n", - " y = projection.T[1].tolist(),\n", - " desc=locations,\n", - " label=labels\n", - " )\n", + " data=dict(\n", + " x=projection.T[0].tolist(),\n", + " y=projection.T[1].tolist(),\n", + " desc=locations,\n", + " label=labels,\n", " )\n", + ")\n", "\n", - "hover = HoverTool(\n", - " tooltips=[\n", - " (\"file\", \"@desc\"),\n", - " (\"speaker\", \"@label\"),\n", - " ]\n", - " )\n", + "hover = HoverTool(tooltips=[(\"file\", \"@desc\"), (\"speaker\", \"@label\")])\n", "\n", - "# optionally consider adding these to the tooltips if you want additional detail\n", + "### Optionally consider adding these to the tooltips if you want additional detail\n", "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", "\n", @@ -261,10 +264,13 @@ "pal_size = max(len(factors), 3)\n", "pal = Category10[pal_size]\n", "\n", - "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", - "\n", - "\n", - "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", + "p = figure(width=600, 
height=400, tools=[hover, BoxZoomTool(), ResetTool(), TapTool()])\n", + "p.scatter(\n", + " \"x\",\n", + " \"y\",\n", + " source=source_wav_stems,\n", + " color=factor_cmap(\"label\", palette=pal, factors=factors),\n", + ")\n", "\n", "url = \"http://localhost:8000/@desc\"\n", "taptool = p.select(type=TapTool)\n", @@ -292,7 +298,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd $AUDIO_PATH\n", + "%cd $dataset_path\n", "%pwd\n", "!python -m http.server" ] @@ -300,7 +306,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -314,7 +320,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index d66f33d602..821ddc78d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,10 @@ build-backend = "hatchling.build" [project] name = "coqui-tts" -version = "0.24.3" +version = "0.25.3" description = "Deep learning for Text to Speech." readme = "README.md" -requires-python = ">=3.9, <3.13" +requires-python = ">=3.10, <3.13" license = {text = "MPL-2.0"} authors = [ {name = "Eren Gölge", email = "egolge@coqui.ai"} @@ -39,7 +39,6 @@ maintainers = [ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -70,30 +69,31 @@ dependencies = [ "pyyaml>=6.0", "fsspec[http]>=2023.6.0", "packaging>=23.1", + "typing_extensions>=4.10", # Inference "pysbd>=0.3.4", # Training "matplotlib>=3.7.0", # Coqui stack - "coqui-tts-trainer>=0.1.4,<0.2.0", - "coqpit>=0.0.16", + "coqui-tts-trainer>=0.2.0,<0.3.0", + "coqpit-config>=0.2.0,<0.3.0", "monotonic-alignment-search>=0.1.0", # Gruut + supported languages "gruut[de,es,fr]>=2.4.0", # Tortoise "einops>=0.6.0", - "transformers>=4.43.0,<=4.46.2", + "transformers>=4.47.0", # Bark "encodec>=0.1.1", # XTTS - "num2words>=0.5.11", - "spacy[ja]>=3,<3.8", + "num2words>=0.5.14", + "spacy[ja]>=3.2,<3.8", ] [project.optional-dependencies] # Only used in notebooks notebooks = [ - "bokeh==1.4.0", + "bokeh>=3.0.3", "pandas>=1.4,<2.0", "umap-learn>=0.5.1", ] @@ -115,7 +115,7 @@ ko = [ ] # Japanese ja = [ - "mecab-python3>=1.0.2", + "mecab-python3>=1.0.6", "unidic-lite==1.0.8", "cutlet>=0.2.0", ] @@ -135,20 +135,19 @@ all = [ [dependency-groups] dev = [ - "black==24.2.0", "coverage[toml]>=7", - "nose2>=0.15", - "pre-commit>=3", - "ruff==0.7.0", + "pre-commit>=4", + "pytest>=8", + "ruff==0.9.1", ] # Dependencies for building the documentation docs = [ - "furo>=2023.5.20", - "myst-parser==2.0.0", - "sphinx==7.2.5", + "furo>=2024.8.6", + "myst-parser==3.0.1", + "sphinx==7.4.7", "sphinx_inline_tabs>=2023.4.21", - "sphinx_copybutton>=0.1", - "linkify-it-py>=2.0.0", + "sphinx_copybutton>=0.5.2", + "linkify-it-py>=2.0.3", ] [project.urls] @@ -173,7 +172,6 @@ exclude = [ "/.readthedocs.yml", "/Makefile", "/dockerfiles", - "/run_bash_tests.sh", "/scripts", "/tests", ] @@ -192,6 +190,7 @@ lint.extend-select = [ "F704", # yield-outside-function "F706", # return-outside-function "F841", # unused-variable + "G004", # no f-string in logging "I", # import sorting "PIE790", # unnecessary-pass "PLC", @@ -201,6 +200,7 @@ lint.extend-select = [ "PLR0911", # too-many-return-statements "PLR1711", # useless-return "PLW", + "UP", # pyupgrade "W291", # 
trailing-whitespace "NPY201", # NumPy 2.0 deprecation ] @@ -231,14 +231,10 @@ max-returns = 7 "E402", # module level import not at top of file ] -[tool.black] -line-length = 120 -target-version = ['py39'] +[tool.coverage.report] +skip_covered = true +skip_empty = true [tool.coverage.run] parallel = true source = ["TTS"] - -[tool.cibuildwheel] -build = "cp*" -skip = "*-win32 *i686 *musllinux*" diff --git a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py index d31ec8f1ed..a077a18064 100644 --- a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -4,7 +4,8 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig from TTS.utils.manage import ModelManager # Logging parameters diff --git a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py index ccaa97f1e4..362f45008e 100644 --- a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py @@ -4,7 +4,8 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig from TTS.utils.manage import ModelManager # Logging parameters diff --git a/run_bash_tests.sh b/run_bash_tests.sh deleted file mode 100755 index 2f5ba88934..0000000000 --- a/run_bash_tests.sh +++ /dev/null @@ -1,7 +0,0 @@ -set -e -TF_CPP_MIN_LOG_LEVEL=3 - -# runtime bash based tests -# TODO: move these to python -./tests/bash_tests/test_demo_server.sh && \ -./tests/bash_tests/test_compute_statistics.sh diff --git a/scripts/sync_readme.py b/scripts/sync_readme.py index 584286814b..97256bca6d 100644 --- a/scripts/sync_readme.py +++ b/scripts/sync_readme.py @@ -22,8 +22,12 @@ def sync_readme(): new_content = replace_between_markers(orig_content, "tts-readme", description.strip()) if args.check: if orig_content != new_content: - print("README.md is out of sync; please edit TTS/bin/TTS_README.md and run scripts/sync_readme.py") + print( + "README.md is out of sync; please reconcile README.md and TTS/bin/synthesize.py and run scripts/sync_readme.py" + ) exit(42) + print("All good, files in sync") + exit(0) readme_path.write_text(new_content) print("Updated README.md") diff --git a/tests/__init__.py b/tests/__init__.py index f0a8b2f118..0ee20a92df 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,8 @@ import os +from collections.abc import Callable +from typing import Optional +import pytest from trainer.generic_utils import get_cuda from TTS.config import BaseDatasetConfig @@ -39,9 +42,10 @@ def get_tests_output_path(): return path -def run_cli(command): - exit_status = os.system(command) - assert exit_status == 0, f" [!] command `{command}` failed." 
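# run_main replaces the old run_cli helper: instead of shelling out, it calls a
# CLI entry point directly with an argv list and asserts on the SystemExit code
# it raises, e.g. run_main(main, ["--config_path", str(config_path)]).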
+def run_main(main_func: Callable, args: list[str] | None = None, expected_code: int = 0): + with pytest.raises(SystemExit) as exc_info: + main_func(args) + assert exc_info.value.code == expected_code def get_test_data_config(): diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index 5b1fa9d38a..6caf6db30d 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -1,190 +1,194 @@ import os -import unittest -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +import pytest + +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio.processor import AudioProcessor -TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1) -# pylint: disable=protected-access -class TestAudio(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.ap = AudioProcessor(**conf) - - def test_audio_synthesis(self): - """1. load wav - 2. set normalization parameters - 3. extract mel-spec - 4. invert to wav and save the output - """ - print(" > Sanity check for the process wav -> mel -> wav") - - def _test(max_norm, signal_norm, symmetric_norm, clip_norm): - self.ap.max_norm = max_norm - self.ap.signal_norm = signal_norm - self.ap.symmetric_norm = symmetric_norm - self.ap.clip_norm = clip_norm - wav = self.ap.load_wav(WAV_FILE) - mel = self.ap.melspectrogram(wav) - wav_ = self.ap.inv_melspectrogram(mel) - file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format( - max_norm, signal_norm, symmetric_norm, clip_norm - ) - print(" | > Creating wav file at : ", file_name) - self.ap.save_wav(wav_, OUT_PATH + file_name) - - # maxnorm = 1.0 - _test(1.0, False, False, False) - _test(1.0, True, False, False) - _test(1.0, True, True, False) - _test(1.0, True, False, True) - _test(1.0, True, True, True) - # maxnorm = 4.0 - _test(4.0, False, False, False) - _test(4.0, True, False, False) - _test(4.0, True, True, False) - _test(4.0, True, False, True) - _test(4.0, True, True, True) - - def test_normalize(self): - """Check normalization and denormalization for range values and consistency""" - print(" > Testing normalization and denormalization.") - wav = self.ap.load_wav(WAV_FILE) - wav = self.ap.sound_norm(wav) # normalize audio to get abetter normalization range below. - self.ap.signal_norm = False - x = self.ap.melspectrogram(wav) - x_old = x - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= 0 - 1, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = False - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.clip_norm = True - self.ap.max_norm = 4.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - # check value range - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() <= 0, x_norm.min() - # check denorm. 
- x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3, (x - x_).mean() - - self.ap.signal_norm = True - self.ap.symmetric_norm = False - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - self.ap.signal_norm = True - self.ap.symmetric_norm = True - self.ap.max_norm = 1.0 - x_norm = self.ap.normalize(x) - print( - f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" - ) - - assert (x_old - x).sum() == 0 - assert x_norm.max() <= self.ap.max_norm, x_norm.max() - assert x_norm.min() >= -self.ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type - assert x_norm.min() < 0, x_norm.min() - x_ = self.ap.denormalize(x_norm) - assert (x - x_).sum() < 1e-3 - - def test_scaler(self): - scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") - conf.stats_path = scaler_stats_path - conf.preemphasis = 0.0 - conf.do_trim_silence = True - conf.signal_norm = True - - ap = AudioProcessor(**conf) - mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) - ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) - - self.ap.signal_norm = False - self.ap.preemphasis = 0.0 - - # test scaler forward and backward transforms - wav = self.ap.load_wav(WAV_FILE) - mel_reference = self.ap.melspectrogram(wav) - mel_norm = ap.melspectrogram(wav) - mel_denorm = ap.denormalize(mel_norm) - assert abs(mel_reference - mel_denorm).max() < 1e-4 - - def test_compute_f0(self): # pylint: disable=no-self-use - ap = AudioProcessor(**conf) - wav = ap.load_wav(WAV_FILE) - pitch = ap.compute_f0(wav) - mel = ap.melspectrogram(wav) - assert pitch.shape[0] == mel.shape[1] +@pytest.fixture +def ap(): + """Set up audio processor.""" + return AudioProcessor(**conf) + + +norms = [ + # maxnorm = 1.0 + (1.0, False, False, False), + (1.0, True, False, False), + (1.0, True, True, False), + (1.0, True, False, True), + (1.0, True, True, True), + # maxnorm = 4.0 + (4.0, False, False, False), + (4.0, True, False, False), + (4.0, True, True, False), + (4.0, True, False, True), + (4.0, True, True, True), +] + + +@pytest.mark.parametrize("norms", norms) +def test_audio_synthesis(tmp_path, ap, norms): + """1. load wav + 2. set normalization parameters + 3. extract mel-spec + 4. 
invert to wav and save the output + """ + print(" > Sanity check for the process wav -> mel -> wav") + max_norm, signal_norm, symmetric_norm, clip_norm = norms + ap.max_norm = max_norm + ap.signal_norm = signal_norm + ap.symmetric_norm = symmetric_norm + ap.clip_norm = clip_norm + wav = ap.load_wav(WAV_FILE) + mel = ap.melspectrogram(wav) + wav_ = ap.inv_melspectrogram(mel) + file_name = ( + f"audio_test-melspec_max_norm_{max_norm}-signal_norm_{signal_norm}-" + f"symmetric_{symmetric_norm}-clip_norm_{clip_norm}.wav" + ) + print(" | > Creating wav file at : ", file_name) + ap.save_wav(wav_, tmp_path / file_name) + + +def test_normalize(ap): + """Check normalization and denormalization for range values and consistency""" + print(" > Testing normalization and denormalization.") + wav = ap.load_wav(WAV_FILE) + wav = ap.sound_norm(wav) # normalize audio to get abetter normalization range below. + ap.signal_norm = False + x = ap.melspectrogram(wav) + x_old = x + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= 0 - 1, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = False + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm + 1, x_norm.max() + assert x_norm.min() >= -ap.max_norm - 2, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = True + ap.clip_norm = True + ap.max_norm = 4.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + # check value range + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() <= 0, x_norm.min() + # check denorm. 
+ x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3, (x - x_).mean() + + ap.signal_norm = True + ap.symmetric_norm = False + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + ap.signal_norm = True + ap.symmetric_norm = True + ap.max_norm = 1.0 + x_norm = ap.normalize(x) + print( + f" > MaxNorm: {ap.max_norm}, ClipNorm:{ap.clip_norm}, SymmetricNorm:{ap.symmetric_norm}, SignalNorm:{ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}" + ) + + assert (x_old - x).sum() == 0 + assert x_norm.max() <= ap.max_norm, x_norm.max() + assert x_norm.min() >= -ap.max_norm, x_norm.min() # pylint: disable=invalid-unary-operand-type + assert x_norm.min() < 0, x_norm.min() + x_ = ap.denormalize(x_norm) + assert (x - x_).sum() < 1e-3 + + +def test_scaler(ap): + scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy") + conf.stats_path = scaler_stats_path + conf.preemphasis = 0.0 + conf.do_trim_silence = True + conf.signal_norm = True + + ap = AudioProcessor(**conf) + mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) + ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + + ap.signal_norm = False + ap.preemphasis = 0.0 + + # test scaler forward and backward transforms + wav = ap.load_wav(WAV_FILE) + mel_reference = ap.melspectrogram(wav) + mel_norm = ap.melspectrogram(wav) + mel_denorm = ap.denormalize(mel_norm) + assert abs(mel_reference - mel_denorm).max() < 1e-4 + + +def test_compute_f0(ap): + wav = ap.load_wav(WAV_FILE) + pitch = ap.compute_f0(wav) + mel = ap.melspectrogram(wav) + assert pitch.shape[0] == mel.shape[1] diff --git a/tests/aux_tests/test_compute_statistics.py b/tests/aux_tests/test_compute_statistics.py new file mode 100644 index 0000000000..d6809eb480 --- /dev/null +++ b/tests/aux_tests/test_compute_statistics.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from tests import get_tests_input_path, run_main +from TTS.bin.compute_statistics import main + + +def test_compute_statistics(tmp_path): + config_path = Path(get_tests_input_path()) / "test_glow_tts_config.json" + output_path = tmp_path / "scale_stats.npy" + run_main(main, ["--config_path", str(config_path), "--out_path", str(output_path)]) diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index f2d119ac35..563c5dae02 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -1,67 +1,23 @@ -import os -import unittest +from pathlib import Path +import pytest import torch -from tests import get_tests_input_path, get_tests_output_path, run_cli +from tests import get_tests_input_path, run_main +from TTS.bin.extract_tts_spectrograms import main from TTS.config import load_config from TTS.tts.models import setup_model torch.manual_seed(1) -# pylint: disable=protected-access -class TestExtractTTSSpectrograms(unittest.TestCase): - @staticmethod - def test_GlowTTS(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth") - output_path = os.path.join(get_tests_output_path(), 
"output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') +@pytest.mark.parametrize("model", ["glow_tts", "tacotron", "tacotron2"]) +def test_extract_tts_spectrograms(tmp_path, model): + config_path = str(Path(get_tests_input_path()) / f"test_{model}_config.json") + checkpoint_path = str(tmp_path / f"{model}.pth") + output_path = str(tmp_path / "output_extract_tts_spectrograms") - @staticmethod - def test_Tacotron2(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') - - @staticmethod - def test_Tacotron(): - # set paths - config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth") - output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") - # load config - c = load_config(config_path) - # create model - model = setup_model(c) - # save model - torch.save({"model": model.state_dict()}, checkpoint_path) - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/extract_tts_spectrograms.py --config_path "{config_path}" --checkpoint_path "{checkpoint_path}" --output_path "{output_path}"' - ) - run_cli(f'rm -rf "{output_path}" "{checkpoint_path}"') + config = load_config(config_path) + model = setup_model(config) + torch.save({"model": model.state_dict()}, checkpoint_path) + run_main(main, ["--config_path", config_path, "--checkpoint_path", checkpoint_path, "--output_path", output_path]) diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 018679f573..53298cdebd 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -1,16 +1,12 @@ -import os -import unittest - import torch -from tests import get_tests_output_path, run_cli +from tests import run_main +from TTS.bin.find_unique_phonemes import main from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig torch.manual_seed(1) -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") - dataset_config_en = BaseDatasetConfig( formatter="ljspeech", meta_file_train="metadata.csv", @@ -30,52 +26,26 @@ """ -# pylint: disable=protected-access -class TestFindUniquePhonemes(unittest.TestCase): - @staticmethod - def test_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - 
phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') - - @staticmethod - def test_no_espeak_phonemes(): - # prepare the config - config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - datasets=[dataset_config_en], - ) - config.save_json(config_path) - - # run test - run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') +def test_find_phonemes(tmp_path): + # prepare the config + config_path = str(tmp_path / "test_model_config.json") + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en], + ) + config.save_json(config_path) + + # run test + run_main(main, ["--config_path", config_path]) diff --git a/tests/tts_tests/test_helpers.py b/tests/aux_tests/test_helpers.py similarity index 76% rename from tests/tts_tests/test_helpers.py rename to tests/aux_tests/test_helpers.py index d07efa3620..6781cbc5d4 100644 --- a/tests/tts_tests/test_helpers.py +++ b/tests/aux_tests/test_helpers.py @@ -1,6 +1,14 @@ import torch as T -from TTS.tts.utils.helpers import average_over_durations, generate_path, rand_segments, segment, sequence_mask +from TTS.tts.utils.helpers import ( + average_over_durations, + expand_encoder_outputs, + generate_attention, + generate_path, + rand_segments, + segment, + sequence_mask, +) def test_average_over_durations(): # pylint: disable=no-self-use @@ -86,3 +94,24 @@ def test_generate_path(): assert all(path[b, t, :current_idx] == 0.0) assert all(path[b, t, current_idx + durations[b, t].item() :] == 0.0) current_idx += durations[b, t].item() + + assert T.all(path == generate_attention(durations, x_mask, y_mask)) + assert T.all(path == generate_attention(durations, x_mask)) + + +def test_expand_encoder_outputs(): + inputs = T.rand(2, 5, 57) + durations = T.randint(1, 4, (2, 57)) + + x_mask = T.ones(2, 1, 57) + y_lengths = T.ones(2) * durations.sum(1).max() + + expanded, _, _ = expand_encoder_outputs(inputs, durations, x_mask, y_lengths) + + for b in range(durations.shape[0]): + index = 0 + for idx, dur in enumerate(durations[b]): + idx_expanded = expanded[b, :, index : index + dur.item()] + diff = (idx_expanded - inputs[b, :, idx].repeat(int(dur)).view(idx_expanded.shape)).sum() + assert abs(diff) < 1e-6, diff + index += dur diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py index 00597a0f88..129ba5d86b 100644 --- a/tests/aux_tests/test_numpy_transforms.py +++ b/tests/aux_tests/test_numpy_transforms.py @@ -7,18 +7,12 @@ import numpy as np from coqpit import Coqpit -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path, 
get_tests_path from TTS.utils.audio import numpy_transforms as np_transforms TESTS_PATH = get_tests_path() -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -os.makedirs(OUT_PATH, exist_ok=True) - - -# pylint: disable=no-self-use - class TestNumpyTransforms(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/aux_tests/test_server.py b/tests/aux_tests/test_server.py new file mode 100644 index 0000000000..1b691f9596 --- /dev/null +++ b/tests/aux_tests/test_server.py @@ -0,0 +1,47 @@ +import os +import signal +import socket +import subprocess +import time +import wave + +import pytest +import requests + +PORT = 5003 + + +def wait_for_server(host, port, timeout=30): + start_time = time.time() + while time.time() - start_time < timeout: + try: + with socket.create_connection((host, port), timeout=2): + return True + except (OSError, ConnectionRefusedError): + time.sleep(1) + raise TimeoutError(f"Server at {host}:{port} did not start within {timeout} seconds.") + + +@pytest.fixture(scope="module", autouse=True) +def start_flask_server(): + server_process = subprocess.Popen( + ["python", "-m", "TTS.server.server", "--port", str(PORT)], + ) + wait_for_server("localhost", PORT) + yield + os.kill(server_process.pid, signal.SIGTERM) + server_process.wait() + + +def test_flask_server(tmp_path): + url = f"http://localhost:{PORT}/api/tts?text=synthesis%20schmynthesis" + response = requests.get(url) + assert response.status_code == 200, f"Request failed with status code {response.status_code}" + + wav_path = tmp_path / "output.wav" + with wav_path.open("wb") as f: + f.write(response.content) + + with wave.open(str(wav_path), "rb") as wav_file: + num_frames = wav_file.getnframes() + assert num_frames > 0, "WAV file contains no frames." 
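For readers who want to poke at the same endpoint outside of pytest, here is a minimal sketch of the request/response flow the new server test exercises. It assumes a demo server is already running locally (the old bash test relied on the default port 5002); the helper name fetch_tts_wav is purely illustrative and not part of the codebase.

import wave

import requests


def fetch_tts_wav(text: str, host: str = "localhost", port: int = 5002) -> bytes:
    """Request synthesized audio from the demo server and return the raw WAV bytes."""
    response = requests.get(f"http://{host}:{port}/api/tts", params={"text": text}, timeout=60)
    response.raise_for_status()
    return response.content


if __name__ == "__main__":
    audio = fetch_tts_wav("synthesis schmynthesis")
    with open("output.wav", "wb") as f:
        f.write(audio)
    # Sanity check: a valid, non-empty WAV file should contain at least one frame.
    with wave.open("output.wav", "rb") as wav_file:
        print("frames:", wav_file.getnframes())

The pytest fixture above automates the same flow, but with an explicit readiness poll instead of a fixed sleep, which is what made the old bash version flaky.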
diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py deleted file mode 100644 index 5d8626faa6..0000000000 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ /dev/null @@ -1,88 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig - - -def run_test_train(): - command = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - ) - run_cli(command) - - -config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = SpeakerEncoderConfig( - batch_size=4, - num_classes_in_batch=4, - num_utter_per_class=2, - eval_num_classes_in_batch=4, - eval_num_utter_per_class=2, - num_loader_workers=1, - epochs=1, - print_step=1, - save_step=2, - print_eval=True, - run_eval=True, - audio=BaseAudioConfig(num_mels=80), -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.loss = "ge2e" -config.save_json(config_path) - -print(config) -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test resnet speaker encoder -config.model_params["model_name"] = "resnet" -config.save_json(config_path) - -# train the model for one epoch -run_test_train() - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) - -# test model with ge2e loss function -# config.loss = "ge2e" -# config.save_json(config_path) -# run_test_train() - -# test model with angleproto loss function -# config.loss = "angleproto" -# config.save_json(config_path) -# run_test_train() - -# test model with softmaxproto loss function -config.loss = "softmaxproto" -config.save_json(config_path) -run_test_train() diff --git a/tests/aux_tests/test_stft_torch.py b/tests/aux_tests/test_stft_torch.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/aux_tests/test_torch_transforms.py b/tests/aux_tests/test_torch_transforms.py new file mode 100644 index 0000000000..2da5a359c1 --- /dev/null +++ b/tests/aux_tests/test_torch_transforms.py @@ -0,0 +1,16 @@ +import numpy as np +import torch + +from TTS.utils.audio import numpy_transforms as np_transforms +from TTS.utils.audio.torch_transforms import amp_to_db, db_to_amp + + +def test_amplitude_db_conversion(): + x = torch.rand(11) + o1 = amp_to_db(x=x, spec_gain=1.0) + o2 = db_to_amp(x=o1, spec_gain=1.0) + np_o1 = np_transforms.amp_to_db(x=x, base=np.e) + np_o2 = 
np_transforms.db_to_amp(x=np_o1, base=np.e) + assert torch.allclose(x, o2) + assert torch.allclose(o1, np_o1) + assert torch.allclose(o2, np_o2) diff --git a/tests/bash_tests/test_compute_statistics.sh b/tests/bash_tests/test_compute_statistics.sh deleted file mode 100755 index 721777f852..0000000000 --- a/tests/bash_tests/test_compute_statistics.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -xe -BASEDIR=$(dirname "$0") -echo "$BASEDIR" -# run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy diff --git a/tests/bash_tests/test_demo_server.sh b/tests/bash_tests/test_demo_server.sh deleted file mode 100755 index ebd0bc8b89..0000000000 --- a/tests/bash_tests/test_demo_server.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -xe - -python -m TTS.server.server & -SERVER_PID=$! - -echo 'Waiting for server...' -sleep 30 - -curl -o /tmp/audio.wav "http://localhost:5002/api/tts?text=synthesis%20schmynthesis" -python -c 'import sys; import wave; print(wave.open(sys.argv[1]).getnframes())' /tmp/audio.wav - -kill $SERVER_PID - -rm /tmp/audio.wav diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 252b429a16..975281c549 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -1,12 +1,12 @@ import os import shutil -import unittest import numpy as np +import pytest import torch from torch.utils.data import DataLoader -from tests import get_tests_data_path, get_tests_output_path +from tests import get_tests_data_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets.dataset import TTSDataset @@ -15,9 +15,6 @@ # pylint: disable=unused-variable -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - # create a dummy config for testing data loaders. 
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 @@ -47,210 +44,210 @@ dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] +ap = AudioProcessor(**c.audio) +max_loader_iter = 4 + DATA_EXIST = True if not os.path.exists(c.data_path): DATA_EXIST = False -print(" > Dynamic data loader test: {}".format(DATA_EXIST)) - - -class TestTTSDataset(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.max_loader_iter = 4 - self.ap = AudioProcessor(**c.audio) - - def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): - # load dataset - meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) - items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) - dataset = TTSDataset( - outputs_per_step=r, - compute_linear_spec=True, - return_wav=True, - tokenizer=tokenizer, - ap=self.ap, - samples=items, - batch_group_size=bgs, - min_text_len=c.min_text_len, - max_text_len=c.max_text_len, - min_audio_len=c.min_audio_len, - max_audio_len=c.max_audio_len, - start_by_longest=start_by_longest, - ) - - # add preprocess to force the length computation - if preprocess_samples: - dataset.preprocess_samples() - - dataloader = DataLoader( - dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=True, - num_workers=c.num_loader_workers, - ) - return dataloader, dataset - - def test_loader(self): - for dataset_config in dataset_configs: - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - text_input = data["token_id"] - _ = data["token_id_lengths"] - speaker_name = data["speaker_names"] - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - _ = data["stop_targets"] - _ = data["item_idxs"] - wavs = data["waveform"] - - neg_values = text_input[text_input < 0] - check_count = len(neg_values) - - # check basic conditions - self.assertEqual(check_count, 0) - self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size) - self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1) - self.assertEqual(mel_input.shape[2], c.audio["num_mels"]) - self.assertEqual(wavs.shape[1], mel_input.shape[1] * c.audio.hop_length) - self.assertIsInstance(speaker_name[0], str) - - # make sure that the computed mels and the waveform match and correctly computed - mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding - mel_new = mel_new[:, : mel_lengths[0]] - ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) - mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg] - self.assertLess(abs(mel_diff.sum()), 1e-5) - - # check normalization ranges - if self.ap.symmetric_norm: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual( - mel_input.min(), -self.ap.max_norm # pylint: disable=invalid-unary-operand-type - ) - self.assertLess(mel_input.min(), 0) - else: - self.assertLessEqual(mel_input.max(), self.ap.max_norm) - self.assertGreaterEqual(mel_input.min(), 0) - - def test_batch_group_shuffle(self): - dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) 
- last_length = 0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) - - def test_start_by_longest(self): - """Test start_by_longest option. - - Ther first item of the fist batch must be longer than all the other items. - """ - dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) - - def test_padding_and_spectrograms(self): - def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): - self.assertNotEqual(linear_input[idx, -1].sum(), 0) # check padding - self.assertNotEqual(linear_input[idx, -2].sum(), 0) - self.assertNotEqual(mel_input[idx, -1].sum(), 0) - self.assertNotEqual(mel_input[idx, -2].sum(), 0) - self.assertEqual(stop_target[idx, -1], 1) - self.assertEqual(stop_target[idx, -2], 0) - self.assertEqual(stop_target[idx].sum(), 1) - self.assertEqual(len(mel_lengths.shape), 1) - self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) - self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - - dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. 
- self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) - - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 +print(f" > Dynamic data loader test: {DATA_EXIST}") + + +def _create_dataloader(batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): + # load dataset + meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) + items = meta_data_train + meta_data_eval + tokenizer, _ = TTSTokenizer.init_from_config(c) + dataset = TTSDataset( + outputs_per_step=r, + compute_linear_spec=True, + return_wav=True, + tokenizer=tokenizer, + ap=ap, + samples=items, + batch_group_size=bgs, + min_text_len=c.min_text_len, + max_text_len=c.max_text_len, + min_audio_len=c.min_audio_len, + max_audio_len=c.max_audio_len, + start_by_longest=start_by_longest, + ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=True, + num_workers=c.num_loader_workers, + ) + return dataloader, dataset + + +@pytest.mark.parametrize("dataset_config", dataset_configs) +def test_loader(dataset_config: BaseDatasetConfig): + batch_size = 1 + dataloader, _ = _create_dataloader(batch_size, 1, 0, dataset_config, preprocess_samples=True) + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + text_input = data["token_id"] + _ = data["token_id_lengths"] + speaker_name = data["speaker_names"] + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + _ = data["stop_targets"] + _ = data["item_idxs"] + wavs = data["waveform"] + + neg_values = text_input[text_input < 0] + check_count = len(neg_values) + + # check basic conditions + assert check_count == 0 + assert linear_input.shape[0] == mel_input.shape[0] == 
batch_size
+    assert linear_input.shape[2] == ap.fft_size // 2 + 1
+    assert mel_input.shape[2] == c.audio["num_mels"]
+    assert wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length
+    assert isinstance(speaker_name[0], str)
+
+    # make sure that the computed mels and the waveform match and are correctly computed
+    mel_new = ap.melspectrogram(wavs[0].squeeze().numpy())
+    # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
+    mel_new = mel_new[:, : mel_lengths[0]]
+    ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
+    mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
+    assert abs(mel_diff.sum()) < 1e-5
+
+    # check normalization ranges
+    if ap.symmetric_norm:
+        assert mel_input.max() <= ap.max_norm
+        assert mel_input.min() >= -ap.max_norm
+        assert mel_input.min() < 0
+    else:
+        assert mel_input.max() <= ap.max_norm
+        assert mel_input.min() >= 0
+
+
+def test_batch_group_shuffle():
+    dataloader, dataset = _create_dataloader(2, c.r, 16, dataset_config_wav)
+    last_length = 0
+    frames = dataset.samples
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        avg_length = mel_lengths.numpy().mean()
+    dataloader.dataset.preprocess_samples()
+    is_items_reordered = False
+    for idx, item in enumerate(dataloader.dataset.samples):
+        if item != frames[idx]:
+            is_items_reordered = True
+            break
+    assert avg_length >= last_length
+    assert is_items_reordered
+
+
+def test_start_by_longest():
+    """Test start_by_longest option.
+
+    The first item of the first batch must be longer than all the other items.
+    """
+    dataloader, _ = _create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
+    dataloader.dataset.preprocess_samples()
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        mel_lengths = data["mel_lengths"]
+        if i == 0:
+            max_len = mel_lengths[0]
+        print(mel_lengths)
+        assert all(max_len >= mel_lengths)
+
+
+def test_padding_and_spectrograms(tmp_path):
+    def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
+        assert linear_input[idx, -1].sum() != 0  # check padding
+        assert linear_input[idx, -2].sum() != 0
+        assert mel_input[idx, -1].sum() != 0
+        assert mel_input[idx, -2].sum() != 0
+        assert stop_target[idx, -1] == 1
+        assert stop_target[idx, -2] == 0
+        assert stop_target[idx].sum() == 1
+        assert len(mel_lengths.shape) == 1
+        assert mel_lengths[idx] == linear_input[idx].shape[0]
+        assert mel_lengths[idx] == mel_input[idx].shape[0]
+
+    dataloader, _ = _create_dataloader(1, 1, 0, dataset_config_wav)
+
+    for i, data in enumerate(dataloader):
+        if i == max_loader_iter:
+            break
+        linear_input = data["linear"]
+        mel_input = data["mel"]
+        mel_lengths = data["mel_lengths"]
+        stop_target = data["stop_targets"]
+        item_idx = data["item_idxs"]
+
+        # check mel_spec consistency
+        wav = np.asarray(ap.load_wav(item_idx[0]), dtype=np.float32)
+        mel = ap.melspectrogram(wav).astype("float32")
+        mel = torch.FloatTensor(mel).contiguous()
+        mel_dl = mel_input[0]
+        # NOTE: The difference below should ideally be exactly 0, but for an unknown reason
+        # there is a slight mismatch between the two matrices.
+        # TODO: Check this assert condition in more detail.
+ assert abs(mel.T - mel_dl).max() < 1e-5 + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = ap.inv_melspectrogram(mel_spec.T) + ap.save_wav(wav, tmp_path / "mel_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = ap.inv_spectrogram(linear_spec.T) + ap.save_wav(wav, tmp_path / "linear_inv_dataloader.wav") + shutil.copy(item_idx[0], tmp_path / "linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = _create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + assert linear_input[1 - idx, -1].sum() == 0 + assert mel_input[1 - idx, -1].sum() == 0 + assert stop_target[1, mel_lengths[1] - 1] == 1 + assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1] + assert len(mel_lengths.shape) == 1 + + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 28a4088c96..beb7df689b 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -1,20 +1,17 @@ -import os +from tests import run_main +from TTS.bin.synthesize import main -from tests import get_tests_output_path, run_cli - -def test_synthesize(): +def test_synthesize(tmp_path): """Test synthesize.py with diffent arguments.""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli("tts --list_models") + output_path = str(tmp_path / "output.wav") + + run_main(main, ["--list_models"]) # single speaker model - run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"' - ) - run_cli( - "tts --model_name tts_models/en/ljspeech/glow-tts " - "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - f'--text "This is an example." 
--out_path "{output_path}"' - ) + args = ["--text", "This is an example.", "--out_path", output_path] + run_main(main, args) + + args = [*args, "--model_name", "tts_models/en/ljspeech/glow-tts"] + run_main(main, args) + run_main(main, [*args, "--vocoder_name", "vocoder_models/en/ljspeech/multiband-melgan"]) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index ce4fc751c2..21cc194131 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -23,7 +23,7 @@ def test_in_out(self): tts_root_path = get_tests_input_path() tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth") tts_config = os.path.join(tts_root_path, "dummy_model_config.json") - synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None) + synthesizer = Synthesizer(tts_checkpoint=tts_checkpoint, tts_config_path=tts_config) synthesizer.tts("Better this test works!!") def test_split_into_sentences(self): diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts_config.json similarity index 100% rename from tests/inputs/test_align_tts.json rename to tests/inputs/test_align_tts_config.json diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts_config.json similarity index 100% rename from tests/inputs/test_glow_tts.json rename to tests/inputs/test_glow_tts_config.json diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech_config.json similarity index 100% rename from tests/inputs/test_speedy_speech.json rename to tests/inputs/test_speedy_speech_config.json diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad_config.json similarity index 100% rename from tests/inputs/test_vocoder_wavegrad.json rename to tests/inputs/test_vocoder_wavegrad_config.json diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000000..bd872c5b44 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1,128 @@ +import json +import shutil +from pathlib import Path +from typing import Any, TypeVar, Union + +import torch +from trainer.io import get_last_checkpoint + +from tests import run_main +from TTS.bin.synthesize import main as synthesize +from TTS.bin.train_tts import main as train_tts +from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.vc.configs.shared_configs import BaseVCConfig + +TEST_TTS_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "text_cleaner": "english_cleaners", + "use_phonemes": True, + "phoneme_language": "en-us", + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "print_step": 1, + "print_eval": True, + "test_sentences": ["Be a voice, not an echo."], +} + +TEST_VC_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +Config = TypeVar("Config", BaseTTSConfig, BaseVCConfig) + + +def create_config(config_class: type[Config], **overrides: Any) -> Config: + base_config = TEST_TTS_CONFIG if issubclass(config_class, BaseTTSConfig) else TEST_VC_CONFIG + params = {**base_config, **overrides} + return config_class(**params) + + +def run_tts_train(tmp_path: Path, config: BaseTTSConfig): + config_path = tmp_path / "test_model_config.json" + 
output_path = tmp_path / "train_outputs" + + # For NeuralHMM and Overflow + parameter_path = tmp_path / "lj_parameters.pt" + torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + config.mel_statistics_parameter_path = parameter_path + + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + is_multi_speaker = config.use_speaker_embedding or config.use_d_vector_file + formatter = "ljspeech_test" if is_multi_speaker else "ljspeech" + command_train = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.phoneme_cache_path", + str(output_path / "phoneme_cache"), + "--coqpit.datasets.0.formatter", + formatter, + "--coqpit.datasets.0.meta_file_train", + "metadata.csv", + "--coqpit.datasets.0.meta_file_val", + "metadata.csv", + "--coqpit.datasets.0.path", + "tests/data/ljspeech", + "--coqpit.test_delay_epochs", + "0", + "--coqpit.datasets.0.meta_file_attn_mask", + "tests/data/ljspeech/metadata_attn_mask.txt", + ] + run_main(train_tts, command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + inference_command = [ + "--text", + "This is an example for the tests.", + "--config_path", + str(continue_config_path), + "--model_path", + str(continue_restore_path), + "--out_path", + str(out_wav_path), + ] + if config.use_speaker_embedding: + continue_speakers_path = continue_path / "speakers.json" + elif config.use_d_vector_file: + continue_speakers_path = config.d_vector_file + if is_multi_speaker: + inference_command.extend(["--speaker_idx", "ljspeech-1", "--speakers_file_path", str(continue_speakers_path)]) + run_main(synthesize, inference_command) + + # restore the model and continue training for one more epoch + run_main(train_tts, ["--continue_path", str(continue_path)]) + shutil.rmtree(tmp_path) diff --git a/tests/integration/test_speaker_encoder_train.py b/tests/integration/test_speaker_encoder_train.py new file mode 100644 index 0000000000..ce817680b7 --- /dev/null +++ b/tests/integration/test_speaker_encoder_train.py @@ -0,0 +1,87 @@ +import shutil + +from tests import run_main +from TTS.bin.train_encoder import main +from TTS.config.shared_configs import BaseAudioConfig +from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig + + +def test_train(tmp_path): + config_path = tmp_path / "test_speaker_encoder_config.json" + output_path = tmp_path / "train_outputs" + + def run_test_train(): + command = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.datasets.0.formatter", + "ljspeech_test", + "--coqpit.datasets.0.meta_file_train", + "metadata.csv", + "--coqpit.datasets.0.meta_file_val", + "metadata.csv", + "--coqpit.datasets.0.path", + "tests/data/ljspeech", + ] + run_main(main, command) + + config = SpeakerEncoderConfig( + batch_size=4, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + 
num_loader_workers=1, + epochs=1, + print_step=1, + save_step=2, + print_eval=True, + run_eval=True, + audio=BaseAudioConfig(num_mels=80), + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.loss = "ge2e" + config.save_json(config_path) + + print(config) + # train the model for one epoch + run_test_train() + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + run_main(main, ["--continue_path", str(continue_path)]) + shutil.rmtree(continue_path) + + # test resnet speaker encoder + config.model_params["model_name"] = "resnet" + config.save_json(config_path) + + # train the model for one epoch + run_test_train() + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # restore the model and continue training for one more epoch + run_main(main, ["--continue_path", str(continue_path)]) + shutil.rmtree(continue_path) + + # test model with ge2e loss function + # config.loss = "ge2e" + # config.save_json(config_path) + # run_test_train() + + # test model with angleproto loss function + # config.loss = "angleproto" + # config.save_json(config_path) + # run_test_train() + + # test model with softmaxproto loss function + config.loss = "softmaxproto" + config.save_json(config_path) + run_test_train() diff --git a/tests/integration/test_train_tts.py b/tests/integration/test_train_tts.py new file mode 100644 index 0000000000..d1e35ae450 --- /dev/null +++ b/tests/integration/test_train_tts.py @@ -0,0 +1,109 @@ +import pytest + +from tests.integration import create_config, run_tts_train +from TTS.tts.configs.align_tts_config import AlignTTSConfig +from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig +from TTS.tts.configs.fast_pitch_config import FastPitchConfig +from TTS.tts.configs.fastspeech2_config import Fastspeech2Config +from TTS.tts.configs.glow_tts_config import GlowTTSConfig +from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig +from TTS.tts.configs.overflow_config import OverflowConfig +from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig +from TTS.tts.configs.tacotron2_config import Tacotron2Config +from TTS.tts.configs.tacotron_config import TacotronConfig +from TTS.tts.configs.vits_config import VitsConfig + +SPEAKER_ARGS = ( + {}, + { + "use_d_vector_file": True, + "d_vector_file": "tests/data/ljspeech/speakers.json", + "d_vector_dim": 256, + }, + { + "use_speaker_embedding": True, + "num_speakers": 4, + }, +) +SPEAKER_ARG_IDS = ["single", "dvector", "speaker_emb"] + + +def test_train_align_tts(tmp_path): + config = create_config(AlignTTSConfig, use_phonemes=False) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_delightful_tts(tmp_path, speaker_args): + config = create_config( + DelightfulTTSConfig, + batch_size=2, + f0_cache_path=tmp_path / "f0_cache", # delightful f0 cache is incompatible with other models + binary_align_loss_alpha=0.0, + use_attn_priors=False, + **speaker_args, + ) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_fast_pitch(tmp_path, speaker_args): + config = create_config(FastPitchConfig, f0_cache_path="tests/data/ljspeech/f0_cache", **speaker_args) + config.audio.signal_norm = False + config.audio.mel_fmax = 8000 + config.audio.spec_gain = 1 + config.audio.log_func = 
"np.log" + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_fast_speech2(tmp_path, speaker_args): + config = create_config( + Fastspeech2Config, + f0_cache_path="tests/data/ljspeech/f0_cache", + energy_cache_path=tmp_path / "energy_cache", + **speaker_args, + ) + config.audio.signal_norm = False + config.audio.mel_fmax = 8000 + config.audio.spec_gain = 1 + config.audio.log_func = "np.log" + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_glow_tts(tmp_path, speaker_args): + config = create_config(GlowTTSConfig, batch_size=2, data_dep_init_steps=1, **speaker_args) + run_tts_train(tmp_path, config) + + +def test_train_neuralhmm(tmp_path): + config = create_config(NeuralhmmTTSConfig, batch_size=3, eval_batch_size=3, max_sampling_time=50) + run_tts_train(tmp_path, config) + + +def test_train_overflow(tmp_path): + config = create_config(OverflowConfig, batch_size=3, eval_batch_size=3, max_sampling_time=50) + run_tts_train(tmp_path, config) + + +def test_train_speedy_speech(tmp_path): + config = create_config(SpeedySpeechConfig) + run_tts_train(tmp_path, config) + + +def test_train_tacotron(tmp_path): + config = create_config(TacotronConfig, use_phonemes=False, r=5, max_decoder_steps=50) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_tacotron2(tmp_path, speaker_args): + config = create_config(Tacotron2Config, use_phonemes=False, r=5, max_decoder_steps=50, **speaker_args) + run_tts_train(tmp_path, config) + + +@pytest.mark.parametrize("speaker_args", SPEAKER_ARGS, ids=SPEAKER_ARG_IDS) +def test_train_vits(tmp_path, speaker_args): + config = create_config(VitsConfig, batch_size=2, eval_batch_size=2, **speaker_args) + run_tts_train(tmp_path, config) diff --git a/tests/integration/test_train_vocoder.py b/tests/integration/test_train_vocoder.py new file mode 100644 index 0000000000..8965de01ee --- /dev/null +++ b/tests/integration/test_train_vocoder.py @@ -0,0 +1,112 @@ +import glob +import os + +import pytest + +from tests import run_main +from TTS.bin.train_vocoder import main +from TTS.vocoder.configs import ( + FullbandMelganConfig, + HifiganConfig, + MelganConfig, + MultibandMelganConfig, + ParallelWaveganConfig, + WavegradConfig, + WavernnConfig, +) +from TTS.vocoder.models.wavernn import WavernnArgs + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + +BASE_CONFIG = { + "batch_size": 8, + "eval_batch_size": 8, + "num_loader_workers": 0, + "num_eval_loader_workers": 0, + "run_eval": True, + "test_delay_epochs": -1, + "epochs": 1, + "seq_len": 8192, + "eval_split_size": 1, + "print_step": 1, + "print_eval": True, + "data_path": "tests/data/ljspeech", +} + +DISCRIMINATOR_MODEL_PARAMS = { + "base_channels": 16, + "max_channels": 64, + "downsample_factors": [4, 4, 4], +} + + +def create_config(config_class, **overrides): + params = {**BASE_CONFIG, **overrides} + return config_class(**params) + + +def run_train(tmp_path, config): + config_path = str(tmp_path / "test_vocoder_config.json") + output_path = tmp_path / "train_outputs" + config.output_path = output_path + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # Train the model for one epoch + run_main(main, ["--config_path", config_path]) + + # Find the latest folder + continue_path = str(max(glob.glob(os.path.join(output_path, "*/")), 
key=os.path.getmtime)) + + # Restore the model and continue training for one more epoch + run_main(main, ["--continue_path", continue_path]) + + +def test_train_hifigan(tmp_path): + config = create_config(HifiganConfig, seq_len=1024) + run_train(tmp_path, config) + + +def test_train_melgan(tmp_path): + config = create_config( + MelganConfig, + batch_size=4, + eval_batch_size=4, + seq_len=2048, + discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS, + ) + run_train(tmp_path, config) + + +def test_train_multiband_melgan(tmp_path): + config = create_config( + MultibandMelganConfig, steps_to_start_discriminator=1, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS + ) + run_train(tmp_path, config) + + +def test_train_fullband_melgan(tmp_path): + config = create_config(FullbandMelganConfig, discriminator_model_params=DISCRIMINATOR_MODEL_PARAMS) + run_train(tmp_path, config) + + +def test_train_parallel_wavegan(tmp_path): + config = create_config(ParallelWaveganConfig, batch_size=4, eval_batch_size=4, seq_len=2048) + run_train(tmp_path, config) + + +# TODO: Reactivate after improving CI run times +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Takes ~2h on CI (15min/step vs 8sec/step locally)") +def test_train_wavegrad(tmp_path): + config = create_config(WavegradConfig, test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}) + run_train(tmp_path, config) + + +def test_train_wavernn(tmp_path): + config = create_config( + WavernnConfig, + model_args=WavernnArgs(), + seq_len=256, # For shorter test time + ) + run_train(tmp_path, config) diff --git a/tests/integration/test_vits_multilingual_speaker_emb_train.py b/tests/integration/test_vits_multilingual_speaker_emb_train.py new file mode 100644 index 0000000000..9b095935de --- /dev/null +++ b/tests/integration/test_vits_multilingual_speaker_emb_train.py @@ -0,0 +1,130 @@ +import json +import shutil + +from trainer.io import get_last_checkpoint + +from tests import run_main +from TTS.bin.synthesize import main as synthesize +from TTS.bin.train_tts import main as train_tts +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=output_path / "phoneme_cache", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + # active multispeaker mode + config.model_args.use_speaker_embedding = True + config.use_speaker_embedding = True + + # deactivate 
multispeaker d-vec mode + config.model_args.use_d_vector_file = False + config.use_d_vector_file = False + + # duration predictor + config.model_args.use_sdp = False + config.use_sdp = False + + # active language sampler + config.use_language_weighted_sampler = True + + config.save_json(config_path) + + # train the model for one epoch + command_train = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.test_delay_epochs", + "0", + ] + run_main(train_tts, command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech" + language_id = "en" + continue_speakers_path = continue_path / "speakers.json" + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = [ + "--text", + "This is an example for the tests.", + "--speaker_idx", + speaker_id, + "--language_idx", + language_id, + "--speakers_file_path", + str(continue_speakers_path), + "--language_ids_file_path", + str(continue_languages_path), + "--config_path", + str(continue_config_path), + "--model_path", + str(continue_restore_path), + "--out_path", + str(out_wav_path), + ] + run_main(synthesize, inference_command) + + # restore the model and continue training for one more epoch + run_main(train_tts, ["--continue_path", str(continue_path)]) + shutil.rmtree(tmp_path) diff --git a/tests/integration/test_vits_multilingual_train-d_vectors.py b/tests/integration/test_vits_multilingual_train-d_vectors.py new file mode 100644 index 0000000000..de0f6ed2b9 --- /dev/null +++ b/tests/integration/test_vits_multilingual_train-d_vectors.py @@ -0,0 +1,136 @@ +import json +import shutil + +from trainer.io import get_last_checkpoint + +from tests import run_main +from TTS.bin.synthesize import main as synthesize +from TTS.bin.train_tts import main as train_tts +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + + +def test_train(tmp_path): + config_path = tmp_path / "test_model_config.json" + output_path = tmp_path / "train_outputs" + + dataset_config_en = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", + ) + + dataset_config_pt = BaseDatasetConfig( + formatter="ljspeech_test", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", + ) + + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], + ], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], + ) + # set audio config + config.audio.do_trim_silence = True + 
config.audio.trim_db = 60 + + # active multilingual mode + config.model_args.use_language_embedding = True + config.use_language_embedding = True + + # deactivate multispeaker mode + config.model_args.use_speaker_embedding = False + config.use_speaker_embedding = False + + # active multispeaker d-vec mode + config.model_args.use_d_vector_file = True + config.use_d_vector_file = True + config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.d_vector_file = ["tests/data/ljspeech/speakers.json"] + config.model_args.d_vector_dim = 256 + config.d_vector_dim = 256 + + # duration predictor + config.model_args.use_sdp = True + config.use_sdp = True + + # activate language and speaker samplers + config.use_language_weighted_sampler = True + config.language_weighted_sampler_alpha = 10 + config.use_speaker_weighted_sampler = True + config.speaker_weighted_sampler_alpha = 5 + + config.save_json(config_path) + + # train the model for one epoch + command_train = [ + "--config_path", + str(config_path), + "--coqpit.output_path", + str(output_path), + "--coqpit.test_delay_epochs", + "0", + ] + run_main(train_tts, command_train) + + # Find latest folder + continue_path = max(output_path.iterdir(), key=lambda p: p.stat().st_mtime) + + # Inference using TTS API + continue_config_path = continue_path / "config.json" + continue_restore_path, _ = get_last_checkpoint(continue_path) + out_wav_path = tmp_path / "output.wav" + speaker_id = "ljspeech-1" + language_id = "en" + continue_speakers_path = config.d_vector_file + continue_languages_path = continue_path / "language_ids.json" + + # Check integrity of the config + with continue_config_path.open() as f: + config_loaded = json.load(f) + assert config_loaded["characters"] is not None + assert config_loaded["output_path"] in str(continue_path) + assert config_loaded["test_delay_epochs"] == 0 + + # Load the model and run inference + inference_command = [ + "--text", + "This is an example for the tests.", + "--speaker_idx", + speaker_id, + "--language_idx", + language_id, + "--speakers_file_path", + str(continue_speakers_path), + "--language_ids_file_path", + str(continue_languages_path), + "--config_path", + str(continue_config_path), + "--model_path", + str(continue_restore_path), + "--out_path", + str(out_wav_path), + ] + run_main(synthesize, inference_command) + + # restore the model and continue training for one more epoch + run_main(train_tts, ["--continue_path", str(continue_path)]) + shutil.rmtree(tmp_path) diff --git a/tests/integration/test_xtts_gpt_train.py b/tests/integration/test_xtts_gpt_train.py new file mode 100644 index 0000000000..4d22b8102f --- /dev/null +++ b/tests/integration/test_xtts_gpt_train.py @@ -0,0 +1,158 @@ +from pathlib import Path + +import pytest +import torch +from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.layers.xtts.dvae import DiscreteVAE +from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig +from TTS.tts.models.xtts import XttsAudioConfig + +config_dataset = BaseDatasetConfig( + formatter="ljspeech", + dataset_name="ljspeech", + path="tests/data/ljspeech/", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + language="en", +) + +DATASETS_CONFIG_LIST = [config_dataset] + +# Logging parameters +RUN_NAME = "GPT_XTTS_LJSpeech_FT" +PROJECT_NAME = "XTTS_trainer" +DASHBOARD_LOGGER = "tensorboard" +LOGGER_URI = None + +# XTTS transfer learning 
parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune. +TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file +XTTS_CHECKPOINT = None # model.pth file + +# Training sentence generation +SPEAKER_REFERENCE = [ + "tests/data/ljspeech/wavs/LJ001-0002.wav" +] # speaker reference to be used in training test sentences +LANGUAGE = config_dataset.language + +# Training Parameters +OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False +START_WITH_EVAL = False # if True it will start with evaluation +BATCH_SIZE = 2 # set here the batch size +GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps +# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 +# for more efficient training. You can increase/decrease BATCH_SIZE but then set +# GRAD_ACUMM_STEPS accordingly. + +audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) + + +@pytest.mark.parametrize("use_perceiver", [False, True]) +def test_xtts_gpt_train(tmp_path: Path, use_perceiver: bool): + # Create DVAE checkpoint and mel_norms at test time + # DVAE parameters: for training, the DVAE is needed to extract the DVAE tokens, + # so you must provide the paths for this model + DVAE_CHECKPOINT = tmp_path / "dvae.pth" + # Mel spectrogram norms for dvae mel spectrogram extraction + MEL_NORM_FILE = tmp_path / "mel_stats.pth" + dvae = DiscreteVAE( + channels=80, + normalization=None, + positional_dims=1, + num_tokens=8192, + codebook_dim=512, + hidden_dim=512, + num_resnet_blocks=3, + kernel_size=3, + num_layers=2, + use_transposed_convs=False, + ) + torch.save(dvae.state_dict(), DVAE_CHECKPOINT) + mel_stats = torch.ones(80) + torch.save(mel_stats, MEL_NORM_FILE) + + # init args and config + model_args = GPTArgs( + max_conditioning_length=132300, # 6 secs + min_conditioning_length=66150, # 3 secs + debug_loading_failures=False, + max_wav_length=255995, # ~11.6 seconds + max_text_length=200, + mel_norm_file=MEL_NORM_FILE, + dvae_checkpoint=DVAE_CHECKPOINT, + xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune + tokenizer_file=TOKENIZER_FILE, + gpt_num_audio_tokens=8194, + gpt_start_audio_token=8192, + gpt_stop_audio_token=8193, + gpt_use_perceiver_resampler=use_perceiver, + ) + + config = GPTTrainerConfig( + epochs=1, + output_path=tmp_path, + model_args=model_args, + run_name=RUN_NAME, + project_name=PROJECT_NAME, + run_description="GPT XTTS training", + dashboard_logger=DASHBOARD_LOGGER, + logger_uri=LOGGER_URI, + audio=audio_config, + batch_size=BATCH_SIZE, + batch_group_size=48, + eval_batch_size=BATCH_SIZE, + num_loader_workers=8, + eval_split_max_size=256, + print_step=50, + plot_step=100, + log_model_step=1000, + save_step=10000, + save_n_checkpoints=1, + save_checkpoints=True, + # target_loss="loss", + print_eval=False, + # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. + optimizer="AdamW", + optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, + optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, + lr=5e-06, # learning rate + lr_scheduler="MultiStepLR", + # adjusted accordingly for the new step scheme + lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, + test_sentences=[ + { + "text": "This cake is great. 
It's so delicious and moist.", + "speaker_wav": SPEAKER_REFERENCE, + "language": LANGUAGE, + }, + ], + ) + + # init the model from config + model = GPTTrainer.init_from_config(config) + + # load training samples + train_samples, eval_samples = load_tts_samples( + DATASETS_CONFIG_LIST, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, + ) + + # init the trainer and 🚀 + trainer = Trainer( + TrainerArgs( + restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter + skip_train_epoch=False, + start_with_eval=True, + grad_accum_steps=GRAD_ACUMM_STEPS, + ), + config, + output_path=tmp_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + ) + trainer.fit() diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index f9067530e6..370a541b97 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -240,12 +240,8 @@ def test_is_available(self): class TestBN_Phonemizer(unittest.TestCase): def setUp(self): self.phonemizer = BN_Phonemizer() - self._TEST_CASES = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" - ) - self._EXPECTED = ( - "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" - ) + self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন" + self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।" def test_phonemize(self): self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED) diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py index 9be1f0bf41..f5d342bb00 100644 --- a/tests/text_tests/test_text_cleaners.py +++ b/tests/text_tests/test_text_cleaners.py @@ -24,6 +24,8 @@ def test_currency() -> None: def test_expand_numbers() -> None: assert phoneme_cleaners("-1") == "minus one" assert phoneme_cleaners("1") == "one" + assert phoneme_cleaners("1" + "0" * 35) == "one hundred decillion" + assert phoneme_cleaners("1" + "0" * 36) == "one" + " zero" * 36 def test_multilingual_phoneme_cleaners() -> None: @@ -43,11 +45,11 @@ def test_normalize_unicode() -> None: ("na\u0303", "nã"), ("o\u0302u", "ôu"), ("n\u0303", "ñ"), - ("\u4E2D\u56FD", "中国"), + ("\u4e2d\u56fd", "中国"), ("niño", "niño"), ("a\u0308", "ä"), ("\u3053\u3093\u306b\u3061\u306f", "こんにちは"), - ("\u03B1\u03B2", "αβ"), + ("\u03b1\u03b2", "αβ"), ] for arg, expect in test_cases: assert normalize_unicode(arg) == expect diff --git a/tests/tts_tests2/test_delightful_tts_layers.py b/tests/tts_tests/test_delightful_tts_layers.py similarity index 100% rename from tests/tts_tests2/test_delightful_tts_layers.py rename to tests/tts_tests/test_delightful_tts_layers.py diff --git a/tests/tts_tests2/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py similarity index 100% rename from tests/tts_tests2/test_feed_forward_layers.py rename to tests/tts_tests/test_feed_forward_layers.py diff --git a/tests/tts_tests2/test_forward_tts.py b/tests/tts_tests/test_forward_tts.py similarity index 87% rename from tests/tts_tests2/test_forward_tts.py rename to tests/tts_tests/test_forward_tts.py index cec0f211c8..13a2c270af 100644 --- a/tests/tts_tests2/test_forward_tts.py 
+++ b/tests/tts_tests/test_forward_tts.py @@ -6,29 +6,7 @@ # pylint: disable=unused-variable -def expand_encoder_outputs_test(): - model = ForwardTTS(ForwardTTSArgs(num_chars=10)) - - inputs = T.rand(2, 5, 57) - durations = T.randint(1, 4, (2, 57)) - - x_mask = T.ones(2, 1, 57) - y_mask = T.ones(2, 1, durations.sum(1).max()) - - expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask) - - for b in range(durations.shape[0]): - index = 0 - for idx, dur in enumerate(durations[b]): - diff = ( - expanded[b, :, index : index + dur.item()] - - inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape) - ).sum() - assert abs(diff) < 1e-6, diff - index += dur - - -def model_input_output_test(): +def test_model_input_output(): """Assert the output shapes of the model in different modes""" # VANILLA MODEL diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py similarity index 95% rename from tests/tts_tests2/test_glow_tts.py rename to tests/tts_tests/test_glow_tts.py index 3c7ac51556..c92063576f 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -42,8 +42,8 @@ def _create_inputs(batch_size=8): def _check_parameter_changes(model, model_ref): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -107,7 +107,7 @@ def _test_forward(self, batch_size): config = GlowTTSConfig(num_chars=32) model = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -134,7 +134,7 @@ def _test_forward_with_d_vector(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"d_vectors": d_vector}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -160,7 +160,7 @@ def _test_forward_with_speaker_id(self, batch_size): ) model = GlowTTS.init_from_config(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # inference encoder and decoder with MAS y = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, {"speaker_ids": speaker_ids}) self.assertEqual(y["z"].shape, mel_spec.shape) @@ -241,10 +241,10 @@ def _test_inference_with_MAS(self, batch_size): # inference encoder and decoder with MAS y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) y2 = model.decoder_inference(mel_spec, mel_lengths) - assert ( - y2["model_outputs"].shape == y["model_outputs"].shape - ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y["model_outputs"].shape, y2["model_outputs"].shape + assert y2["model_outputs"].shape == 
y["model_outputs"].shape, ( + "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( + y["model_outputs"].shape, y2["model_outputs"].shape + ) ) def test_inference_with_MAS(self): @@ -261,7 +261,7 @@ def test_train_step(self): # reference model to compare model weights model_ref = GlowTTS(config).to(device) model.train() - print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) + print(f" > Num parameters for GlowTTS model:{count_parameters(model)}") # pass the state to ref model model_ref.load_state_dict(copy.deepcopy(model.state_dict())) count = 0 diff --git a/tests/tts_tests/test_losses.py b/tests/tts_tests/test_losses.py index 794478dca3..2290e9a6cc 100644 --- a/tests/tts_tests/test_losses.py +++ b/tests/tts_tests/test_losses.py @@ -21,7 +21,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -29,14 +29,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -52,7 +52,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -60,14 +60,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class MSELossMaskedTests(unittest.TestCase): @@ -85,7 +85,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert 
output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -93,14 +93,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -116,7 +116,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 8, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 8, 128).float() @@ -124,14 +124,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 8, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(5, 9)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class SSIMLossTests(unittest.TestCase): @@ -153,7 +153,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.ones(4) * 58).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() >= 1.0, "0 vs {}".format(output.item()) + assert output.item() >= 1.0, f"0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -168,7 +168,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" # seq_len_norm = True # test input == target @@ -184,7 +184,7 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_target = T.zeros(4, 57, 128).float() dummy_length = (T.ones(4) * 8).long() output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + assert output.item() == 1.0, f"1.0 vs {output.item()}" # test if padded values of input makes any difference dummy_input = T.ones(4, 57, 128).float() @@ -192,14 +192,14 @@ def test_in_out(self): # pylint: disable=no-self-use dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 
100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + assert abs(output.item() - 1.0) < 1e-5, f"1.0 vs {output.item()}" dummy_input = T.rand(4, 57, 128).float() dummy_target = dummy_input.detach() dummy_length = (T.arange(54, 58)).long() mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) + assert output.item() == 0, f"0 vs {output.item()}" class BCELossTest(unittest.TestCase): diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py b/tests/tts_tests/test_neuralhmm_tts_train.py deleted file mode 100644 index 25d9aa8148..0000000000 --- a/tests/tts_tests/test_neuralhmm_tts_train.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import json -import os -import shutil - -import torch -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") - -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) - -config = NeuralhmmTTSConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - - -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - - -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: 
- config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_overflow_train.py b/tests/tts_tests/test_overflow_train.py deleted file mode 100644 index 86fa60af72..0000000000 --- a/tests/tts_tests/test_overflow_train.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import json -import os -import shutil - -import torch -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.overflow_config import OverflowConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") -parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") - -torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) - -config = OverflowConfig( - batch_size=3, - eval_batch_size=3, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="phoneme_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - mel_statistics_parameter_path=parameter_path, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_sampling_time=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - - -# train the model for one epoch when mel parameters exists -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - - -# train the model for one epoch when mel parameters have to be computed from the dataset -if os.path.exists(parameter_path): - os.remove(parameter_path) -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", 
encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py deleted file mode 100644 index 530781ef88..0000000000 --- a/tests/tts_tests/test_speedy_speech_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig - -config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = SpeedySpeechConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py deleted file mode 100644 index 99ba4349c4..0000000000 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ /dev/null @@ -1,79 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=False, - use_d_vector_file=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - max_decoder_steps=50, -) - -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 72b6bcd46b..72069bf943 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -72,8 +72,8 @@ def test_train_step(self): # pylint: disable=no-self-use for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -131,8 +131,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -198,8 +198,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -254,8 +254,8 @@ def test_train_step(self): if name == "gst_layer.encoder.recurrence.weight_hh_l0": # print(param.grad) continue - assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format( - name, count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -321,8 +321,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -384,7 +384,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py deleted file mode 100644 index 5f1bc3fd50..0000000000 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ /dev/null @@ -1,77 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=True, - num_speakers=4, - max_decoder_steps=50, -) - -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py deleted file mode 100644 index 40107070e1..0000000000 --- a/tests/tts_tests/test_tacotron2_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 43e72417c2..9521cfea26 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -67,8 +67,8 @@ def test_in_out(): output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None) assert output.shape[0] == 4 - assert output.shape[1] == 80, "size not {}".format(output.shape[1]) - assert output.shape[2] == 2, "size not {}".format(output.shape[2]) + assert output.shape[1] == 80, f"size not {output.shape[1]}" + assert output.shape[2] == 2, f"size not {output.shape[2]}" assert stop_tokens.shape[0] == 4 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 7ec3f0df1b..5f9af86e7e 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -51,7 +51,7 @@ def test_train_step(): criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -71,8 +71,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -105,7 +105,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -127,8 +127,8 @@ def test_train_step(): for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 @@ -165,7 +165,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -186,8 +186,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -217,7 +217,7 @@ def test_train_step(): model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron GST model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -238,8 +238,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -288,7 +288,7 @@ def test_train_step(): criterion = model.get_criterion() optimizer = model.get_optimizer() model.train() - print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron with Capacitron VAE model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -305,8 +305,8 @@ def test_train_step(): count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! \n{param}\n{param_ref}" ) count += 1 @@ -341,7 +341,7 @@ def test_train_step(): config.d_vector_dim = 55 model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) + print(f" > Num parameters for Tacotron model:{count_parameters(model)}") model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): @@ -366,7 +366,7 @@ def test_train_step(): name, param = name_param if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {count} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count += 1 diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py deleted file mode 100644 index f7751931ae..0000000000 --- a/tests/tts_tests/test_tacotron_train.py +++ /dev/null @@ -1,64 +0,0 @@ -import glob -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron_config import TacotronConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = TacotronConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - r=5, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 17992773ad..790439ecb2 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -13,14 +13,10 @@ Vits, VitsArgs, VitsAudioConfig, - amp_to_db, - db_to_amp, load_audio, - spec_to_mel, - wav_to_mel, - wav_to_spec, ) from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio.torch_transforms import amp_to_db, db_to_amp, spec_to_mel, wav_to_mel, wav_to_spec LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") SPEAKER_ENCODER_CONFIG = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") @@ -377,8 +373,8 @@ def _check_parameter_changes(model, model_ref): name = item1[0] param = item1[1] param_ref = item2[1] - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - name, param.shape, param, param_ref + assert (param != param_ref).any(), ( + f"param {name} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" ) count = count + 1 diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py deleted file mode 100644 index 741bda91e9..0000000000 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ /dev/null @@ -1,61 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py deleted file mode 100644 index 71597ef32f..0000000000 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ /dev/null @@ -1,110 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - 
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True -# active multispeaker mode -config.model_args.use_speaker_embedding = True -config.use_speaker_embedding = True - -# deactivate multispeaker d-vec mode -config.model_args.use_d_vector_file = False -config.use_d_vector_file = False - -# duration predictor -config.model_args.use_sdp = False -config.use_sdp = False - -# active language sampler -config.use_language_weighted_sampler = True - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -languae_id = "en" -continue_speakers_path = os.path.join(continue_path, "speakers.json") -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py deleted file mode 100644 index fd58db534a..0000000000 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ /dev/null @@ -1,117 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -dataset_config_en = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="en", -) - -dataset_config_pt = BaseDatasetConfig( - formatter="ljspeech_test", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - path="tests/data/ljspeech", - language="pt-br", -) - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="multilingual_cleaners", - use_phonemes=False, - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], - ], - datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multilingual mode -config.model_args.use_language_embedding = True -config.use_language_embedding = True - -# deactivate multispeaker mode -config.model_args.use_speaker_embedding = False -config.use_speaker_embedding = False - -# active multispeaker d-vec mode -config.model_args.use_d_vector_file = True -config.use_d_vector_file = True -config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.d_vector_file = ["tests/data/ljspeech/speakers.json"] -config.model_args.d_vector_dim = 256 -config.d_vector_dim = 256 - -# duration predictor -config.model_args.use_sdp = True -config.use_sdp = True - -# activate language and speaker samplers -config.use_language_weighted_sampler = True -config.language_weighted_sampler_alpha = 10 -config.use_speaker_weighted_sampler = True -config.speaker_weighted_sampler_alpha = 5 - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = 
os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -languae_id = "en" -continue_speakers_path = config.d_vector_file -continue_languages_path = os.path.join(continue_path, "language_ids.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py deleted file mode 100644 index b7fe197cfe..0000000000 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ /dev/null @@ -1,83 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-1"], - ], -) -# set audio config -config.audio.do_trim_silence = True -config.audio.trim_db = 60 - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" 
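# [editor's note] The deleted VITS / GlowTTS / FastPitch recipe tests above and below all
# repeat the same smoke-test shape: save a small config, train one epoch through the CLI,
# pick the newest run directory, synthesize once with the `tts` CLI, then resume training
# and clean up. A minimal, hypothetical pytest-style sketch of that shared recipe follows;
# the helper names (`train_one_epoch`, `build_tts_command`) are illustrative assumptions
# and not part of this diff.
import glob
import os

from tests import get_device_id, run_cli


def train_one_epoch(config_path, output_path):
    """Train via the CLI for one epoch and return the newest run directory."""
    run_cli(
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py "
        f"--config_path {config_path} --coqpit.output_path {output_path} "
        "--coqpit.test_delay_epochs 0"
    )
    # The trainer writes one timestamped folder per run; the latest one holds the checkpoint.
    return max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)


def build_tts_command(config_path, model_path, out_path, speaker_idx=None, speakers_file=None):
    """Assemble the `tts` CLI call these recipes use (single- or multi-speaker variant)."""
    cmd = (
        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' "
        f"--config_path {config_path} --model_path {model_path} --out_path {out_path}"
    )
    if speaker_idx is not None:
        cmd += f" --speaker_idx {speaker_idx} --speakers_file_path {speakers_file}"
    return cmd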
-continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py deleted file mode 100644 index ea5dc02405..0000000000 --- a/tests/tts_tests/test_vits_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = VitsConfig( - batch_size=2, - eval_batch_size=2, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/__init__.py b/tests/tts_tests2/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/tts_tests2/test_align_tts_train.py b/tests/tts_tests2/test_align_tts_train.py deleted file mode 100644 index 9b0b730df4..0000000000 --- a/tests/tts_tests2/test_align_tts_train.py +++ /dev/null @@ -1,72 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.align_tts_config import AlignTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = AlignTTSConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) - -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py deleted file mode 100644 index 8fc4ea7e9b..0000000000 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ /dev/null @@ -1,100 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig -from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs( - use_speaker_embedding=False, d_vector_dim=256, use_d_vector_file=True, speaker_embedding_channels=256 -) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech-0"], - ], - output_path=output_path, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, - speaker_embedding_channels=256, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = False -config.model_args.use_d_vector_file = True -config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not 
None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --speakers_file_path {continue_speakers_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py deleted file mode 100644 index 6fb70c5f61..0000000000 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ /dev/null @@ -1,94 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig -from TTS.tts.models.delightful_tts import DelightfulTtsArgs, VocoderConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs(use_speaker_embedding=False) - -vocoder_config = VocoderConfig() - -config = DelightfulTTSConfig( - model_args=model_args, - audio=audio_config, - vocoder=vocoder_config, - batch_size=2, - eval_batch_size=8, - compute_f0=True, - run_eval=True, - test_delay_epochs=-1, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - epochs=1, - print_step=1, - print_eval=True, - binary_align_loss_alpha=0.0, - use_attn_priors=False, - test_sentences=[ - ["Be a voice, not an echo.", "ljspeech"], - ], - output_path=output_path, - num_speakers=4, - use_speaker_embedding=True, -) - -# active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True -config.model_args.use_d_vector_file = False -config.model_args.d_vector_file = None -config.model_args.d_vector_dim = 256 - - -config.save_json(config_path) - -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.dataset_name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech" -# Check integrity of the config -with 
open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py deleted file mode 100644 index a917d77657..0000000000 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ /dev/null @@ -1,97 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.delightful_tts_config import DelightfulTTSConfig -from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -audio_config = DelightfulTtsAudioConfig() -model_args = DelightfulTtsArgs() - -vocoder_config = VocoderConfig() - - -config = DelightfulTTSConfig( - audio=audio_config, - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache_delightful/", ## delightful f0 cache is incompatible with other models - run_eval=True, - test_delay_epochs=-1, - binary_align_loss_alpha=0.0, - epochs=1, - print_step=1, - use_attn_priors=False, - print_eval=True, - test_sentences=[ - ["Be a voice, not an echo."], - ], - use_speaker_embedding=False, -) -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{'cpu'}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs -1" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with 
open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == -1 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) -shutil.rmtree("tests/data/ljspeech/f0_cache_delightful/") diff --git a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py deleted file mode 100644 index 7f79bfcab2..0000000000 --- a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py +++ /dev/null @@ -1,92 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fast_pitch_config import FastPitchConfig - -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert 
config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fast_pitch_train.py b/tests/tts_tests2/test_fast_pitch_train.py deleted file mode 100644 index a525715b53..0000000000 --- a/tests/tts_tests2/test_fast_pitch_train.py +++ /dev/null @@ -1,91 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fast_pitch_config import FastPitchConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = FastPitchConfig( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = 
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py deleted file mode 100644 index 35bda597d5..0000000000 --- a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py +++ /dev/null @@ -1,95 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fastspeech2_config import Fastspeech2Config - -config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - use_speaker_embedding=True, - test_sentences=[ - "Be a voice, not an echo.", - ], -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = True -config.model_args.use_speaker_embedding = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts 
--text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_fastspeech_2_train.py b/tests/tts_tests2/test_fastspeech_2_train.py deleted file mode 100644 index dd4b07d240..0000000000 --- a/tests/tts_tests2/test_fastspeech_2_train.py +++ /dev/null @@ -1,94 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.config.shared_configs import BaseAudioConfig -from TTS.tts.configs.fastspeech2_config import Fastspeech2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -audio_config = BaseAudioConfig( - sample_rate=22050, - do_trim_silence=True, - trim_db=60.0, - signal_norm=False, - mel_fmin=0.0, - mel_fmax=8000, - spec_gain=1.0, - log_func="np.log", - ref_level_db=20, - preemphasis=0.0, -) - -config = Fastspeech2Config( - audio=audio_config, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - f0_cache_path="tests/data/ljspeech/f0_cache/", - compute_f0=True, - compute_energy=True, - energy_cache_path="tests/data/ljspeech/energy_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - use_speaker_embedding=False, -) -config.audio.do_trim_silence = True -config.use_speaker_embedding = False -config.model_args.use_speaker_embedding = False -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) - -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_d-vectors_train.py b/tests/tts_tests2/test_glow_tts_d-vectors_train.py deleted file mode 100644 index f1cfd4368f..0000000000 --- a/tests/tts_tests2/test_glow_tts_d-vectors_train.py +++ /dev/null @@ -1,79 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.glow_tts_config import GlowTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=False, - use_d_vector_file=True, - d_vector_file="tests/data/ljspeech/speakers.json", - d_vector_dim=256, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = config.d_vector_file - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py deleted file mode 100644 index b1eb6237a4..0000000000 --- a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py +++ /dev/null @@ -1,76 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.glow_tts_config import GlowTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, - use_speaker_embedding=True, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech_test " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") -speaker_id = "ljspeech-1" -continue_speakers_path = os.path.join(continue_path, "speakers.json") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests2/test_glow_tts_train.py b/tests/tts_tests2/test_glow_tts_train.py deleted file mode 100644 index 0a8e226b65..0000000000 --- a/tests/tts_tests2/test_glow_tts_train.py +++ /dev/null @@ -1,73 +0,0 @@ -import glob -import json -import os -import shutil - -from trainer import get_last_checkpoint - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.glow_tts_config import GlowTTSConfig - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = GlowTTSConfig( - batch_size=2, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=True, - phoneme_language="en-us", - phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - print_eval=True, - test_sentences=[ - "Be a voice, not an echo.", - ], - data_dep_init_steps=1.0, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.formatter ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " - "--coqpit.test_delay_epochs 0" -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# Inference using TTS API -continue_config_path = os.path.join(continue_path, "config.json") -continue_restore_path, _ = get_last_checkpoint(continue_path) -out_wav_path = os.path.join(get_tests_output_path(), "output.wav") - -# Check integrity of the config -with open(continue_config_path, "r", encoding="utf-8") as f: - config_loaded = json.load(f) -assert config_loaded["characters"] is not None -assert config_loaded["output_path"] in continue_path -assert config_loaded["test_delay_epochs"] == 0 - -# Load the model and run inference -inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" -run_cli(inference_command) - -# restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index c90551b494..784e32a68d 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -22,31 +22,19 @@ class TestFreeVC(unittest.TestCase): def _create_inputs(self, config, batch_size=2): - input_dummy = torch.rand(batch_size, 30 * config.audio["hop_length"]).to(device) - input_lengths = torch.randint(100, 30 * config.audio["hop_length"], (batch_size,)).long().to(device) - input_lengths[-1] = 30 * config.audio["hop_length"] spec = torch.rand(batch_size, 30, config.audio["filter_length"] // 2 + 1).to(device) mel = torch.rand(batch_size, 30, config.audio["n_mel_channels"]).to(device) spec_lengths = torch.randint(20, 30, (batch_size,)).long().to(device) spec_lengths[-1] = spec.size(2) waveform = torch.rand(batch_size, spec.size(2) * config.audio["hop_length"]).to(device) - return input_dummy, input_lengths, mel, spec, spec_lengths, waveform + return mel, spec, spec_lengths, waveform @staticmethod def _create_inputs_inference(): - source_wav = torch.rand(16000) + source_wav = torch.rand(15999) target_wav = torch.rand(16000) return source_wav, target_wav - @staticmethod - def _check_parameter_changes(model, model_ref): - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 - def test_methods(self): config = FreeVCConfig() model = FreeVC(config).to(device) @@ -67,9 +55,9 @@ def _test_forward(self, batch_size): config = FreeVCConfig() model = FreeVC(config).to(device) model.train() - print(" > Num parameters for FreeVC model:%s" % (count_parameters(model))) + print(f" > Num parameters for FreeVC model:{count_parameters(model)}") - _, _, mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) + mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) wavlm_vec = model.extract_wavlm_features(waveform) wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) @@ -86,15 +74,15 @@ def _test_inference(self, batch_size): model = FreeVC(config).to(device) model.eval() - _, _, mel, _, _, waveform = self._create_inputs(config, batch_size) + mel, _, _, waveform = self._create_inputs(config, batch_size) wavlm_vec = model.extract_wavlm_features(waveform) wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) output_wav = model.inference(wavlm_vec, None, mel, wavlm_vec_lengths) - assert ( - output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1] - ), f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + assert output_wav.shape[-1] // config.audio.hop_length == wavlm_vec.shape[-1], ( + f"{output_wav.shape[-1] // config.audio.hop_length} != {wavlm_vec.shape}" + ) def test_inference(self): self._test_inference(1) @@ -107,9 +95,9 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) - assert ( - output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0] - ), f"{output_wav.shape} != 
{source_wav.shape}" + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" + ) def test_train_step(self): ... diff --git a/tests/vc_tests/test_openvoice.py b/tests/vc_tests/test_openvoice.py new file mode 100644 index 0000000000..703873ea47 --- /dev/null +++ b/tests/vc_tests/test_openvoice.py @@ -0,0 +1,41 @@ +import os +import unittest + +import torch + +from tests import get_tests_input_path +from TTS.vc.models.openvoice import OpenVoice, OpenVoiceConfig + +torch.manual_seed(1) +use_cuda = torch.cuda.is_available() +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +c = OpenVoiceConfig() + +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + + +class TestOpenVoice(unittest.TestCase): + @staticmethod + def _create_inputs_inference(): + source_wav = torch.rand(16100) + target_wav = torch.rand(16000) + return source_wav, target_wav + + def test_load_audio(self): + config = OpenVoiceConfig() + model = OpenVoice(config).to(device) + wav = model.load_audio(WAV_FILE) + wav2 = model.load_audio(wav) + assert all(torch.isclose(wav, wav2)) + + def test_voice_conversion(self): + config = OpenVoiceConfig() + model = OpenVoice(config).to(device) + model.eval() + + source_wav, target_wav = self._create_inputs_inference() + output_wav = model.voice_conversion(source_wav, target_wav) + assert output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length, ( + f"{output_wav.shape} != {source_wav.shape}" + ) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py deleted file mode 100644 index 9d4e193382..0000000000 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import FullbandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = FullbandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py deleted file mode 100644 index c506fb48dc..0000000000 --- a/tests/vocoder_tests/test_hifigan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, 
get_tests_output_path, run_cli -from TTS.vocoder.configs import HifiganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = HifiganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=1024, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py deleted file mode 100644 index 6ef9cd495b..0000000000 --- a/tests/vocoder_tests/test_melgan_train.py +++ /dev/null @@ -1,43 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MelganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py deleted file mode 100644 index 8002760706..0000000000 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import MultibandMelganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = MultibandMelganConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - 
print_step=1, - print_eval=True, - steps_to_start_discriminator=1, - data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py deleted file mode 100644 index a126befe2e..0000000000 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ /dev/null @@ -1,42 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import ParallelWaveganConfig - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = ParallelWaveganConfig( - batch_size=4, - eval_batch_size=4, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=2048, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_vocoder_gan_datasets.py b/tests/vocoder_tests/test_vocoder_gan_datasets.py index c39d70e94c..d540667ee8 100644 --- a/tests/vocoder_tests/test_vocoder_gan_datasets.py +++ b/tests/vocoder_tests/test_vocoder_gan_datasets.py @@ -3,16 +3,12 @@ import numpy as np from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import BaseGANVocoderConfig from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = BaseGANVocoderConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") diff --git a/tests/vocoder_tests/test_vocoder_losses.py b/tests/vocoder_tests/test_vocoder_losses.py index 95501c2d39..c9432d7f4b 100644 --- a/tests/vocoder_tests/test_vocoder_losses.py +++ b/tests/vocoder_tests/test_vocoder_losses.py @@ -2,17 +2,12 @@ import 
torch -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.config import BaseAudioConfig from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import stft from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT -TESTS_PATH = get_tests_path() - -OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") -os.makedirs(OUT_PATH, exist_ok=True) - WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") ap = AudioProcessor(**BaseAudioConfig().to_dict()) diff --git a/tests/vocoder_tests/test_vocoder_pqmf.py b/tests/vocoder_tests/test_vocoder_pqmf.py index afe8d1dc8f..9be492927d 100644 --- a/tests/vocoder_tests/test_vocoder_pqmf.py +++ b/tests/vocoder_tests/test_vocoder_pqmf.py @@ -4,14 +4,13 @@ import torch from librosa.core import load -from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from tests import get_tests_input_path from TTS.vocoder.layers.pqmf import PQMF -TESTS_PATH = get_tests_path() WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def test_pqmf(): +def test_pqmf(tmp_path): w, sr = load(WAV_FILE) layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) @@ -23,4 +22,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) + sf.write(tmp_path / "pqmf_output.wav", w2_.flatten().detach(), sr) diff --git a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py index 503b4e2483..c3ae1309dc 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn_datasets.py +++ b/tests/vocoder_tests/test_vocoder_wavernn_datasets.py @@ -1,29 +1,38 @@ import os -import shutil import numpy as np +import pytest from torch.utils.data import DataLoader -from tests import get_tests_output_path, get_tests_path +from tests import get_tests_path from TTS.utils.audio import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -file_path = os.path.dirname(os.path.realpath(__file__)) -OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") -os.makedirs(OUTPATH, exist_ok=True) - C = WavernnConfig() test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") -test_mel_feat_path = os.path.join(test_data_path, "mel") -test_quant_feat_path = os.path.join(test_data_path, "quant") -ok_ljspeech = os.path.exists(test_data_path) +params = [ + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], +] + + +@pytest.mark.parametrize("params", params) +def test_parametrized_wavernn_dataset(tmp_path, params): + """Run dataloader with given parameters and check conditions""" + print(params) + batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers = params + test_mel_feat_path = tmp_path / "mel" + test_quant_feat_path = tmp_path / "quant" -def 
wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): - """run dataloader with given parameters and check conditions""" ap = AudioProcessor(**C.audio) C.batch_size = batch_size @@ -31,7 +40,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.seq_len = seq_len C.data_path = test_data_path - preprocess_wav_files(test_data_path, C, ap) + preprocess_wav_files(tmp_path, C, ap) _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) dataset = WaveRNNDataset( @@ -50,35 +59,12 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor max_iter = 10 count_iter = 0 - try: - for data in loader: - x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) - assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" - - assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] - count_iter += 1 - if count_iter == max_iter: - break - # except AssertionError: - # shutil.rmtree(test_mel_feat_path) - # shutil.rmtree(test_quant_feat_path) - finally: - shutil.rmtree(test_mel_feat_path) - shutil.rmtree(test_quant_feat_path) - + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" -def test_parametrized_wavernn_dataset(): - """test dataloader with different parameters""" - params = [ - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], - [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], - [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], - [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], - [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], - ] - for param in params: - print(param) - wavernn_dataset_case(*param) + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index 43b5f08042..d1d3610b70 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -1,5 +1,3 @@ -import unittest - import numpy as np import torch from torch import optim @@ -10,50 +8,43 @@ # pylint: disable=unused-variable torch.manual_seed(1) -use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class WavegradTrainTest(unittest.TestCase): - def test_train_step(self): # pylint: disable=no-self-use - """Test if all layers are updated in a basic training cycle""" - input_dummy = torch.rand(8, 1, 20 * 300).to(device) - mel_spec = torch.rand(8, 80, 20).to(device) - - criterion = torch.nn.L1Loss().to(device) - args = WavegradArgs( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ) - config = WavegradConfig(model_params=args) - model = Wavegrad(config) - - model_ref = Wavegrad(config) - model.train() - model.to(device) - betas = np.linspace(1e-6, 1e-2, 1000) - model.compute_noise_level(betas) - model_ref.load_state_dict(model.state_dict()) - 
model_ref.to(device) - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param - param_ref).sum() == 0, param - count += 1 - optimizer = optim.Adam(model.parameters(), lr=0.001) - for i in range(5): - y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) - optimizer.zero_grad() - loss = criterion(y_hat, input_dummy) - loss.backward() - optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: - assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 +def test_train_step(): + """Test if all layers are updated in a basic training cycle""" + torch.set_grad_enabled(True) + input_dummy = torch.rand(8, 1, 20 * 300).to(device) + mel_spec = torch.rand(8, 80, 20).to(device) + + criterion = torch.nn.L1Loss().to(device) + args = WavegradArgs( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) + + model_ref = Wavegrad(config) + model.train() + model.to(device) + betas = np.linspace(1e-6, 1e-2, 1000) + model.compute_noise_level(betas) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + optimizer = optim.Adam(model.parameters(), lr=0.001) + for _ in range(5): + y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) + optimizer.zero_grad() + loss = criterion(y_hat, input_dummy) + loss.backward() + optimizer.step() + # check parameter changes + for i, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any(), f"param {i} with shape {param.shape} not updated!! 
\n{param}\n{param_ref}" diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py deleted file mode 100644 index 9b10759505..0000000000 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ /dev/null @@ -1,54 +0,0 @@ -import glob -import os -import shutil -import unittest - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavegradConfig - - -class WavegradTrainingTest(unittest.TestCase): - # TODO: Reactivate after improving CI run times - # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) - if os.getenv("GITHUB_ACTIONS") == "true": - __test__ = False - - def test_train(self): # pylint: disable=no-self-use - config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") - output_path = os.path.join(get_tests_output_path(), "train_outputs") - - config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, - ) - config.audio.do_trim_silence = True - config.audio.trim_db = 60 - config.save_json(config_path) - - # train the model for one epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " - ) - run_cli(command_train) - - # Find latest folder - continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - - # restore the model and continue training for one more epoch - command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " - ) - run_cli(command_train) - shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py deleted file mode 100644 index 337e24259f..0000000000 --- a/tests/vocoder_tests/test_wavernn_train.py +++ /dev/null @@ -1,45 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.vocoder.configs import WavernnConfig -from TTS.vocoder.models.wavernn import WavernnArgs - -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - - -config = WavernnConfig( - model_args=WavernnArgs(), - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=256, # for shorter test time - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git 
a/tests/xtts_tests/test_xtts_gpt_train.py b/tests/xtts_tests/test_xtts_gpt_train.py deleted file mode 100644 index b8b9a4e388..0000000000 --- a/tests/xtts_tests/test_xtts_gpt_train.py +++ /dev/null @@ -1,163 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig - -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" -PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -# Set here the path that the checkpoints will be saved. Default: ./run/training/ -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -MEL_NORM_FILE = os.path.join( - OUT_PATH, "mel_stats.pth" -) # Mel spectrogram norms, required for dvae mel spectrogram extraction -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. -TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. 
- - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, -) -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description=""" - GPT XTTS training - """, - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. - optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py b/tests/xtts_tests/test_xtts_v2-0_gpt_train.py deleted file mode 100644 index 6663433c12..0000000000 --- a/tests/xtts_tests/test_xtts_v2-0_gpt_train.py +++ /dev/null @@ -1,163 +0,0 @@ -import os -import shutil - -import torch -from trainer import Trainer, TrainerArgs - -from tests import get_tests_output_path -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.datasets import load_tts_samples -from TTS.tts.layers.xtts.dvae import DiscreteVAE -from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig - -config_dataset = BaseDatasetConfig( - formatter="ljspeech", - dataset_name="ljspeech", - path="tests/data/ljspeech/", - meta_file_train="metadata.csv", - meta_file_val="metadata.csv", - language="en", -) - -DATASETS_CONFIG_LIST = [config_dataset] - -# Logging parameters -RUN_NAME = "GPT_XTTS_LJSpeech_FT" 
-PROJECT_NAME = "XTTS_trainer" -DASHBOARD_LOGGER = "tensorboard" -LOGGER_URI = None - -OUT_PATH = os.path.join(get_tests_output_path(), "train_outputs", "xtts_tests") -os.makedirs(OUT_PATH, exist_ok=True) - -# Create DVAE checkpoint and mel_norms on test time -# DVAE parameters: For the training we need the dvae to extract the dvae tokens, given that you must provide the paths for this model -DVAE_CHECKPOINT = os.path.join(OUT_PATH, "dvae.pth") # DVAE checkpoint -# Mel spectrogram norms, required for dvae mel spectrogram extraction -MEL_NORM_FILE = os.path.join(OUT_PATH, "mel_stats.pth") -dvae = DiscreteVAE( - channels=80, - normalization=None, - positional_dims=1, - num_tokens=8192, - codebook_dim=512, - hidden_dim=512, - num_resnet_blocks=3, - kernel_size=3, - num_layers=2, - use_transposed_convs=False, -) -torch.save(dvae.state_dict(), DVAE_CHECKPOINT) -mel_stats = torch.ones(80) -torch.save(mel_stats, MEL_NORM_FILE) - - -# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. -TOKENIZER_FILE = "tests/inputs/xtts_vocab.json" # vocab.json file -XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_style_emb_repetition_fix_gt/132500_gpt_ema_coqui_tts_with_enhanced_hifigan.pth" # model.pth file - - -# Training sentences generations -SPEAKER_REFERENCE = [ - "tests/data/ljspeech/wavs/LJ001-0002.wav" -] # speaker reference to be used in training test sentences -LANGUAGE = config_dataset.language - - -# Training Parameters -OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # for multi-gpu training please make it False -START_WITH_EVAL = False # if True it will star with evaluation -BATCH_SIZE = 2 # set here the batch size -GRAD_ACUMM_STEPS = 1 # set here the grad accumulation steps -# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly. - - -# init args and config -model_args = GPTArgs( - max_conditioning_length=132300, # 6 secs - min_conditioning_length=66150, # 3 secs - debug_loading_failures=False, - max_wav_length=255995, # ~11.6 seconds - max_text_length=200, - mel_norm_file=MEL_NORM_FILE, - dvae_checkpoint=DVAE_CHECKPOINT, - xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune - tokenizer_file=TOKENIZER_FILE, - gpt_num_audio_tokens=8194, - gpt_start_audio_token=8192, - gpt_stop_audio_token=8193, - gpt_use_masking_gt_prompt_approach=True, - gpt_use_perceiver_resampler=True, -) - -audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000) - -config = GPTTrainerConfig( - epochs=1, - output_path=OUT_PATH, - model_args=model_args, - run_name=RUN_NAME, - project_name=PROJECT_NAME, - run_description="GPT XTTS training", - dashboard_logger=DASHBOARD_LOGGER, - logger_uri=LOGGER_URI, - audio=audio_config, - batch_size=BATCH_SIZE, - batch_group_size=48, - eval_batch_size=BATCH_SIZE, - num_loader_workers=8, - eval_split_max_size=256, - print_step=50, - plot_step=100, - log_model_step=1000, - save_step=10000, - save_n_checkpoints=1, - save_checkpoints=True, - # target_loss="loss", - print_eval=False, - # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters. 
- optimizer="AdamW", - optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS, - optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2}, - lr=5e-06, # learning rate - lr_scheduler="MultiStepLR", - # it was adjusted accordly for the new step scheme - lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1}, - test_sentences=[ - { - "text": "This cake is great. It's so delicious and moist.", - "speaker_wav": SPEAKER_REFERENCE, - "language": LANGUAGE, - }, - ], -) - -# init the model from config -model = GPTTrainer.init_from_config(config) - -# load training samples -train_samples, eval_samples = load_tts_samples( - DATASETS_CONFIG_LIST, - eval_split=True, - eval_split_max_size=config.eval_split_max_size, - eval_split_size=config.eval_split_size, -) - -# init the trainer and 🚀 -trainer = Trainer( - TrainerArgs( - restore_path=None, # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter - skip_train_epoch=False, - start_with_eval=True, - grad_accum_steps=GRAD_ACUMM_STEPS, - ), - config, - output_path=OUT_PATH, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, -) -trainer.fit() - -# remove output path -shutil.rmtree(OUT_PATH) diff --git a/tests/zoo_tests/test_big_models.py b/tests/zoo_tests/test_big_models.py new file mode 100644 index 0000000000..8a9780b4f0 --- /dev/null +++ b/tests/zoo_tests/test_big_models.py @@ -0,0 +1,193 @@ +"""These tests should be run locally because the models are too big for CI.""" + +import os + +import pytest +import torch + +from tests import get_tests_data_path, run_main +from TTS.bin.synthesize import main +from TTS.utils.manage import ModelManager + +GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" + + +@pytest.fixture(scope="session", autouse=True) +def set_env(): + os.environ["COQUI_TOS_AGREED"] = "1" + + +@pytest.fixture +def manager(): + """Set up model manager.""" + return ModelManager(progress_bar=False) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts(tmp_path): + """XTTS is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v1.1", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") + speaker_wav.append(speaker_wav_2) + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v1.1") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_v2(tmp_path): + """XTTS is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/xtts_v2", + "--text", + "C'est un exemple.", + "--language_idx", + "fr", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav"), + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_xtts_v2_streaming(manager): + """Testing the new inference_stream method""" + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import Xtts + + speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] + model_path, _, _ = manager.download_model("tts_models/multilingual/multi-dataset/xtts_v2") + config = XttsConfig() + config.load_json(model_path / "config.json") + model = Xtts.init_from_config(config) + model.load_checkpoint(config, checkpoint_dir=str(model_path)) + model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + print("Computing speaker latents...") + gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) + + print("Inference...") + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + if i == 0: + assert chunk.shape[-1] > 5000 + wav_chunks.append(chunk) + assert len(wav_chunks) > 1 + normal_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=1.5, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + fast_len = sum([len(chunk) for chunk in wav_chunks]) + + chunks = model.inference_stream( + "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", + "en", + gpt_cond_latent, + speaker_embedding, + speed=0.66, + ) + wav_chunks = [] + for i, chunk in enumerate(chunks): + wav_chunks.append(chunk) + slow_len = sum([len(chunk) for chunk in wav_chunks]) + + assert slow_len > normal_len + assert normal_len > fast_len + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_tortoise(tmp_path): + args = [ + "--model_name", + "tts_models/en/multi-dataset/tortoise-v2", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) + + +@pytest.mark.skipif(GITHUB_ACTIONS, reason="Model too big for CI") +def test_bark(tmp_path): + """Bark is too big to run on github actions. 
We need to test it locally""" + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/bark", + "--text", + "This is an example.", + "--out_path", + str(tmp_path / "output.wav"), + "--no-progress_bar", + ] + if torch.cuda.is_available(): + args.append("--use_cuda") + run_main(main, args) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index b944423988..9f02672ef1 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3` -import glob import os import shutil -import torch -from trainer.io import get_user_data_dir +import pytest -from tests import get_tests_data_path, get_tests_output_path, run_cli +from tests import get_tests_data_path, run_main +from TTS.api import TTS +from TTS.bin.synthesize import main from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.manage import ModelManager @@ -19,252 +19,81 @@ ] -def run_models(offset=0, step=1): - """Check if all the models are downloadable and tts models run correctly.""" - print(" > Run synthesizer with all the models.") - output_path = os.path.join(get_tests_output_path(), "output.wav") - manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False) - model_names = [name for name in manager.list_models() if name not in MODELS_WITH_SEP_TESTS] - print("Model names:", model_names) - for model_name in model_names[offset::step]: - print(f"\n > Run - {model_name}") - model_path, _, _ = manager.download_model(model_name) - if "tts_models" in model_name: - local_download_dir = os.path.dirname(model_path) - # download and run the model - speaker_files = glob.glob(local_download_dir + "/speaker*") - language_files = glob.glob(local_download_dir + "/language*") - language_id = "" - if len(speaker_files) > 0: - # multi-speaker model - if "speaker_ids" in speaker_files[0]: - speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) - elif "speakers" in speaker_files[0]: - speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) - - # multi-lingual model - Assuming multi-lingual models are also multi-speaker - if len(language_files) > 0 and "language_ids" in language_files[0]: - language_manager = LanguageManager(language_ids_file_path=language_files[0]) - language_id = language_manager.language_names[0] - - speaker_id = list(speaker_manager.name_to_id.keys())[0] - run_cli( - f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" --no-progress_bar' - ) - else: - # single-speaker model - run_cli( - f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' - ) - # remove downloaded models - shutil.rmtree(local_download_dir) - shutil.rmtree(get_user_data_dir("tts")) - elif "voice_conversion_models" in model_name: - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - run_cli( - f"tts --model_name {model_name} " - f'--out_path "{output_path}" --source_wav "{speaker_wav}" --target_wav "{reference_wav}" --no-progress_bar' - ) - else: - # only download the model - manager.download_model(model_name) - print(f" | > OK: {model_name}") - - -def test_xtts(): - """XTTS is too big to run on github actions. 
We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) - else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" --language_idx "en"' - ) - - -def test_xtts_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts - - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - speaker_wav.append(speaker_wav_2) - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) - - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - - -def test_xtts_v2(): - """XTTS is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar --use_cuda ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) +@pytest.fixture(autouse=True) +def run_around_tests(tmp_path): + """Download models to a temp folder and delete it afterwards.""" + os.environ["TTS_HOME"] = str(tmp_path) + yield + shutil.rmtree(tmp_path) + + +@pytest.fixture +def manager(tmp_path): + """Set up model manager.""" + return ModelManager(output_prefix=tmp_path, progress_bar=False) + + +# To split tests into different CI jobs +num_partitions = int(os.getenv("NUM_PARTITIONS", "1")) +partition = int(os.getenv("TEST_PARTITION", "0")) +model_names = [name for name in TTS.list_models() if name not in MODELS_WITH_SEP_TESTS] +model_names.extend(["tts_models/deu/fairseq/vits", "tts_models/sqi/fairseq/vits"]) +model_names = [name for i, name in enumerate(model_names) if i % num_partitions == partition] + + +@pytest.mark.parametrize("model_name", model_names) +def test_models(tmp_path, model_name, manager): + print(f"\n > Run - {model_name}") + output_path = str(tmp_path / "output.wav") + model_path, _, _ = manager.download_model(model_name) + args = ["--model_name", model_name, "--out_path", output_path, "--no-progress_bar"] + if "tts_models" in model_name: + local_download_dir = model_path.parent + # download and run the model + speaker_files = list(local_download_dir.glob("speaker*")) + language_files = list(local_download_dir.glob("language*")) + speaker_arg = [] + language_arg = [] + if len(speaker_files) > 0: + # multi-speaker model + if "speaker_ids" in speaker_files[0].stem: + speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) + elif "speakers" in speaker_files[0].stem: + speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) + speakers = list(speaker_manager.name_to_id.keys()) + if len(speakers) > 1: + speaker_arg = ["--speaker_idx", speakers[0]] + if len(language_files) > 0 and "language_ids" in language_files[0].stem: + # multi-lingual model + language_manager = LanguageManager(language_ids_file_path=language_files[0]) + languages = language_manager.language_names + if len(languages) > 1: + language_arg = ["--language_idx", languages[0]] + run_main(main, [*args, "--text", "This is an example.", *speaker_arg, *language_arg]) + elif "voice_conversion_models" in model_name: + speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") + reference_wav1 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0028.wav") + reference_wav2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") + run_main(main, [*args, "--source_wav", speaker_wav, "--target_wav", reference_wav1, reference_wav2]) else: - run_cli( - "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar ' - f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"' - ) - - -def test_xtts_v2_streaming(): - """Testing the new inference_stream method""" - from TTS.tts.configs.xtts_config import XttsConfig - from TTS.tts.models.xtts import Xtts + # only download the model + manager.download_model(model_name) + print(f" | > OK: {model_name}") - speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2") - config = XttsConfig() - config.load_json(os.path.join(model_path, "config.json")) - model = Xtts.init_from_config(config) - model.load_checkpoint(config, checkpoint_dir=model_path) - model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) - - print("Computing speaker latents...") - gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav) - - print("Inference...") - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - if i == 0: - assert chunk.shape[-1] > 5000 - wav_chuncks.append(chunk) - assert len(wav_chuncks) > 1 - normal_len = sum([len(chunk) for chunk in wav_chuncks]) - - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=1.5, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - fast_len = sum([len(chunk) for chunk in wav_chuncks]) - chunks = model.inference_stream( - "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.", - "en", - gpt_cond_latent, - speaker_embedding, - speed=0.66, - ) - wav_chuncks = [] - for i, chunk in enumerate(chunks): - wav_chuncks.append(chunk) - slow_len = sum([len(chunk) for chunk in wav_chuncks]) - - assert slow_len > normal_len - assert normal_len > fast_len - - -def test_tortoise(): - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/en/multi-dataset/tortoise-v2 " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar' - ) - - -def test_bark(): - """Bark is too big to run on github actions. We need to test it locally""" - output_path = os.path.join(get_tests_output_path(), "output.wav") - use_gpu = torch.cuda.is_available() - if use_gpu: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." --out_path "{output_path}" --no-progress_bar --use_cuda' - ) - else: - run_cli( - f" tts --model_name tts_models/multilingual/multi-dataset/bark " - f'--text "This is an example." 
--out_path "{output_path}" --no-progress_bar' - ) - - -def test_voice_conversion(): +def test_voice_conversion(tmp_path): print(" > Run voice conversion inference using YourTTS model.") - model_name = "tts_models/multilingual/multi-dataset/your_tts" - language_id = "en" - speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") - reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav") - output_path = os.path.join(get_tests_output_path(), "output.wav") - run_cli( - f"tts --model_name {model_name}" - f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} --no-progress_bar" - ) - - -""" -These are used to split tests into different actions on Github. -""" - - -def test_models_offset_0_step_3(): - run_models(offset=0, step=3) - - -def test_models_offset_1_step_3(): - run_models(offset=1, step=3) - - -def test_models_offset_2_step_3(): - run_models(offset=2, step=3) + args = [ + "--model_name", + "tts_models/multilingual/multi-dataset/your_tts", + "--out_path", + str(tmp_path / "output.wav"), + "--speaker_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav"), + "--reference_wav", + os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav"), + "--language_idx", + "en", + "--no-progress_bar", + ] + run_main(main, args)